~drizzle-trunk/drizzle/development

Viewing changes to storage/innobase/buf/buf0flu.c

  • Committer: Monty Taylor
  • Date: 2008-11-16 23:47:43 UTC
  • mto: (584.1.10 devel)
  • mto: This revision was merged to the branch mainline in revision 589.
  • Revision ID: monty@inaugust.com-20081116234743-c38gmv0pa2kdefaj
Broke out cached_item.

/******************************************************
The database buffer buf_pool flush algorithm

(c) 1995-2001 Innobase Oy

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#include "trx0sys.h"
#endif

#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "page0zip.h"
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"
#include "srv0srv.h"

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void);
/*========================*/
                /* out: TRUE if ok */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

/************************************************************************
Inserts a modified block into the flush list. */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
        buf_page_t*     bpage)  /* in: block which is modified */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
              || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
                  <= bpage->oldest_modification));

        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_PAGE:
                mutex_enter(&buf_pool_zip_mutex);
                buf_page_set_state(bpage, BUF_BLOCK_ZIP_DIRTY);
                mutex_exit(&buf_pool_zip_mutex);
                UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage);
                /* fall through */
        case BUF_BLOCK_ZIP_DIRTY:
        case BUF_BLOCK_FILE_PAGE:
                ut_ad(bpage->in_LRU_list);
                ut_ad(bpage->in_page_hash);
                ut_ad(!bpage->in_zip_hash);
                ut_ad(!bpage->in_flush_list);
                ut_d(bpage->in_flush_list = TRUE);
                UT_LIST_ADD_FIRST(list, buf_pool->flush_list, bpage);
                break;
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                return;
        }

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}

/************************************************************************
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */
UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
        buf_page_t*     bpage)  /* in: block which is modified */
{
        buf_page_t*     prev_b;
        buf_page_t*     b;

        ut_ad(buf_pool_mutex_own());

        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_PAGE:
                mutex_enter(&buf_pool_zip_mutex);
                buf_page_set_state(bpage, BUF_BLOCK_ZIP_DIRTY);
                mutex_exit(&buf_pool_zip_mutex);
                UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage);
                /* fall through */
        case BUF_BLOCK_ZIP_DIRTY:
        case BUF_BLOCK_FILE_PAGE:
                ut_ad(bpage->in_LRU_list);
                ut_ad(bpage->in_page_hash);
                ut_ad(!bpage->in_zip_hash);
                ut_ad(!bpage->in_flush_list);
                ut_d(bpage->in_flush_list = TRUE);
                break;
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                return;
        }

        prev_b = NULL;
        b = UT_LIST_GET_FIRST(buf_pool->flush_list);

        while (b && b->oldest_modification > bpage->oldest_modification) {
                ut_ad(b->in_flush_list);
                prev_b = b;
                b = UT_LIST_GET_NEXT(list, b);
        }

        if (prev_b == NULL) {
                UT_LIST_ADD_FIRST(list, buf_pool->flush_list, bpage);
        } else {
                UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
                                     prev_b, bpage);
        }

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
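
/* A standalone sketch of the ordering invariant kept by the two insert
functions above: oldest_modification is non-increasing from the head of
the flush list to its tail.  The node type and function below are
hypothetical stand-ins for buf_page_t and the UT_LIST_ macros, for
illustration only, not part of this file: */

#include <stddef.h>
#include <stdint.h>

typedef struct flush_node {
        uint64_t                oldest_modification;
        struct flush_node*      next;
} flush_node_t;

/* Insert n so that the list stays sorted by descending
oldest_modification, mirroring the scan-and-insert loop in
buf_flush_insert_sorted_into_flush_list(). */
static void
flush_list_sorted_insert(flush_node_t** head, flush_node_t* n)
{
        flush_node_t*   prev = NULL;
        flush_node_t*   b = *head;

        while (b && b->oldest_modification > n->oldest_modification) {
                prev = b;
                b = b->next;
        }

        n->next = b;

        if (prev == NULL) {
                *head = n;              /* cf. UT_LIST_ADD_FIRST */
        } else {
                prev->next = n;         /* cf. UT_LIST_INSERT_AFTER */
        }
}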
 
/************************************************************************
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED is allowed. */
UNIV_INTERN
ibool
buf_flush_ready_for_replace(
/*========================*/
                                /* out: TRUE if can replace immediately */
        buf_page_t*     bpage)  /* in: buffer control block, must be
                                buf_page_in_file(bpage) and in the LRU list */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_LRU_list);

        if (UNIV_LIKELY(buf_page_in_file(bpage))) {

                return(bpage->oldest_modification == 0
                       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
                       && bpage->buf_fix_count == 0);
        }

        ut_print_timestamp(stderr);
        fprintf(stderr,
                "  InnoDB: Error: buffer block state %lu"
                " in the LRU list!\n",
                (ulong) buf_page_get_state(bpage));
        ut_print_buf(stderr, bpage, sizeof(buf_page_t));

        return(FALSE);
}
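
/* To summarize the predicate above: a page can be evicted (replaced)
only when all three of these hold at once:

        bpage->oldest_modification == 0   -- no unflushed changes
        io_fix == BUF_IO_NONE             -- no read or write in flight
        bpage->buf_fix_count == 0         -- no thread has the page fixed

If any condition fails, the LRU scan must skip the block or flush it
first; see buf_flush_ready_for_flush() below. */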
 
/************************************************************************
Returns TRUE if the block is modified and ready for flushing. */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
                                /* out: TRUE if can flush immediately */
        buf_page_t*     bpage,  /* in: buffer control block, must be
                                buf_page_in_file(bpage) */
        enum buf_flush  flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_a(buf_page_in_file(bpage));
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));

        if (bpage->oldest_modification != 0
            && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
                ut_ad(bpage->in_flush_list);

                if (flush_type != BUF_FLUSH_LRU) {

                        return(TRUE);

                } else if (bpage->buf_fix_count == 0) {

                        /* If we are flushing the LRU list, to avoid deadlocks
                        we require the block not to be bufferfixed, and hence
                        not latched. */

                        return(TRUE);
                }
        }

        return(FALSE);
}

/************************************************************************
Remove a block from the flush list of modified blocks. */
UNIV_INTERN
void
buf_flush_remove(
/*=============*/
        buf_page_t*     bpage)  /* in: pointer to the block in question */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_flush_list);
        ut_d(bpage->in_flush_list = FALSE);

        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_PAGE:
                /* clean compressed pages should not be on the flush list */
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                return;
        case BUF_BLOCK_ZIP_DIRTY:
                buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                buf_LRU_insert_zip_clean(bpage);
                break;
        case BUF_BLOCK_FILE_PAGE:
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                break;
        }

        bpage->oldest_modification = 0;

        ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list));
}

/************************************************************************
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
buf_flush_write_complete(
/*=====================*/
        buf_page_t*     bpage)  /* in: pointer to the block in question */
{
        enum buf_flush  flush_type;

        ut_ad(bpage);

        buf_flush_remove(bpage);

        flush_type = buf_page_get_flush_type(bpage);
        buf_pool->n_flush[flush_type]--;

        if (flush_type == BUF_FLUSH_LRU) {
                /* Put the block to the end of the LRU list to wait to be
                moved to the free list */

                buf_LRU_make_block_old(bpage);

                buf_pool->LRU_flush_ended++;
        }

        /* fprintf(stderr, "n pending flush %lu\n",
        buf_pool->n_flush[flush_type]); */

        if ((buf_pool->n_flush[flush_type] == 0)
            && (buf_pool->init_flush[flush_type] == FALSE)) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }
}
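
/* The n_flush[] counter and no_flush[] event updated above implement a
counted batch: the counter tracks writes that have been posted but not
yet completed, and the event is signalled when the batch drains so that
buf_flush_wait_batch_end() can return.  A minimal pthread-based sketch
of the same idea, with hypothetical names and a single flush type, not
part of this file: */

#include <pthread.h>

static pthread_mutex_t  batch_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t   batch_done = PTHREAD_COND_INITIALIZER;
static unsigned         batch_pending;  /* cf. buf_pool->n_flush[type] */

/* Called once per completed page write; cf. buf_flush_write_complete(). */
static void
batch_write_complete(void)
{
        pthread_mutex_lock(&batch_mutex);
        if (--batch_pending == 0) {
                pthread_cond_broadcast(&batch_done); /* cf. os_event_set() */
        }
        pthread_mutex_unlock(&batch_mutex);
}

/* Cf. buf_flush_wait_batch_end(): block until the batch has drained. */
static void
batch_wait_end(void)
{
        pthread_mutex_lock(&batch_mutex);
        while (batch_pending > 0) {
                pthread_cond_wait(&batch_done, &batch_mutex);
        }
        pthread_mutex_unlock(&batch_mutex);
}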
 
/************************************************************************
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
        byte*           write_buf;
        ulint           len;
        ulint           len2;
        ulint           i;

        if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
                os_aio_simulated_wake_handler_threads();

                return;
        }

        mutex_enter(&(trx_doublewrite->mutex));

        /* Write first to doublewrite buffer blocks. We use synchronous
        aio and thus know that file write has been completed when the
        control returns. */

        if (trx_doublewrite->first_free == 0) {

                mutex_exit(&(trx_doublewrite->mutex));

                return;
        }

        for (i = 0; i < trx_doublewrite->first_free; i++) {

                const buf_block_t*      block;

                block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];

                if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
                    || block->page.zip.data) {
                        /* No simple validate for compressed pages exists. */
                        continue;
                }

                if (UNIV_UNLIKELY
                    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
                            block->frame + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                            4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: before posting to the"
                                " doublewrite buffer.\n");
                }

                if (!block->check_index_page_at_flush) {
                } else if (page_is_comp(block->frame)) {
                        if (UNIV_UNLIKELY
                            (!page_simple_validate_new(block->frame))) {
corrupted_page:
                                buf_page_print(block->frame, 0);

                                ut_print_timestamp(stderr);
                                fprintf(stderr,
                                        "  InnoDB: Apparent corruption of an"
                                        " index page n:o %lu in space %lu\n"
                                        "InnoDB: to be written to data file."
                                        " We intentionally crash server\n"
                                        "InnoDB: to prevent corrupt data"
                                        " from ending up in data\n"
                                        "InnoDB: files.\n",
                                        (ulong) buf_block_get_page_no(block),
                                        (ulong) buf_block_get_space(block));

                                ut_error;
                        }
                } else if (UNIV_UNLIKELY
                           (!page_simple_validate_old(block->frame))) {

                        goto corrupted_page;
                }
        }

        /* increment the doublewrite flushed pages counter */
        srv_dblwr_pages_written += trx_doublewrite->first_free;
        srv_dblwr_writes++;

        len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
                     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf;
        i = 0;

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block1, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the doublewrite block1.\n");
                }
        }

        if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                goto flush;
        }

        len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
                * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf
                + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
        ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block2, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be"
                                " written seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in"
                                " the doublewrite block2.\n");
                }
        }

flush:
        /* Now flush the doublewrite buffer data to disk */

        fil_flush(TRX_SYS_SPACE);

        /* We know that the writes have been flushed to disk now
        and in recovery we will find them in the doublewrite buffer
        blocks. Next do the writes to the intended positions. */

        for (i = 0; i < trx_doublewrite->first_free; i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                ut_a(buf_page_in_file(&block->page));
                if (UNIV_LIKELY_NULL(block->page.zip.data)) {
                        fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                               FALSE, buf_page_get_space(&block->page),
                               buf_page_get_zip_size(&block->page),
                               buf_page_get_page_no(&block->page), 0,
                               buf_page_get_zip_size(&block->page),
                               (void*) block->page.zip.data,
                               (void*) block);

                        /* Increment the counter of I/O operations used
                        for selecting LRU policy. */
                        buf_LRU_stat_inc_io();

                        continue;
                }

                ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

                if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
                                         block->frame
                                         + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                                         4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: after posting and flushing"
                                " the doublewrite buffer.\n"
                                "InnoDB: Page buf fix count %lu,"
                                " io fix %lu, state %lu\n",
                                (ulong) block->page.buf_fix_count,
                                (ulong) buf_block_get_io_fix(block),
                                (ulong) buf_block_get_state(block));
                }

                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_block_get_space(block), 0,
                       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
                       (void*) block->frame, (void*) block);

                /* Increment the counter of I/O operations used
                for selecting LRU policy. */
                buf_LRU_stat_inc_io();
        }

        /* Wake possible simulated aio thread to actually post the
        writes to the operating system */

        os_aio_simulated_wake_handler_threads();

        /* Wait that all async writes to tablespaces have been posted to
        the OS */

        os_aio_wait_until_no_pending_writes();

        /* Now we flush the data to disk (for example, with fsync) */

        fil_flush_file_spaces(FIL_TABLESPACE);

        /* We can now reuse the doublewrite memory buffer: */

        trx_doublewrite->first_free = 0;

        mutex_exit(&(trx_doublewrite->mutex));
}
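
/* The function above is the heart of the doublewrite protocol: pages
are first written and synced to a fixed scratch area (the doublewrite
blocks in the system tablespace), and only then written to their real
locations, so a crash during either phase always leaves at least one
intact copy of every page for recovery to use.  Reduced to plain POSIX
calls, under the simplifying assumptions of a single file descriptor
and no error handling (illustration only, not part of this file): */

#include <unistd.h>

static void
doublewrite_sketch(int fd, const char* pages, off_t dblwr_off,
                   const off_t* dest_off, int n_pages, size_t page_size)
{
        int     i;

        /* Phase 1: one sequential write of the whole batch to the
        doublewrite area, made durable before phase 2 starts. */
        pwrite(fd, pages, (size_t) n_pages * page_size, dblwr_off);
        fsync(fd);              /* cf. fil_flush(TRX_SYS_SPACE) */

        /* Phase 2: scatter the same pages to their final positions. */
        for (i = 0; i < n_pages; i++) {
                pwrite(fd, pages + (size_t) i * page_size, page_size,
                       dest_off[i]);
        }

        fsync(fd);      /* cf. fil_flush_file_spaces(FIL_TABLESPACE) */
}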
 
/************************************************************************
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
        buf_page_t*     bpage)  /* in: buffer block to write */
{
        ulint   zip_size;
try_again:
        mutex_enter(&(trx_doublewrite->mutex));

        ut_a(buf_page_in_file(bpage));

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                goto try_again;
        }

        zip_size = buf_page_get_zip_size(bpage);

        if (UNIV_UNLIKELY(zip_size)) {
                /* Copy the compressed page and clear the rest. */
                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       bpage->zip.data, zip_size);
                memset(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
                       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
        } else {
                ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
        }

        trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;

        trx_doublewrite->first_free++;

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                return;
        }

        mutex_exit(&(trx_doublewrite->mutex));
}
 
/************************************************************************
Initializes a page for writing to the tablespace. */
UNIV_INTERN
void
buf_flush_init_for_writing(
/*=======================*/
        byte*           page,           /* in/out: page */
        void*           page_zip_,      /* in/out: compressed page, or NULL */
        ib_uint64_t     newest_lsn)     /* in: newest modification lsn
                                        to the page */
{
        ut_ad(page);

        if (page_zip_) {
                page_zip_des_t* page_zip = page_zip_;
                ulint           zip_size = page_zip_get_size(page_zip);
                ut_ad(zip_size);
                ut_ad(ut_is_2pow(zip_size));
                ut_ad(zip_size <= UNIV_PAGE_SIZE);

                switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
                case FIL_PAGE_TYPE_ALLOCATED:
                case FIL_PAGE_INODE:
                case FIL_PAGE_IBUF_BITMAP:
                case FIL_PAGE_TYPE_FSP_HDR:
                case FIL_PAGE_TYPE_XDES:
                        /* These are essentially uncompressed pages. */
                        memcpy(page_zip->data, page, zip_size);
                        /* fall through */
                case FIL_PAGE_TYPE_ZBLOB:
                case FIL_PAGE_TYPE_ZBLOB2:
                case FIL_PAGE_INDEX:
                        mach_write_ull(page_zip->data
                                       + FIL_PAGE_LSN, newest_lsn);
                        memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                        mach_write_to_4(page_zip->data
                                        + FIL_PAGE_SPACE_OR_CHKSUM,
                                        srv_use_checksums
                                        ? page_zip_calc_checksum(
                                                page_zip->data, zip_size)
                                        : BUF_NO_CHECKSUM_MAGIC);
                        return;
                }

                ut_error;
        }

        /* Write the newest modification lsn to the page header and trailer */
        mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);

        mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                       newest_lsn);

        /* Store the new formula checksum */

        mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_new_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);

        /* We overwrite the first 4 bytes of the end lsn field to store
        the old formula checksum. Since it depends also on the field
        FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
        new formula checksum. */

        mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_old_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);
}
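
/* The field layout written above for an uncompressed page: the LSN goes
at FIL_PAGE_LSN in the header and again in the 8-byte trailer, the
new-formula checksum goes into the header field FIL_PAGE_SPACE_OR_CHKSUM,
and the old-formula checksum then overwrites the first 4 bytes of the
trailer LSN.  A standalone sketch with hypothetical big-endian helpers
and checksum callbacks standing in for the mach_write_ and buf_calc_
functions (illustration only, not part of this file): */

#include <stdint.h>

#define SKETCH_PAGE_SIZE        16384   /* cf. UNIV_PAGE_SIZE */
#define SKETCH_PAGE_LSN         16      /* cf. FIL_PAGE_LSN */
#define SKETCH_PAGE_CHKSUM      0       /* cf. FIL_PAGE_SPACE_OR_CHKSUM */
#define SKETCH_END_LSN_CHKSUM   8       /* cf. FIL_PAGE_END_LSN_OLD_CHKSUM,
                                        counted back from the page end */

static void
write_be4(unsigned char* p, uint32_t v)
{
        p[0] = (unsigned char) (v >> 24);
        p[1] = (unsigned char) (v >> 16);
        p[2] = (unsigned char) (v >> 8);
        p[3] = (unsigned char) v;
}

static void
write_be8(unsigned char* p, uint64_t v)
{
        write_be4(p, (uint32_t) (v >> 32));
        write_be4(p + 4, (uint32_t) v);
}

static void
init_for_writing_sketch(unsigned char* page, uint64_t newest_lsn,
                        uint32_t (*new_sum)(const unsigned char*),
                        uint32_t (*old_sum)(const unsigned char*))
{
        /* LSN in the header and in the trailer. */
        write_be8(page + SKETCH_PAGE_LSN, newest_lsn);
        write_be8(page + SKETCH_PAGE_SIZE - SKETCH_END_LSN_CHKSUM,
                  newest_lsn);

        /* New-formula checksum in the header field. */
        write_be4(page + SKETCH_PAGE_CHKSUM, new_sum(page));

        /* Old-formula checksum last: it covers the header checksum
        written above, and clobbers the first 4 bytes of the trailer
        LSN.  The memcmp checks in buf_flush_buffered_writes() compare
        the remaining low 4 bytes of the trailer LSN against the low 4
        bytes of FIL_PAGE_LSN to detect torn writes. */
        write_be4(page + SKETCH_PAGE_SIZE - SKETCH_END_LSN_CHKSUM,
                  old_sum(page));
}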
 
/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
        buf_page_t*     bpage)  /* in: buffer block to write */
{
        ulint   zip_size        = buf_page_get_zip_size(bpage);
        page_t* frame           = NULL;
#ifdef UNIV_LOG_DEBUG
        static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */

        ut_ad(buf_page_in_file(bpage));

#ifdef UNIV_IBUF_COUNT_DEBUG
        ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
#endif
        ut_ad(bpage->newest_modification != 0);

#ifdef UNIV_LOG_DEBUG
        if (!univ_log_debug_warned) {
                univ_log_debug_warned = TRUE;
                fputs("Warning: cannot force log to disk if"
                      " UNIV_LOG_DEBUG is defined!\n"
                      "Crash recovery will not work!\n",
                      stderr);
        }
#else
        /* Force the log to the disk before writing the modified block */
        log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                break;
        case BUF_BLOCK_ZIP_DIRTY:
                frame = bpage->zip.data;
                if (UNIV_LIKELY(srv_use_checksums)) {
                        ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
                             == page_zip_calc_checksum(frame, zip_size));
                }
                mach_write_ull(frame + FIL_PAGE_LSN,
                               bpage->newest_modification);
                memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                break;
        case BUF_BLOCK_FILE_PAGE:
                frame = bpage->zip.data;
                if (!frame) {
                        frame = ((buf_block_t*) bpage)->frame;
                }

                buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
                                           bpage->zip.data
                                           ? &bpage->zip : NULL,
                                           bpage->newest_modification);
                break;
        }

        if (!srv_use_doublewrite_buf || !trx_doublewrite) {
                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_page_get_space(bpage), zip_size,
                       buf_page_get_page_no(bpage), 0,
                       zip_size ? zip_size : UNIV_PAGE_SIZE,
                       frame, bpage);
        } else {
                buf_flush_post_to_doublewrite_buf(bpage);
        }
}
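
/* The log_write_up_to() call above enforces the write-ahead-logging
rule: the redo log must be durable at least up to the page's
newest_modification before the page itself may be posted for writing.
Schematically, at the moment the page write is issued,

        durable_log_lsn >= bpage->newest_modification

must hold; otherwise a crash could leave on disk a page whose changes
are not yet in the log, and recovery could not reconstruct a consistent
state. */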
 
/************************************************************************
Writes a page asynchronously from the buffer buf_pool to a file, if it can be
found in the buf_pool and it is in a flushable state. NOTE: in simulated aio
we must call os_aio_simulated_wake_handler_threads after we have posted a batch
of writes! */
static
ulint
buf_flush_try_page(
/*===============*/
                                        /* out: 1 if a page was
                                        flushed, 0 otherwise */
        ulint           space,          /* in: space id */
        ulint           offset,         /* in: page offset */
        enum buf_flush  flush_type)     /* in: BUF_FLUSH_LRU, BUF_FLUSH_LIST,
                                        or BUF_FLUSH_SINGLE_PAGE */
{
        buf_page_t*     bpage;
        mutex_t*        block_mutex;
        ibool           locked;

        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
              || flush_type == BUF_FLUSH_SINGLE_PAGE);

        buf_pool_mutex_enter();

        bpage = buf_page_hash_get(space, offset);

        if (!bpage) {
                buf_pool_mutex_exit();
                return(0);
        }

        ut_a(buf_page_in_file(bpage));
        block_mutex = buf_page_get_mutex(bpage);

        mutex_enter(block_mutex);

        if (!buf_flush_ready_for_flush(bpage, flush_type)) {
                mutex_exit(block_mutex);
                buf_pool_mutex_exit();
                return(0);
        }

        switch (flush_type) {
        case BUF_FLUSH_LIST:
                buf_page_set_io_fix(bpage, BUF_IO_WRITE);

                buf_page_set_flush_type(bpage, flush_type);

                if (buf_pool->n_flush[flush_type] == 0) {

                        os_event_reset(buf_pool->no_flush[flush_type]);
                }

                buf_pool->n_flush[flush_type]++;

                /* If the simulated aio thread is not running, we must
                not wait for any latch, as we may end up in a deadlock:
                if buf_fix_count == 0, then we know we need not wait */

                locked = bpage->buf_fix_count == 0;
                if (locked
                    && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();

                if (!locked) {
                        buf_flush_buffered_writes();

                        if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
                                rw_lock_s_lock_gen(&((buf_block_t*) bpage)
                                                   ->lock, BUF_IO_WRITE);
                        }
                }

                break;

        case BUF_FLUSH_LRU:
                /* VERY IMPORTANT:
                Because any thread may call the LRU flush, even when owning
                locks on pages, to avoid deadlocks, we must make sure that the
                s-lock is acquired on the page without waiting: this is
                accomplished because in the if-condition above we require
                the page not to be bufferfixed (in function
                ..._ready_for_flush). */

                buf_page_set_io_fix(bpage, BUF_IO_WRITE);

                buf_page_set_flush_type(bpage, flush_type);

                if (buf_pool->n_flush[flush_type] == 0) {

                        os_event_reset(buf_pool->no_flush[flush_type]);
                }

                buf_pool->n_flush[flush_type]++;

                if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }

                /* Note that the s-latch is acquired before releasing the
                buf_pool mutex: this ensures that the latch is acquired
                immediately. */

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();
                break;

        case BUF_FLUSH_SINGLE_PAGE:
                buf_page_set_io_fix(bpage, BUF_IO_WRITE);

                buf_page_set_flush_type(bpage, flush_type);

                if (buf_pool->n_flush[flush_type] == 0) {

                        os_event_reset(buf_pool->no_flush[flush_type]);
                }

                buf_pool->n_flush[flush_type]++;

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();

                if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }
                break;

        default:
                ut_error;
        }

#ifdef UNIV_DEBUG
        if (buf_debug_prints) {
                fprintf(stderr,
                        "Flushing %u space %u page %u\n",
                        flush_type, bpage->space, bpage->offset);
        }
#endif /* UNIV_DEBUG */
        buf_flush_write_block_low(bpage);

        return(1);
}

/***************************************************************
Flushes to disk all flushable pages within the flush area. */
static
ulint
buf_flush_try_neighbors(
/*====================*/
                                        /* out: number of pages flushed */
        ulint           space,          /* in: space id */
        ulint           offset,         /* in: page offset */
        enum buf_flush  flush_type)     /* in: BUF_FLUSH_LRU or
                                        BUF_FLUSH_LIST */
{
        buf_page_t*     bpage;
        ulint           low, high;
        ulint           count           = 0;
        ulint           i;

        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

        if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
                /* If there is little space, it is better not to flush any
                block except from the end of the LRU list */

                low = offset;
                high = offset + 1;
        } else {
                /* When flushed, dirty blocks are searched in neighborhoods of
                this size, and flushed along with the original page. */

                ulint   buf_flush_area  = ut_min(BUF_READ_AHEAD_AREA,
                                                 buf_pool->curr_size / 16);

                low = (offset / buf_flush_area) * buf_flush_area;
                high = (offset / buf_flush_area + 1) * buf_flush_area;
        }

        /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

        if (high > fil_space_get_size(space)) {
                high = fil_space_get_size(space);
        }

        buf_pool_mutex_enter();

        for (i = low; i < high; i++) {

                bpage = buf_page_hash_get(space, i);
                ut_a(!bpage || buf_page_in_file(bpage));

                if (!bpage) {

                        continue;

                } else if (flush_type == BUF_FLUSH_LRU && i != offset
                           && !buf_page_is_old(bpage)) {

                        /* We avoid flushing 'non-old' blocks in an LRU flush,
                        because the flushed blocks are soon freed */

                        continue;
                } else {

                        mutex_t* block_mutex = buf_page_get_mutex(bpage);

                        mutex_enter(block_mutex);

                        if (buf_flush_ready_for_flush(bpage, flush_type)
                            && (i == offset || !bpage->buf_fix_count)) {
                                /* We only try to flush those
                                neighbors != offset where the buf fix count is
                                zero, as we then know that we probably can
                                latch the page without a semaphore wait.
                                Semaphore waits are expensive because we must
                                flush the doublewrite buffer before we start
                                waiting. */

                                buf_pool_mutex_exit();

                                mutex_exit(block_mutex);

                                /* Note: as we release the buf_pool mutex
                                above, in buf_flush_try_page we cannot be sure
                                the page is still in a flushable state:
                                therefore we check it again inside that
                                function. */

                                count += buf_flush_try_page(space, i,
                                                            flush_type);

                                buf_pool_mutex_enter();
                        } else {
                                mutex_exit(block_mutex);
                        }
                }
        }

        buf_pool_mutex_exit();

        return(count);
}
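
/* A worked example of the flush-area computation above: with
buf_flush_area == 64 and offset == 200,

        low  = (200 / 64) * 64       = 192
        high = (200 / 64 + 1) * 64   = 256

so pages 192..255 of the space are scanned as flush candidates (the
value 64 is only an example; the actual area is the minimum of
BUF_READ_AHEAD_AREA and buf_pool->curr_size / 16). */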
 
/***********************************************************************
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages! */
UNIV_INTERN
ulint
buf_flush_batch(
/*============*/
                                        /* out: number of blocks for which the
                                        write request was queued;
                                        ULINT_UNDEFINED if there was a flush
                                        of the same type already running */
        enum buf_flush  flush_type,     /* in: BUF_FLUSH_LRU or
                                        BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
                                        then the caller must not own any
                                        latches on pages */
        ulint           min_n,          /* in: wished minimum number of blocks
                                        flushed (it is not guaranteed that the
                                        actual number is that big, though) */
        ib_uint64_t     lsn_limit)      /* in: in the case of BUF_FLUSH_LIST,
                                        all blocks whose oldest_modification
                                        is smaller than this should be flushed
                                        (if their number does not exceed
                                        min_n), otherwise ignored */
{
        buf_page_t*     bpage;
        ulint           page_count      = 0;
        ulint           old_page_count;
        ulint           space;
        ulint           offset;

        ut_ad((flush_type == BUF_FLUSH_LRU)
              || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
        ut_ad((flush_type != BUF_FLUSH_LIST)
              || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
        buf_pool_mutex_enter();

        if ((buf_pool->n_flush[flush_type] > 0)
            || (buf_pool->init_flush[flush_type] == TRUE)) {

                /* There is already a flush batch of the same type running */

                buf_pool_mutex_exit();

                return(ULINT_UNDEFINED);
        }

        buf_pool->init_flush[flush_type] = TRUE;

        for (;;) {
flush_next:
                /* If we have flushed enough, leave the loop */
                if (page_count >= min_n) {

                        break;
                }

                /* Start from the end of the list looking for a suitable
                block to be flushed. */

                if (flush_type == BUF_FLUSH_LRU) {
                        bpage = UT_LIST_GET_LAST(buf_pool->LRU);
                } else {
                        ut_ad(flush_type == BUF_FLUSH_LIST);

                        bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
                        if (!bpage
                            || bpage->oldest_modification >= lsn_limit) {
                                /* We have flushed enough */

                                break;
                        }
                        ut_ad(bpage->in_flush_list);
                }

                /* Note that after finding a single flushable page, we try to
                flush also all its neighbors, and after that start from the
                END of the LRU list or flush list again: the list may change
                during the flushing and we cannot safely preserve within this
                function a pointer to a block in the list! */

                do {
                        mutex_t* block_mutex = buf_page_get_mutex(bpage);

                        ut_a(buf_page_in_file(bpage));

                        mutex_enter(block_mutex);

                        if (buf_flush_ready_for_flush(bpage, flush_type)) {

                                space = buf_page_get_space(bpage);
                                offset = buf_page_get_page_no(bpage);

                                buf_pool_mutex_exit();
                                mutex_exit(block_mutex);

                                old_page_count = page_count;

                                /* Try to flush also all the neighbors */
                                page_count += buf_flush_try_neighbors(
                                        space, offset, flush_type);
                                /* fprintf(stderr,
                                "Flush type %lu, page no %lu, neighb %lu\n",
                                flush_type, offset,
                                page_count - old_page_count); */

                                buf_pool_mutex_enter();
                                goto flush_next;

                        } else if (flush_type == BUF_FLUSH_LRU) {

                                mutex_exit(block_mutex);

                                bpage = UT_LIST_GET_PREV(LRU, bpage);
                        } else {
                                ut_ad(flush_type == BUF_FLUSH_LIST);

                                mutex_exit(block_mutex);

                                bpage = UT_LIST_GET_PREV(list, bpage);
                                ut_ad(!bpage || bpage->in_flush_list);
                        }
                } while (bpage != NULL);

                /* If we could not find anything to flush, leave the loop */

                break;
        }

        buf_pool->init_flush[flush_type] = FALSE;

        if ((buf_pool->n_flush[flush_type] == 0)
            && (buf_pool->init_flush[flush_type] == FALSE)) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }

        buf_pool_mutex_exit();

        buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
        if (buf_debug_prints && page_count > 0) {
                ut_a(flush_type == BUF_FLUSH_LRU
                     || flush_type == BUF_FLUSH_LIST);
                fprintf(stderr, flush_type == BUF_FLUSH_LRU
                        ? "Flushed %lu pages in LRU flush\n"
                        : "Flushed %lu pages in flush list flush\n",
                        (ulong) page_count);
        }
#endif /* UNIV_DEBUG */

        srv_buf_pool_flushed += page_count;

        return(page_count);
}
 
/**********************************************************************
Waits until a flush batch of the given type ends. */
UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
        enum buf_flush  type)   /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));

        os_event_wait(buf_pool->no_flush[type]);
}

/**********************************************************************
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list. */
static
ulint
buf_flush_LRU_recommendation(void)
/*==============================*/
                        /* out: number of blocks which should be flushed
                        from the end of the LRU list */
{
        buf_page_t*     bpage;
        ulint           n_replaceable;
        ulint           distance        = 0;

        buf_pool_mutex_enter();

        n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

        bpage = UT_LIST_GET_LAST(buf_pool->LRU);

        while ((bpage != NULL)
               && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
                   + BUF_FLUSH_EXTRA_MARGIN)
               && (distance < BUF_LRU_FREE_SEARCH_LEN)) {

                mutex_t* block_mutex = buf_page_get_mutex(bpage);

                mutex_enter(block_mutex);

                if (buf_flush_ready_for_replace(bpage)) {
                        n_replaceable++;
                }

                mutex_exit(block_mutex);

                distance++;

                bpage = UT_LIST_GET_PREV(LRU, bpage);
        }

        buf_pool_mutex_exit();

        if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {

                return(0);
        }

        return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
               - n_replaceable);
}
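
/* A worked example of the recommendation above, with made-up margin
values: suppose BUF_FLUSH_FREE_BLOCK_MARGIN == 100 and
BUF_FLUSH_EXTRA_MARGIN == 25, and the scan of the LRU tail counted
n_replaceable == 40 (free blocks plus immediately replaceable blocks).
Since 40 < 100, the function recommends flushing

        100 + 25 - 40 = 85

blocks from the end of the LRU list; had it found 100 or more, it would
return 0 and buf_flush_free_margin() below would do nothing. */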
 
/*************************************************************************
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */
UNIV_INTERN
void
buf_flush_free_margin(void)
/*=======================*/
{
        ulint   n_to_flush;
        ulint   n_flushed;

        n_to_flush = buf_flush_LRU_recommendation();

        if (n_to_flush > 0) {
                n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0);
                if (n_flushed == ULINT_UNDEFINED) {
                        /* There was an LRU type flush batch already running;
                        let us wait for it to end */

                        buf_flush_wait_batch_end(BUF_FLUSH_LRU);
                }
        }
}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void)
/*========================*/
                /* out: TRUE if ok */
{
        buf_page_t*     bpage;

        UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list);

        bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

        while (bpage != NULL) {
                const ib_uint64_t om = bpage->oldest_modification;
                ut_ad(bpage->in_flush_list);
                ut_a(buf_page_in_file(bpage));
                ut_a(om > 0);

                bpage = UT_LIST_GET_NEXT(list, bpage);

                ut_a(!bpage || om >= bpage->oldest_modification);
        }

        return(TRUE);
}

/**********************************************************************
Validates the flush list. */
UNIV_INTERN
ibool
buf_flush_validate(void)
/*====================*/
                /* out: TRUE if ok */
{
        ibool   ret;

        buf_pool_mutex_enter();

        ret = buf_flush_validate_low();

        buf_pool_mutex_exit();

        return(ret);
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */