~drizzle-trunk/drizzle/development

« back to all changes in this revision

Viewing changes to storage/innobase/buf/buf0flu.c

  • Committer: brian
  • Date: 2008-06-25 05:29:13 UTC
  • Revision ID: brian@localhost.localdomain-20080625052913-6upwo0jsrl4lnapl
clean slate

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/******************************************************
 
2
The database buffer buf_pool flush algorithm
 
3
 
 
4
(c) 1995-2001 Innobase Oy
 
5
 
 
6
Created 11/11/1995 Heikki Tuuri
 
7
*******************************************************/
 
8
 
 
9
#include "buf0flu.h"
 
10
 
 
11
#ifdef UNIV_NONINL
 
12
#include "buf0flu.ic"
 
13
#include "trx0sys.h"
 
14
#endif
 
15
 
 
16
#include "ut0byte.h"
 
17
#include "ut0lst.h"
 
18
#include "page0page.h"
 
19
#include "fil0fil.h"
 
20
#include "buf0buf.h"
 
21
#include "buf0lru.h"
 
22
#include "buf0rea.h"
 
23
#include "ibuf0ibuf.h"
 
24
#include "log0log.h"
 
25
#include "os0file.h"
 
26
#include "trx0sys.h"
 
27
#include "srv0srv.h"
 
28
 
 
29
/* Number of consecutive page offsets that make up one flush neighborhood:
when a page is flushed, the other dirty blocks in the same neighborhood are
looked up and flushed with it. The expression reads buf_pool->curr_size, so
it is re-evaluated at every place the macro is used. */

#define BUF_FLUSH_AREA \
        ut_min(BUF_READ_AHEAD_AREA, buf_pool->curr_size / 16)
 
34
 
 
35
/**********************************************************************
 
36
Validates the flush list. */
 
37
static
 
38
ibool
 
39
buf_flush_validate_low(void);
 
40
/*========================*/
 
41
                /* out: TRUE if ok */
 
42
 
 
43
/************************************************************************
 
44
Inserts a modified block into the flush list. */
 
45
 
 
46
void
 
47
buf_flush_insert_into_flush_list(
 
48
/*=============================*/
 
49
        buf_block_t*    block)  /* in: block which is modified */
 
50
{
 
51
        ut_ad(mutex_own(&(buf_pool->mutex)));
 
52
        ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
53
 
 
54
        ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
 
55
              || (ut_dulint_cmp((UT_LIST_GET_FIRST(buf_pool->flush_list))
 
56
                                ->oldest_modification,
 
57
                                block->oldest_modification) <= 0));
 
58
 
 
59
        UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);
 
60
 
 
61
        ut_ad(buf_flush_validate_low());
 
62
}
 
63
 
 
64
/************************************************************************
 
65
Inserts a modified block into the flush list in the right sorted position.
 
66
This function is used by recovery, because there the modifications do not
 
67
necessarily come in the order of lsn's. */
 
68
 
 
69
void
 
70
buf_flush_insert_sorted_into_flush_list(
 
71
/*====================================*/
 
72
        buf_block_t*    block)  /* in: block which is modified */
 
73
{
 
74
        buf_block_t*    prev_b;
 
75
        buf_block_t*    b;
 
76
 
 
77
        ut_ad(mutex_own(&(buf_pool->mutex)));
 
78
 
 
79
        prev_b = NULL;
 
80
        b = UT_LIST_GET_FIRST(buf_pool->flush_list);
 
81
 
 
82
        while (b && (ut_dulint_cmp(b->oldest_modification,
 
83
                                   block->oldest_modification) > 0)) {
 
84
                prev_b = b;
 
85
                b = UT_LIST_GET_NEXT(flush_list, b);
 
86
        }
 
87
 
 
88
        if (prev_b == NULL) {
 
89
                UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);
 
90
        } else {
 
91
                UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b,
 
92
                                     block);
 
93
        }
 
94
 
 
95
        ut_ad(buf_flush_validate_low());
 
96
}
 
97
 
 
98
/************************************************************************
 
99
Returns TRUE if the file page block is immediately suitable for replacement,
 
100
i.e., the transition FILE_PAGE => NOT_USED allowed. */
 
101
 
 
102
ibool
 
103
buf_flush_ready_for_replace(
 
104
/*========================*/
 
105
                                /* out: TRUE if can replace immediately */
 
106
        buf_block_t*    block)  /* in: buffer control block, must be in state
 
107
                                BUF_BLOCK_FILE_PAGE and in the LRU list */
 
108
{
 
109
        ut_ad(mutex_own(&(buf_pool->mutex)));
 
110
        ut_ad(mutex_own(&block->mutex));
 
111
        if (block->state != BUF_BLOCK_FILE_PAGE) {
 
112
                ut_print_timestamp(stderr);
 
113
                fprintf(stderr,
 
114
                        "  InnoDB: Error: buffer block state %lu"
 
115
                        " in the LRU list!\n",
 
116
                        (ulong)block->state);
 
117
                ut_print_buf(stderr, block, sizeof(buf_block_t));
 
118
 
 
119
                return(FALSE);
 
120
        }
 
121
 
 
122
        if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
 
123
            || (block->buf_fix_count != 0)
 
124
            || (block->io_fix != 0)) {
 
125
 
 
126
                return(FALSE);
 
127
        }
 
128
 
 
129
        return(TRUE);
 
130
}
 
131
 
 
132
/************************************************************************
 
133
Returns TRUE if the block is modified and ready for flushing. */
 
134
UNIV_INLINE
 
135
ibool
 
136
buf_flush_ready_for_flush(
 
137
/*======================*/
 
138
                                /* out: TRUE if can flush immediately */
 
139
        buf_block_t*    block,  /* in: buffer control block, must be in state
 
140
                                BUF_BLOCK_FILE_PAGE */
 
141
        ulint           flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
 
142
{
 
143
        ut_ad(mutex_own(&(buf_pool->mutex)));
 
144
        ut_ad(mutex_own(&(block->mutex)));
 
145
        ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
146
 
 
147
        if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
 
148
            && (block->io_fix == 0)) {
 
149
                if (flush_type != BUF_FLUSH_LRU) {
 
150
 
 
151
                        return(TRUE);
 
152
 
 
153
                } else if (block->buf_fix_count == 0) {
 
154
 
 
155
                        /* If we are flushing the LRU list, to avoid deadlocks
 
156
                        we require the block not to be bufferfixed, and hence
 
157
                        not latched. */
 
158
 
 
159
                        return(TRUE);
 
160
                }
 
161
        }
 
162
 
 
163
        return(FALSE);
 
164
}
 
165
 
 
166
/************************************************************************
 
167
Updates the flush system data structures when a write is completed. */
 
168
 
 
169
void
 
170
buf_flush_write_complete(
 
171
/*=====================*/
 
172
        buf_block_t*    block)  /* in: pointer to the block in question */
 
173
{
 
174
        ut_ad(block);
 
175
#ifdef UNIV_SYNC_DEBUG
 
176
        ut_ad(mutex_own(&(buf_pool->mutex)));
 
177
#endif /* UNIV_SYNC_DEBUG */
 
178
        ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
179
 
 
180
        block->oldest_modification = ut_dulint_zero;
 
181
 
 
182
        UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block);
 
183
 
 
184
        ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list));
 
185
 
 
186
        (buf_pool->n_flush[block->flush_type])--;
 
187
 
 
188
        if (block->flush_type == BUF_FLUSH_LRU) {
 
189
                /* Put the block to the end of the LRU list to wait to be
 
190
                moved to the free list */
 
191
 
 
192
                buf_LRU_make_block_old(block);
 
193
 
 
194
                buf_pool->LRU_flush_ended++;
 
195
        }
 
196
 
 
197
        /* fprintf(stderr, "n pending flush %lu\n",
 
198
        buf_pool->n_flush[block->flush_type]); */
 
199
 
 
200
        if ((buf_pool->n_flush[block->flush_type] == 0)
 
201
            && (buf_pool->init_flush[block->flush_type] == FALSE)) {
 
202
 
 
203
                /* The running flush batch has ended */
 
204
 
 
205
                os_event_set(buf_pool->no_flush[block->flush_type]);
 
206
        }
 
207
}
 
208
 
 
209
/************************************************************************
 
210
Flushes possible buffered writes from the doublewrite memory buffer to disk,
 
211
and also wakes up the aio thread if simulated aio is used. It is very
 
212
important to call this function after a batch of writes has been posted,
 
213
and also when we may have to wait for a page latch! Otherwise a deadlock
 
214
of threads can occur. */
 
215
static
 
216
void
 
217
buf_flush_buffered_writes(void)
 
218
/*===========================*/
 
219
{
 
220
        buf_block_t*    block;
 
221
        byte*           write_buf;
 
222
        ulint           len;
 
223
        ulint           len2;
 
224
        ulint           i;
 
225
 
 
226
        if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
 
227
                os_aio_simulated_wake_handler_threads();
 
228
 
 
229
                return;
 
230
        }
 
231
 
 
232
        mutex_enter(&(trx_doublewrite->mutex));
 
233
 
 
234
        /* Write first to doublewrite buffer blocks. We use synchronous
 
235
        aio and thus know that file write has been completed when the
 
236
        control returns. */
 
237
 
 
238
        if (trx_doublewrite->first_free == 0) {
 
239
 
 
240
                mutex_exit(&(trx_doublewrite->mutex));
 
241
 
 
242
                return;
 
243
        }
 
244
 
 
245
        for (i = 0; i < trx_doublewrite->first_free; i++) {
 
246
 
 
247
                block = trx_doublewrite->buf_block_arr[i];
 
248
                ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
249
 
 
250
                if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
 
251
                    != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
 
252
                                        - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
 
253
                        ut_print_timestamp(stderr);
 
254
                        fprintf(stderr,
 
255
                                "  InnoDB: ERROR: The page to be written"
 
256
                                " seems corrupt!\n"
 
257
                                "InnoDB: The lsn fields do not match!"
 
258
                                " Noticed in the buffer pool\n"
 
259
                                "InnoDB: before posting to the"
 
260
                                " doublewrite buffer.\n");
 
261
                }
 
262
 
 
263
                if (block->check_index_page_at_flush
 
264
                    && !page_simple_validate(block->frame)) {
 
265
 
 
266
                        buf_page_print(block->frame);
 
267
 
 
268
                        ut_print_timestamp(stderr);
 
269
                        fprintf(stderr,
 
270
                                "  InnoDB: Apparent corruption of an"
 
271
                                " index page n:o %lu in space %lu\n"
 
272
                                "InnoDB: to be written to data file."
 
273
                                " We intentionally crash server\n"
 
274
                                "InnoDB: to prevent corrupt data"
 
275
                                " from ending up in data\n"
 
276
                                "InnoDB: files.\n",
 
277
                                (ulong) block->offset, (ulong) block->space);
 
278
 
 
279
                        ut_error;
 
280
                }
 
281
        }
 
282
 
 
283
        /* increment the doublewrite flushed pages counter */
 
284
        srv_dblwr_pages_written+= trx_doublewrite->first_free;
 
285
        srv_dblwr_writes++;
 
286
 
 
287
        if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
 
288
                len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
 
289
        } else {
 
290
                len = trx_doublewrite->first_free * UNIV_PAGE_SIZE;
 
291
        }
 
292
 
 
293
        fil_io(OS_FILE_WRITE,
 
294
               TRUE, TRX_SYS_SPACE,
 
295
               trx_doublewrite->block1, 0, len,
 
296
               (void*)trx_doublewrite->write_buf, NULL);
 
297
 
 
298
        write_buf = trx_doublewrite->write_buf;
 
299
 
 
300
        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; len2 += UNIV_PAGE_SIZE) {
 
301
                if (mach_read_from_4(write_buf + len2 + FIL_PAGE_LSN + 4)
 
302
                    != mach_read_from_4(write_buf + len2 + UNIV_PAGE_SIZE
 
303
                                        - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
 
304
                        ut_print_timestamp(stderr);
 
305
                        fprintf(stderr,
 
306
                                "  InnoDB: ERROR: The page to be written"
 
307
                                " seems corrupt!\n"
 
308
                                "InnoDB: The lsn fields do not match!"
 
309
                                " Noticed in the doublewrite block1.\n");
 
310
                }
 
311
        }
 
312
 
 
313
        if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
 
314
                len = (trx_doublewrite->first_free
 
315
                       - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE;
 
316
 
 
317
                fil_io(OS_FILE_WRITE,
 
318
                       TRUE, TRX_SYS_SPACE,
 
319
                       trx_doublewrite->block2, 0, len,
 
320
                       (void*)(trx_doublewrite->write_buf
 
321
                               + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
 
322
                               * UNIV_PAGE_SIZE),
 
323
                       NULL);
 
324
 
 
325
                write_buf = trx_doublewrite->write_buf
 
326
                        + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
 
327
                for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
 
328
                     len2 += UNIV_PAGE_SIZE) {
 
329
                        if (mach_read_from_4(write_buf + len2
 
330
                                             + FIL_PAGE_LSN + 4)
 
331
                            != mach_read_from_4(write_buf + len2
 
332
                                                + UNIV_PAGE_SIZE
 
333
                                                - FIL_PAGE_END_LSN_OLD_CHKSUM
 
334
                                                + 4)) {
 
335
                                ut_print_timestamp(stderr);
 
336
                                fprintf(stderr,
 
337
                                        "  InnoDB: ERROR: The page to be"
 
338
                                        " written seems corrupt!\n"
 
339
                                        "InnoDB: The lsn fields do not match!"
 
340
                                        " Noticed in"
 
341
                                        " the doublewrite block2.\n");
 
342
                        }
 
343
                }
 
344
        }
 
345
 
 
346
        /* Now flush the doublewrite buffer data to disk */
 
347
 
 
348
        fil_flush(TRX_SYS_SPACE);
 
349
 
 
350
        /* We know that the writes have been flushed to disk now
 
351
        and in recovery we will find them in the doublewrite buffer
 
352
        blocks. Next do the writes to the intended positions. */
 
353
 
 
354
        for (i = 0; i < trx_doublewrite->first_free; i++) {
 
355
                block = trx_doublewrite->buf_block_arr[i];
 
356
 
 
357
                if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
 
358
                    != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
 
359
                                        - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
 
360
                        ut_print_timestamp(stderr);
 
361
                        fprintf(stderr,
 
362
                                "  InnoDB: ERROR: The page to be written"
 
363
                                " seems corrupt!\n"
 
364
                                "InnoDB: The lsn fields do not match!"
 
365
                                " Noticed in the buffer pool\n"
 
366
                                "InnoDB: after posting and flushing"
 
367
                                " the doublewrite buffer.\n"
 
368
                                "InnoDB: Page buf fix count %lu,"
 
369
                                " io fix %lu, state %lu\n",
 
370
                                (ulong)block->buf_fix_count,
 
371
                                (ulong)block->io_fix,
 
372
                                (ulong)block->state);
 
373
                }
 
374
                ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
375
 
 
376
                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
 
377
                       FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
 
378
                       (void*)block->frame, (void*)block);
 
379
        }
 
380
 
 
381
        /* Wake possible simulated aio thread to actually post the
 
382
        writes to the operating system */
 
383
 
 
384
        os_aio_simulated_wake_handler_threads();
 
385
 
 
386
        /* Wait that all async writes to tablespaces have been posted to
 
387
        the OS */
 
388
 
 
389
        os_aio_wait_until_no_pending_writes();
 
390
 
 
391
        /* Now we flush the data to disk (for example, with fsync) */
 
392
 
 
393
        fil_flush_file_spaces(FIL_TABLESPACE);
 
394
 
 
395
        /* We can now reuse the doublewrite memory buffer: */
 
396
 
 
397
        trx_doublewrite->first_free = 0;
 
398
 
 
399
        mutex_exit(&(trx_doublewrite->mutex));
 
400
}
 
401
 
 
402
/************************************************************************
 
403
Posts a buffer page for writing. If the doublewrite memory buffer is
 
404
full, calls buf_flush_buffered_writes and waits for for free space to
 
405
appear. */
 
406
static
 
407
void
 
408
buf_flush_post_to_doublewrite_buf(
 
409
/*==============================*/
 
410
        buf_block_t*    block)  /* in: buffer block to write */
 
411
{
 
412
try_again:
 
413
        mutex_enter(&(trx_doublewrite->mutex));
 
414
 
 
415
        ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
416
 
 
417
        if (trx_doublewrite->first_free
 
418
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
 
419
                mutex_exit(&(trx_doublewrite->mutex));
 
420
 
 
421
                buf_flush_buffered_writes();
 
422
 
 
423
                goto try_again;
 
424
        }
 
425
 
 
426
        ut_memcpy(trx_doublewrite->write_buf
 
427
                  + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
 
428
                  block->frame, UNIV_PAGE_SIZE);
 
429
 
 
430
        trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block;
 
431
 
 
432
        trx_doublewrite->first_free++;
 
433
 
 
434
        if (trx_doublewrite->first_free
 
435
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
 
436
                mutex_exit(&(trx_doublewrite->mutex));
 
437
 
 
438
                buf_flush_buffered_writes();
 
439
 
 
440
                return;
 
441
        }
 
442
 
 
443
        mutex_exit(&(trx_doublewrite->mutex));
 
444
}
 
445
 
 
446
/************************************************************************
 
447
Initializes a page for writing to the tablespace. */
 
448
 
 
449
void
 
450
buf_flush_init_for_writing(
 
451
/*=======================*/
 
452
        byte*   page,           /* in: page */
 
453
        dulint  newest_lsn,     /* in: newest modification lsn to the page */
 
454
        ulint   space,          /* in: space id */
 
455
        ulint   page_no)        /* in: page number */
 
456
{
 
457
        /* Write the newest modification lsn to the page header and trailer */
 
458
        mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
 
459
 
 
460
        mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
 
461
                        newest_lsn);
 
462
        /* Write the page number and the space id */
 
463
 
 
464
        mach_write_to_4(page + FIL_PAGE_OFFSET, page_no);
 
465
        mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space);
 
466
 
 
467
        /* Store the new formula checksum */
 
468
 
 
469
        mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
 
470
                        srv_use_checksums
 
471
                        ? buf_calc_page_new_checksum(page)
 
472
                        : BUF_NO_CHECKSUM_MAGIC);
 
473
 
 
474
        /* We overwrite the first 4 bytes of the end lsn field to store
 
475
        the old formula checksum. Since it depends also on the field
 
476
        FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
 
477
        new formula checksum. */
 
478
 
 
479
        mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
 
480
                        srv_use_checksums
 
481
                        ? buf_calc_page_old_checksum(page)
 
482
                        : BUF_NO_CHECKSUM_MAGIC);
 
483
}
 
484
 
 
485
/************************************************************************
 
486
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
 
487
also when the doublewrite buffer is used, we must call
 
488
buf_flush_buffered_writes after we have posted a batch of writes! */
 
489
static
 
490
void
 
491
buf_flush_write_block_low(
 
492
/*======================*/
 
493
        buf_block_t*    block)  /* in: buffer block to write */
 
494
{
 
495
#ifdef UNIV_LOG_DEBUG
 
496
        static ibool univ_log_debug_warned;
 
497
#endif /* UNIV_LOG_DEBUG */
 
498
        ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
499
 
 
500
#ifdef UNIV_IBUF_DEBUG
 
501
        ut_a(ibuf_count_get(block->space, block->offset) == 0);
 
502
#endif
 
503
        ut_ad(!ut_dulint_is_zero(block->newest_modification));
 
504
 
 
505
#ifdef UNIV_LOG_DEBUG
 
506
        if (!univ_log_debug_warned) {
 
507
                univ_log_debug_warned = TRUE;
 
508
                fputs("Warning: cannot force log to disk if"
 
509
                      " UNIV_LOG_DEBUG is defined!\n"
 
510
                      "Crash recovery will not work!\n",
 
511
                      stderr);
 
512
        }
 
513
#else
 
514
        /* Force the log to the disk before writing the modified block */
 
515
        log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
 
516
#endif
 
517
        buf_flush_init_for_writing(block->frame, block->newest_modification,
 
518
                                   block->space, block->offset);
 
519
        if (!srv_use_doublewrite_buf || !trx_doublewrite) {
 
520
                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
 
521
                       FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
 
522
                       (void*)block->frame, (void*)block);
 
523
        } else {
 
524
                buf_flush_post_to_doublewrite_buf(block);
 
525
        }
 
526
}
 
527
 
 
528
/************************************************************************
 
529
Writes a page asynchronously from the buffer buf_pool to a file, if it can be
 
530
found in the buf_pool and it is in a flushable state. NOTE: in simulated aio
 
531
we must call os_aio_simulated_wake_handler_threads after we have posted a batch
 
532
of writes! */
 
533
static
 
534
ulint
 
535
buf_flush_try_page(
 
536
/*===============*/
 
537
                                /* out: 1 if a page was flushed, 0 otherwise */
 
538
        ulint   space,          /* in: space id */
 
539
        ulint   offset,         /* in: page offset */
 
540
        ulint   flush_type)     /* in: BUF_FLUSH_LRU, BUF_FLUSH_LIST, or
 
541
                                BUF_FLUSH_SINGLE_PAGE */
 
542
{
 
543
        buf_block_t*    block;
 
544
        ibool           locked;
 
545
 
 
546
        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
 
547
              || flush_type == BUF_FLUSH_SINGLE_PAGE);
 
548
 
 
549
        mutex_enter(&(buf_pool->mutex));
 
550
 
 
551
        block = buf_page_hash_get(space, offset);
 
552
 
 
553
        ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
 
554
 
 
555
        if (!block) {
 
556
                mutex_exit(&(buf_pool->mutex));
 
557
                return(0);
 
558
        }
 
559
 
 
560
        mutex_enter(&block->mutex);
 
561
 
 
562
        if (flush_type == BUF_FLUSH_LIST
 
563
            && buf_flush_ready_for_flush(block, flush_type)) {
 
564
 
 
565
                block->io_fix = BUF_IO_WRITE;
 
566
 
 
567
                /* If AWE is enabled and the page is not mapped to a frame,
 
568
                then map it */
 
569
 
 
570
                if (block->frame == NULL) {
 
571
                        ut_a(srv_use_awe);
 
572
 
 
573
                        /* We set second parameter TRUE because the block is
 
574
                        in the LRU list and we must put it to
 
575
                        awe_LRU_free_mapped list once mapped to a frame */
 
576
 
 
577
                        buf_awe_map_page_to_frame(block, TRUE);
 
578
                }
 
579
 
 
580
                block->flush_type = flush_type;
 
581
 
 
582
                if (buf_pool->n_flush[flush_type] == 0) {
 
583
 
 
584
                        os_event_reset(buf_pool->no_flush[flush_type]);
 
585
                }
 
586
 
 
587
                (buf_pool->n_flush[flush_type])++;
 
588
 
 
589
                locked = FALSE;
 
590
 
 
591
                /* If the simulated aio thread is not running, we must
 
592
                not wait for any latch, as we may end up in a deadlock:
 
593
                if buf_fix_count == 0, then we know we need not wait */
 
594
 
 
595
                if (block->buf_fix_count == 0) {
 
596
                        rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
 
597
 
 
598
                        locked = TRUE;
 
599
                }
 
600
 
 
601
                mutex_exit(&block->mutex);
 
602
                mutex_exit(&(buf_pool->mutex));
 
603
 
 
604
                if (!locked) {
 
605
                        buf_flush_buffered_writes();
 
606
 
 
607
                        rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
 
608
                }
 
609
 
 
610
#ifdef UNIV_DEBUG
 
611
                if (buf_debug_prints) {
 
612
                        fprintf(stderr,
 
613
                                "Flushing page space %lu, page no %lu \n",
 
614
                                (ulong) block->space, (ulong) block->offset);
 
615
                }
 
616
#endif /* UNIV_DEBUG */
 
617
 
 
618
                buf_flush_write_block_low(block);
 
619
 
 
620
                return(1);
 
621
 
 
622
        } else if (flush_type == BUF_FLUSH_LRU
 
623
                   && buf_flush_ready_for_flush(block, flush_type)) {
 
624
 
 
625
                /* VERY IMPORTANT:
 
626
                Because any thread may call the LRU flush, even when owning
 
627
                locks on pages, to avoid deadlocks, we must make sure that the
 
628
                s-lock is acquired on the page without waiting: this is
 
629
                accomplished because in the if-condition above we require
 
630
                the page not to be bufferfixed (in function
 
631
                ..._ready_for_flush). */
 
632
 
 
633
                block->io_fix = BUF_IO_WRITE;
 
634
 
 
635
                /* If AWE is enabled and the page is not mapped to a frame,
 
636
                then map it */
 
637
 
 
638
                if (block->frame == NULL) {
 
639
                        ut_a(srv_use_awe);
 
640
 
 
641
                        /* We set second parameter TRUE because the block is
 
642
                        in the LRU list and we must put it to
 
643
                        awe_LRU_free_mapped list once mapped to a frame */
 
644
 
 
645
                        buf_awe_map_page_to_frame(block, TRUE);
 
646
                }
 
647
 
 
648
                block->flush_type = flush_type;
 
649
 
 
650
                if (buf_pool->n_flush[flush_type] == 0) {
 
651
 
 
652
                        os_event_reset(buf_pool->no_flush[flush_type]);
 
653
                }
 
654
 
 
655
                (buf_pool->n_flush[flush_type])++;
 
656
 
 
657
                rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
 
658
 
 
659
                /* Note that the s-latch is acquired before releasing the
 
660
                buf_pool mutex: this ensures that the latch is acquired
 
661
                immediately. */
 
662
 
 
663
                mutex_exit(&block->mutex);
 
664
                mutex_exit(&(buf_pool->mutex));
 
665
 
 
666
                buf_flush_write_block_low(block);
 
667
 
 
668
                return(1);
 
669
 
 
670
        } else if (flush_type == BUF_FLUSH_SINGLE_PAGE
 
671
                   && buf_flush_ready_for_flush(block, flush_type)) {
 
672
 
 
673
                block->io_fix = BUF_IO_WRITE;
 
674
 
 
675
                /* If AWE is enabled and the page is not mapped to a frame,
 
676
                then map it */
 
677
 
 
678
                if (block->frame == NULL) {
 
679
                        ut_a(srv_use_awe);
 
680
 
 
681
                        /* We set second parameter TRUE because the block is
 
682
                        in the LRU list and we must put it to
 
683
                        awe_LRU_free_mapped list once mapped to a frame */
 
684
 
 
685
                        buf_awe_map_page_to_frame(block, TRUE);
 
686
                }
 
687
 
 
688
                block->flush_type = flush_type;
 
689
 
 
690
                if (buf_pool->n_flush[block->flush_type] == 0) {
 
691
 
 
692
                        os_event_reset(buf_pool->no_flush[block->flush_type]);
 
693
                }
 
694
 
 
695
                (buf_pool->n_flush[flush_type])++;
 
696
 
 
697
                mutex_exit(&block->mutex);
 
698
                mutex_exit(&(buf_pool->mutex));
 
699
 
 
700
                rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
 
701
 
 
702
#ifdef UNIV_DEBUG
 
703
                if (buf_debug_prints) {
 
704
                        fprintf(stderr,
 
705
                                "Flushing single page space %lu,"
 
706
                                " page no %lu \n",
 
707
                                (ulong) block->space,
 
708
                                (ulong) block->offset);
 
709
                }
 
710
#endif /* UNIV_DEBUG */
 
711
 
 
712
                buf_flush_write_block_low(block);
 
713
 
 
714
                return(1);
 
715
        }
 
716
 
 
717
        mutex_exit(&block->mutex);
 
718
        mutex_exit(&(buf_pool->mutex));
 
719
 
 
720
        return(0);
 
721
}
 
722
 
 
723
/***************************************************************
 
724
Flushes to disk all flushable pages within the flush area. */
 
725
static
 
726
ulint
 
727
buf_flush_try_neighbors(
 
728
/*====================*/
 
729
                                /* out: number of pages flushed */
 
730
        ulint   space,          /* in: space id */
 
731
        ulint   offset,         /* in: page offset */
 
732
        ulint   flush_type)     /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
 
733
{
 
734
        buf_block_t*    block;
 
735
        ulint           low, high;
 
736
        ulint           count           = 0;
 
737
        ulint           i;
 
738
 
 
739
        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
 
740
 
 
741
        low = (offset / BUF_FLUSH_AREA) * BUF_FLUSH_AREA;
 
742
        high = (offset / BUF_FLUSH_AREA + 1) * BUF_FLUSH_AREA;
 
743
 
 
744
        if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
 
745
                /* If there is little space, it is better not to flush any
 
746
                block except from the end of the LRU list */
 
747
 
 
748
                low = offset;
 
749
                high = offset + 1;
 
750
        }
 
751
 
 
752
        /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
 
753
 
 
754
        if (high > fil_space_get_size(space)) {
 
755
                high = fil_space_get_size(space);
 
756
        }
 
757
 
 
758
        mutex_enter(&(buf_pool->mutex));
 
759
 
 
760
        for (i = low; i < high; i++) {
 
761
 
 
762
                block = buf_page_hash_get(space, i);
 
763
                ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
 
764
 
 
765
                if (!block) {
 
766
 
 
767
                        continue;
 
768
 
 
769
                } else if (flush_type == BUF_FLUSH_LRU && i != offset
 
770
                           && !block->old) {
 
771
 
 
772
                        /* We avoid flushing 'non-old' blocks in an LRU flush,
 
773
                        because the flushed blocks are soon freed */
 
774
 
 
775
                        continue;
 
776
                } else {
 
777
 
 
778
                        mutex_enter(&block->mutex);
 
779
 
 
780
                        if (buf_flush_ready_for_flush(block, flush_type)
 
781
                            && (i == offset || block->buf_fix_count == 0)) {
 
782
                                /* We only try to flush those
 
783
                                neighbors != offset where the buf fix count is
 
784
                                zero, as we then know that we probably can
 
785
                                latch the page without a semaphore wait.
 
786
                                Semaphore waits are expensive because we must
 
787
                                flush the doublewrite buffer before we start
 
788
                                waiting. */
 
789
 
 
790
                                mutex_exit(&block->mutex);
 
791
 
 
792
                                mutex_exit(&(buf_pool->mutex));
 
793
 
 
794
                                /* Note: as we release the buf_pool mutex
 
795
                                above, in buf_flush_try_page we cannot be sure
 
796
                                the page is still in a flushable state:
 
797
                                therefore we check it again inside that
 
798
                                function. */
 
799
 
 
800
                                count += buf_flush_try_page(space, i,
 
801
                                                            flush_type);
 
802
 
 
803
                                mutex_enter(&(buf_pool->mutex));
 
804
                        } else {
 
805
                                mutex_exit(&block->mutex);
 
806
                        }
 
807
                }
 
808
        }
 
809
 
 
810
        mutex_exit(&(buf_pool->mutex));
 
811
 
 
812
        return(count);
 
813
}
 
814
 
 
815
/***********************************************************************
 
816
This utility flushes dirty blocks from the end of the LRU list or flush_list.
 
817
NOTE 1: in the case of an LRU flush the calling thread may own latches to
 
818
pages: to avoid deadlocks, this function must be written so that it cannot
 
819
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
 
820
the calling thread is not allowed to own any latches on pages! */
 
821
 
 
822
ulint
 
823
buf_flush_batch(
 
824
/*============*/
 
825
                                /* out: number of blocks for which the write
 
826
                                request was queued; ULINT_UNDEFINED if there
 
827
                                was a flush of the same type already running */
 
828
        ulint   flush_type,     /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
 
829
                                BUF_FLUSH_LIST, then the caller must not own
 
830
                                any latches on pages */
 
831
        ulint   min_n,          /* in: wished minimum mumber of blocks flushed
 
832
                                (it is not guaranteed that the actual number
 
833
                                is that big, though) */
 
834
        dulint  lsn_limit)      /* in the case BUF_FLUSH_LIST all blocks whose
 
835
                                oldest_modification is smaller than this
 
836
                                should be flushed (if their number does not
 
837
                                exceed min_n), otherwise ignored */
 
838
{
 
839
        buf_block_t*    block;
 
840
        ulint           page_count      = 0;
 
841
        ulint           old_page_count;
 
842
        ulint           space;
 
843
        ulint           offset;
 
844
        ibool           found;
 
845
 
 
846
        ut_ad((flush_type == BUF_FLUSH_LRU)
 
847
              || (flush_type == BUF_FLUSH_LIST));
 
848
#ifdef UNIV_SYNC_DEBUG
 
849
        ut_ad((flush_type != BUF_FLUSH_LIST)
 
850
              || sync_thread_levels_empty_gen(TRUE));
 
851
#endif /* UNIV_SYNC_DEBUG */
 
852
        mutex_enter(&(buf_pool->mutex));
 
853
 
 
854
        if ((buf_pool->n_flush[flush_type] > 0)
 
855
            || (buf_pool->init_flush[flush_type] == TRUE)) {
 
856
 
 
857
                /* There is already a flush batch of the same type running */
 
858
 
 
859
                mutex_exit(&(buf_pool->mutex));
 
860
 
 
861
                return(ULINT_UNDEFINED);
 
862
        }
 
863
 
 
864
        (buf_pool->init_flush)[flush_type] = TRUE;
 
865
 
 
866
        for (;;) {
 
867
                /* If we have flushed enough, leave the loop */
 
868
                if (page_count >= min_n) {
 
869
 
 
870
                        break;
 
871
                }
 
872
 
 
873
                /* Start from the end of the list looking for a suitable
 
874
                block to be flushed. */
 
875
 
 
876
                if (flush_type == BUF_FLUSH_LRU) {
 
877
                        block = UT_LIST_GET_LAST(buf_pool->LRU);
 
878
                } else {
 
879
                        ut_ad(flush_type == BUF_FLUSH_LIST);
 
880
 
 
881
                        block = UT_LIST_GET_LAST(buf_pool->flush_list);
 
882
                        if (!block
 
883
                            || (ut_dulint_cmp(block->oldest_modification,
 
884
                                              lsn_limit) >= 0)) {
 
885
                                /* We have flushed enough */
 
886
 
 
887
                                break;
 
888
                        }
 
889
                }
 
890
 
 
891
                found = FALSE;
 
892
 
 
893
                /* Note that after finding a single flushable page, we try to
 
894
                flush also all its neighbors, and after that start from the
 
895
                END of the LRU list or flush list again: the list may change
 
896
                during the flushing and we cannot safely preserve within this
 
897
                function a pointer to a block in the list! */
 
898
 
 
899
                while ((block != NULL) && !found) {
 
900
                        ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
901
 
 
902
                        mutex_enter(&block->mutex);
 
903
 
 
904
                        if (buf_flush_ready_for_flush(block, flush_type)) {
 
905
 
 
906
                                found = TRUE;
 
907
                                space = block->space;
 
908
                                offset = block->offset;
 
909
 
 
910
                                mutex_exit(&block->mutex);
 
911
                                mutex_exit(&(buf_pool->mutex));
 
912
 
 
913
                                old_page_count = page_count;
 
914
 
 
915
                                /* Try to flush also all the neighbors */
 
916
                                page_count += buf_flush_try_neighbors(
 
917
                                        space, offset, flush_type);
 
918
                                /* fprintf(stderr,
 
919
                                "Flush type %lu, page no %lu, neighb %lu\n",
 
920
                                flush_type, offset,
 
921
                                page_count - old_page_count); */
 
922
 
 
923
                                mutex_enter(&(buf_pool->mutex));
 
924
 
 
925
                        } else if (flush_type == BUF_FLUSH_LRU) {
 
926
 
 
927
                                mutex_exit(&block->mutex);
 
928
 
 
929
                                block = UT_LIST_GET_PREV(LRU, block);
 
930
                        } else {
 
931
                                ut_ad(flush_type == BUF_FLUSH_LIST);
 
932
 
 
933
                                mutex_exit(&block->mutex);
 
934
 
 
935
                                block = UT_LIST_GET_PREV(flush_list, block);
 
936
                        }
 
937
                }
 
938
 
 
939
                /* If we could not find anything to flush, leave the loop */
 
940
 
 
941
                if (!found) {
 
942
                        break;
 
943
                }
 
944
        }
 
945
 
 
946
        (buf_pool->init_flush)[flush_type] = FALSE;
 
947
 
 
948
        if ((buf_pool->n_flush[flush_type] == 0)
 
949
            && (buf_pool->init_flush[flush_type] == FALSE)) {
 
950
 
 
951
                /* The running flush batch has ended */
 
952
 
 
953
                os_event_set(buf_pool->no_flush[flush_type]);
 
954
        }
 
955
 
 
956
        mutex_exit(&(buf_pool->mutex));
 
957
 
 
958
        buf_flush_buffered_writes();
 
959
 
 
960
#ifdef UNIV_DEBUG
 
961
        if (buf_debug_prints && page_count > 0) {
 
962
                ut_a(flush_type == BUF_FLUSH_LRU
 
963
                     || flush_type == BUF_FLUSH_LIST);
 
964
                fprintf(stderr, flush_type == BUF_FLUSH_LRU
 
965
                        ? "Flushed %lu pages in LRU flush\n"
 
966
                        : "Flushed %lu pages in flush list flush\n",
 
967
                        (ulong) page_count);
 
968
        }
 
969
#endif /* UNIV_DEBUG */
 
970
 
 
971
        srv_buf_pool_flushed += page_count;
 
972
 
 
973
        return(page_count);
 
974
}
 
975
 
 
976
/**********************************************************************
 
977
Waits until a flush batch of the given type ends */
 
978
 
 
979
void
 
980
buf_flush_wait_batch_end(
 
981
/*=====================*/
 
982
        ulint   type)   /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
 
983
{
 
984
        ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));
 
985
 
 
986
        os_event_wait(buf_pool->no_flush[type]);
 
987
}
 
988
 
 
989
/**********************************************************************
 
990
Gives a recommendation of how many blocks should be flushed to establish
 
991
a big enough margin of replaceable blocks near the end of the LRU list
 
992
and in the free list. */
 
993
static
 
994
ulint
 
995
buf_flush_LRU_recommendation(void)
 
996
/*==============================*/
 
997
                        /* out: number of blocks which should be flushed
 
998
                        from the end of the LRU list */
 
999
{
 
1000
        buf_block_t*    block;
 
1001
        ulint           n_replaceable;
 
1002
        ulint           distance        = 0;
 
1003
 
 
1004
        mutex_enter(&(buf_pool->mutex));
 
1005
 
 
1006
        n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
 
1007
 
 
1008
        block = UT_LIST_GET_LAST(buf_pool->LRU);
 
1009
 
 
1010
        while ((block != NULL)
 
1011
               && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
 
1012
                   + BUF_FLUSH_EXTRA_MARGIN)
 
1013
               && (distance < BUF_LRU_FREE_SEARCH_LEN)) {
 
1014
 
 
1015
                mutex_enter(&block->mutex);
 
1016
 
 
1017
                if (buf_flush_ready_for_replace(block)) {
 
1018
                        n_replaceable++;
 
1019
                }
 
1020
 
 
1021
                mutex_exit(&block->mutex);
 
1022
 
 
1023
                distance++;
 
1024
 
 
1025
                block = UT_LIST_GET_PREV(LRU, block);
 
1026
        }
 
1027
 
 
1028
        mutex_exit(&(buf_pool->mutex));
 
1029
 
 
1030
        if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {
 
1031
 
 
1032
                return(0);
 
1033
        }
 
1034
 
 
1035
        return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
 
1036
               - n_replaceable);
 
1037
}
 
1038
 
 
1039
/*************************************************************************
 
1040
Flushes pages from the end of the LRU list if there is too small a margin
 
1041
of replaceable pages there or in the free list. VERY IMPORTANT: this function
 
1042
is called also by threads which have locks on pages. To avoid deadlocks, we
 
1043
flush only pages such that the s-lock required for flushing can be acquired
 
1044
immediately, without waiting. */
 
1045
 
 
1046
void
 
1047
buf_flush_free_margin(void)
 
1048
/*=======================*/
 
1049
{
 
1050
        ulint   n_to_flush;
 
1051
        ulint   n_flushed;
 
1052
 
 
1053
        n_to_flush = buf_flush_LRU_recommendation();
 
1054
 
 
1055
        if (n_to_flush > 0) {
 
1056
                n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush,
 
1057
                                            ut_dulint_zero);
 
1058
                if (n_flushed == ULINT_UNDEFINED) {
 
1059
                        /* There was an LRU type flush batch already running;
 
1060
                        let us wait for it to end */
 
1061
 
 
1062
                        buf_flush_wait_batch_end(BUF_FLUSH_LRU);
 
1063
                }
 
1064
        }
 
1065
}
 
1066
 
 
1067
/**********************************************************************
 
1068
Validates the flush list. */
 
1069
static
 
1070
ibool
 
1071
buf_flush_validate_low(void)
 
1072
/*========================*/
 
1073
                /* out: TRUE if ok */
 
1074
{
 
1075
        buf_block_t*    block;
 
1076
        dulint          om;
 
1077
 
 
1078
        UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list);
 
1079
 
 
1080
        block = UT_LIST_GET_FIRST(buf_pool->flush_list);
 
1081
 
 
1082
        while (block != NULL) {
 
1083
                om = block->oldest_modification;
 
1084
                ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
1085
                ut_a(ut_dulint_cmp(om, ut_dulint_zero) > 0);
 
1086
 
 
1087
                block = UT_LIST_GET_NEXT(flush_list, block);
 
1088
 
 
1089
                if (block) {
 
1090
                        ut_a(ut_dulint_cmp(om, block->oldest_modification)
 
1091
                             >= 0);
 
1092
                }
 
1093
        }
 
1094
 
 
1095
        return(TRUE);
 
1096
}
 
1097
 
 
1098
/**********************************************************************
 
1099
Validates the flush list. */
 
1100
 
 
1101
ibool
 
1102
buf_flush_validate(void)
 
1103
/*====================*/
 
1104
                /* out: TRUE if ok */
 
1105
{
 
1106
        ibool   ret;
 
1107
 
 
1108
        mutex_enter(&(buf_pool->mutex));
 
1109
 
 
1110
        ret = buf_flush_validate_low();
 
1111
 
 
1112
        mutex_exit(&(buf_pool->mutex));
 
1113
 
 
1114
        return(ret);
 
1115
}