~drizzle-trunk/drizzle/development

« back to all changes in this revision

Viewing changes to storage/innobase/buf/buf0flu.c

pandora-build v0.72 - Moved remaining hard-coded tests into pandora-build
macros.
Add PANDORA_DRIZZLE_BUILD to run the extra checks that drizzle needs that 
plugins would also need to run so we can just use that macro in generated
external plugin builds.
Added support to register_plugins for external plugin building.
Renamed register_plugins.py to pandora-plugin.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/******************************************************
2
 
The database buffer buf_pool flush algorithm
3
 
 
4
 
(c) 1995-2001 Innobase Oy
5
 
 
6
 
Created 11/11/1995 Heikki Tuuri
7
 
*******************************************************/
8
 
 
9
 
#include "buf0flu.h"
10
 
 
11
 
#ifdef UNIV_NONINL
12
 
#include "buf0flu.ic"
13
 
#include "trx0sys.h"
14
 
#endif
15
 
 
16
 
#include "ut0byte.h"
17
 
#include "ut0lst.h"
18
 
#include "page0page.h"
19
 
#include "fil0fil.h"
20
 
#include "buf0buf.h"
21
 
#include "buf0lru.h"
22
 
#include "buf0rea.h"
23
 
#include "ibuf0ibuf.h"
24
 
#include "log0log.h"
25
 
#include "os0file.h"
26
 
#include "trx0sys.h"
27
 
#include "srv0srv.h"
28
 
 
29
 
/* When flushed, dirty blocks are searched in neighborhoods of this size, and
flushed along with the original page. The area is capped at 1/16 of the
current buffer pool size so that a small pool is not dominated by
neighbor flushing. */

#define BUF_FLUSH_AREA          ut_min(BUF_READ_AHEAD_AREA,\
                buf_pool->curr_size / 16)
34
 
 
35
 
/**********************************************************************
Validates the flush list. Used only from ut_ad() debug assertions in this
file; the visible callers invoke it while holding buf_pool->mutex. */
static
ibool
buf_flush_validate_low(void);
/*========================*/
                /* out: TRUE if ok */
42
 
 
43
 
/************************************************************************
Inserts a modified block into the flush list. The caller must hold
buf_pool->mutex. The flush list is kept ordered by decreasing
oldest_modification, so a newly modified block (with the newest
oldest_modification) goes to the front; the assertion below checks that
ordering invariant against the current head. */

void
buf_flush_insert_into_flush_list(
/*=============================*/
        buf_block_t*    block)  /* in: block which is modified */
{
        ut_ad(mutex_own(&(buf_pool->mutex)));
        ut_a(block->state == BUF_BLOCK_FILE_PAGE);

        /* The new block's oldest_modification must be >= that of the
        current list head, otherwise adding at the front would break the
        descending order of the flush list. */
        ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
              || (ut_dulint_cmp((UT_LIST_GET_FIRST(buf_pool->flush_list))
                                ->oldest_modification,
                                block->oldest_modification) <= 0));

        UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);

        ut_ad(buf_flush_validate_low());
}
63
 
 
64
 
/************************************************************************
65
 
Inserts a modified block into the flush list in the right sorted position.
66
 
This function is used by recovery, because there the modifications do not
67
 
necessarily come in the order of lsn's. */
68
 
 
69
 
void
70
 
buf_flush_insert_sorted_into_flush_list(
71
 
/*====================================*/
72
 
        buf_block_t*    block)  /* in: block which is modified */
73
 
{
74
 
        buf_block_t*    prev_b;
75
 
        buf_block_t*    b;
76
 
 
77
 
        ut_ad(mutex_own(&(buf_pool->mutex)));
78
 
 
79
 
        prev_b = NULL;
80
 
        b = UT_LIST_GET_FIRST(buf_pool->flush_list);
81
 
 
82
 
        while (b && (ut_dulint_cmp(b->oldest_modification,
83
 
                                   block->oldest_modification) > 0)) {
84
 
                prev_b = b;
85
 
                b = UT_LIST_GET_NEXT(flush_list, b);
86
 
        }
87
 
 
88
 
        if (prev_b == NULL) {
89
 
                UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);
90
 
        } else {
91
 
                UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b,
92
 
                                     block);
93
 
        }
94
 
 
95
 
        ut_ad(buf_flush_validate_low());
96
 
}
97
 
 
98
 
/************************************************************************
99
 
Returns TRUE if the file page block is immediately suitable for replacement,
100
 
i.e., the transition FILE_PAGE => NOT_USED allowed. */
101
 
 
102
 
ibool
103
 
buf_flush_ready_for_replace(
104
 
/*========================*/
105
 
                                /* out: TRUE if can replace immediately */
106
 
        buf_block_t*    block)  /* in: buffer control block, must be in state
107
 
                                BUF_BLOCK_FILE_PAGE and in the LRU list */
108
 
{
109
 
        ut_ad(mutex_own(&(buf_pool->mutex)));
110
 
        ut_ad(mutex_own(&block->mutex));
111
 
        if (block->state != BUF_BLOCK_FILE_PAGE) {
112
 
                ut_print_timestamp(stderr);
113
 
                fprintf(stderr,
114
 
                        "  InnoDB: Error: buffer block state %lu"
115
 
                        " in the LRU list!\n",
116
 
                        (ulong)block->state);
117
 
                ut_print_buf(stderr, block, sizeof(buf_block_t));
118
 
 
119
 
                return(FALSE);
120
 
        }
121
 
 
122
 
        if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
123
 
            || (block->buf_fix_count != 0)
124
 
            || (block->io_fix != 0)) {
125
 
 
126
 
                return(FALSE);
127
 
        }
128
 
 
129
 
        return(TRUE);
130
 
}
131
 
 
132
 
/************************************************************************
133
 
Returns TRUE if the block is modified and ready for flushing. */
134
 
UNIV_INLINE
135
 
ibool
136
 
buf_flush_ready_for_flush(
137
 
/*======================*/
138
 
                                /* out: TRUE if can flush immediately */
139
 
        buf_block_t*    block,  /* in: buffer control block, must be in state
140
 
                                BUF_BLOCK_FILE_PAGE */
141
 
        ulint           flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
142
 
{
143
 
        ut_ad(mutex_own(&(buf_pool->mutex)));
144
 
        ut_ad(mutex_own(&(block->mutex)));
145
 
        ut_a(block->state == BUF_BLOCK_FILE_PAGE);
146
 
 
147
 
        if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
148
 
            && (block->io_fix == 0)) {
149
 
                if (flush_type != BUF_FLUSH_LRU) {
150
 
 
151
 
                        return(TRUE);
152
 
 
153
 
                } else if (block->buf_fix_count == 0) {
154
 
 
155
 
                        /* If we are flushing the LRU list, to avoid deadlocks
156
 
                        we require the block not to be bufferfixed, and hence
157
 
                        not latched. */
158
 
 
159
 
                        return(TRUE);
160
 
                }
161
 
        }
162
 
 
163
 
        return(FALSE);
164
 
}
165
 
 
166
 
/************************************************************************
Updates the flush system data structures when a write is completed. The
buf_pool->mutex must be held (asserted under UNIV_SYNC_DEBUG). */

void
buf_flush_write_complete(
/*=====================*/
        buf_block_t*    block)  /* in: pointer to the block in question */
{
        ut_ad(block);
#ifdef UNIV_SYNC_DEBUG
        ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
        ut_a(block->state == BUF_BLOCK_FILE_PAGE);

        /* The page is now clean on disk: clear its modification lsn and
        remove it from the flush list. */
        block->oldest_modification = ut_dulint_zero;

        UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block);

        ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list));

        /* One fewer pending write for the flush type that posted this
        write (recorded in block->flush_type at post time). */
        (buf_pool->n_flush[block->flush_type])--;

        if (block->flush_type == BUF_FLUSH_LRU) {
                /* Put the block to the end of the LRU list to wait to be
                moved to the free list */

                buf_LRU_make_block_old(block);

                buf_pool->LRU_flush_ended++;
        }

        /* fprintf(stderr, "n pending flush %lu\n",
        buf_pool->n_flush[block->flush_type]); */

        if ((buf_pool->n_flush[block->flush_type] == 0)
            && (buf_pool->init_flush[block->flush_type] == FALSE)) {

                /* The running flush batch has ended: signal any waiters
                blocked on the no_flush event for this flush type. */

                os_event_set(buf_pool->no_flush[block->flush_type]);
        }
}
208
 
 
209
 
/************************************************************************
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
        buf_block_t*    block;
        byte*           write_buf;
        ulint           len;
        ulint           len2;
        ulint           i;

        /* Without a doublewrite buffer there is nothing buffered here;
        just make sure the simulated aio handler threads run. */
        if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
                os_aio_simulated_wake_handler_threads();

                return;
        }

        mutex_enter(&(trx_doublewrite->mutex));

        /* Write first to doublewrite buffer blocks. We use synchronous
        aio and thus know that file write has been completed when the
        control returns. */

        if (trx_doublewrite->first_free == 0) {

                mutex_exit(&(trx_doublewrite->mutex));

                return;
        }

        /* Sanity-check every buffered page before it is written to the
        doublewrite area: the lsn stored in the page header must match
        the one in the page trailer. */
        for (i = 0; i < trx_doublewrite->first_free; i++) {

                block = trx_doublewrite->buf_block_arr[i];
                ut_a(block->state == BUF_BLOCK_FILE_PAGE);

                if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
                    != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
                                        - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: before posting to the"
                                " doublewrite buffer.\n");
                }

                if (block->check_index_page_at_flush
                    && !page_simple_validate(block->frame)) {

                        buf_page_print(block->frame);

                        /* A corrupt index page must never reach the data
                        file: crash deliberately instead. */
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: Apparent corruption of an"
                                " index page n:o %lu in space %lu\n"
                                "InnoDB: to be written to data file."
                                " We intentionally crash server\n"
                                "InnoDB: to prevent corrupt data"
                                " from ending up in data\n"
                                "InnoDB: files.\n",
                                (ulong) block->offset, (ulong) block->space);

                        ut_error;
                }
        }

        /* increment the doublewrite flushed pages counter */
        srv_dblwr_pages_written+= trx_doublewrite->first_free;
        srv_dblwr_writes++;

        /* The first fil_io covers at most one doublewrite block's worth
        of pages (block1); any overflow goes to block2 below. */
        if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
        } else {
                len = trx_doublewrite->first_free * UNIV_PAGE_SIZE;
        }

        fil_io(OS_FILE_WRITE,
               TRUE, TRX_SYS_SPACE,
               trx_doublewrite->block1, 0, len,
               (void*)trx_doublewrite->write_buf, NULL);

        write_buf = trx_doublewrite->write_buf;

        /* Re-verify the header/trailer lsn match of what was just
        written from the in-memory doublewrite buffer (block1 part). */
        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; len2 += UNIV_PAGE_SIZE) {
                if (mach_read_from_4(write_buf + len2 + FIL_PAGE_LSN + 4)
                    != mach_read_from_4(write_buf + len2 + UNIV_PAGE_SIZE
                                        - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the doublewrite block1.\n");
                }
        }

        /* Write the remaining pages, if any, to the second doublewrite
        block and verify them the same way. */
        if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                len = (trx_doublewrite->first_free
                       - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE;

                fil_io(OS_FILE_WRITE,
                       TRUE, TRX_SYS_SPACE,
                       trx_doublewrite->block2, 0, len,
                       (void*)(trx_doublewrite->write_buf
                               + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
                               * UNIV_PAGE_SIZE),
                       NULL);

                write_buf = trx_doublewrite->write_buf
                        + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
                for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
                     len2 += UNIV_PAGE_SIZE) {
                        if (mach_read_from_4(write_buf + len2
                                             + FIL_PAGE_LSN + 4)
                            != mach_read_from_4(write_buf + len2
                                                + UNIV_PAGE_SIZE
                                                - FIL_PAGE_END_LSN_OLD_CHKSUM
                                                + 4)) {
                                ut_print_timestamp(stderr);
                                fprintf(stderr,
                                        "  InnoDB: ERROR: The page to be"
                                        " written seems corrupt!\n"
                                        "InnoDB: The lsn fields do not match!"
                                        " Noticed in"
                                        " the doublewrite block2.\n");
                        }
                }
        }

        /* Now flush the doublewrite buffer data to disk */

        fil_flush(TRX_SYS_SPACE);

        /* We know that the writes have been flushed to disk now
        and in recovery we will find them in the doublewrite buffer
        blocks. Next do the writes to the intended positions. */

        for (i = 0; i < trx_doublewrite->first_free; i++) {
                block = trx_doublewrite->buf_block_arr[i];

                if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
                    != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
                                        - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: after posting and flushing"
                                " the doublewrite buffer.\n"
                                "InnoDB: Page buf fix count %lu,"
                                " io fix %lu, state %lu\n",
                                (ulong)block->buf_fix_count,
                                (ulong)block->io_fix,
                                (ulong)block->state);
                }
                ut_a(block->state == BUF_BLOCK_FILE_PAGE);

                /* Asynchronous write to the page's real location; the
                handler threads are woken later, in one batch. */
                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
                       (void*)block->frame, (void*)block);
        }

        /* Wake possible simulated aio thread to actually post the
        writes to the operating system */

        os_aio_simulated_wake_handler_threads();

        /* Wait that all async writes to tablespaces have been posted to
        the OS */

        os_aio_wait_until_no_pending_writes();

        /* Now we flush the data to disk (for example, with fsync) */

        fil_flush_file_spaces(FIL_TABLESPACE);

        /* We can now reuse the doublewrite memory buffer: */

        trx_doublewrite->first_free = 0;

        mutex_exit(&(trx_doublewrite->mutex));
}
401
 
 
402
 
/************************************************************************
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
        buf_block_t*    block)  /* in: buffer block to write */
{
try_again:
        mutex_enter(&(trx_doublewrite->mutex));

        ut_a(block->state == BUF_BLOCK_FILE_PAGE);

        /* If the buffer is already full, flush it (releasing the mutex
        first, since buf_flush_buffered_writes acquires it itself) and
        retry from the top. */
        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                goto try_again;
        }

        /* Copy the page image into the next free slot and remember the
        block so the write can be completed later. */
        ut_memcpy(trx_doublewrite->write_buf
                  + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                  block->frame, UNIV_PAGE_SIZE);

        trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block;

        trx_doublewrite->first_free++;

        /* If this post filled the buffer, flush it right away. */
        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                return;
        }

        mutex_exit(&(trx_doublewrite->mutex));
}
445
 
 
446
 
/************************************************************************
Initializes a page for writing to the tablespace: stamps the lsn, the page
number and space id, and both checksum formulas into the page frame. */

void
buf_flush_init_for_writing(
/*=======================*/
        byte*   page,           /* in: page */
        dulint  newest_lsn,     /* in: newest modification lsn to the page */
        ulint   space,          /* in: space id */
        ulint   page_no)        /* in: page number */
{
        /* Write the newest modification lsn to the page header and trailer */
        mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);

        mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                        newest_lsn);
        /* Write the page number and the space id */

        mach_write_to_4(page + FIL_PAGE_OFFSET, page_no);
        mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space);

        /* Store the new formula checksum */

        mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_new_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);

        /* We overwrite the first 4 bytes of the end lsn field to store
        the old formula checksum. Since it depends also on the field
        FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
        new formula checksum. */

        mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_old_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);
}
484
 
 
485
 
/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
        buf_block_t*    block)  /* in: buffer block to write */
{
#ifdef UNIV_LOG_DEBUG
        static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */
        ut_a(block->state == BUF_BLOCK_FILE_PAGE);

#ifdef UNIV_IBUF_DEBUG
        ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
        /* A page posted for write must carry a modification lsn. */
        ut_ad(!ut_dulint_is_zero(block->newest_modification));

#ifdef UNIV_LOG_DEBUG
        if (!univ_log_debug_warned) {
                univ_log_debug_warned = TRUE;
                fputs("Warning: cannot force log to disk if"
                      " UNIV_LOG_DEBUG is defined!\n"
                      "Crash recovery will not work!\n",
                      stderr);
        }
#else
        /* Force the log to the disk before writing the modified block
        (write-ahead logging: the redo up to newest_modification must be
        durable before the page image may reach the data file). */
        log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
        /* Stamp lsn, page no, space id and checksums into the frame. */
        buf_flush_init_for_writing(block->frame, block->newest_modification,
                                   block->space, block->offset);
        if (!srv_use_doublewrite_buf || !trx_doublewrite) {
                /* No doublewrite: post the async write directly to the
                page's tablespace location. */
                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
                       (void*)block->frame, (void*)block);
        } else {
                buf_flush_post_to_doublewrite_buf(block);
        }
}
527
 
 
528
 
/************************************************************************
Writes a page asynchronously from the buffer buf_pool to a file, if it can be
found in the buf_pool and it is in a flushable state. NOTE: in simulated aio
we must call os_aio_simulated_wake_handler_threads after we have posted a batch
of writes! */
static
ulint
buf_flush_try_page(
/*===============*/
                                /* out: 1 if a page was flushed, 0 otherwise */
        ulint   space,          /* in: space id */
        ulint   offset,         /* in: page offset */
        ulint   flush_type)     /* in: BUF_FLUSH_LRU, BUF_FLUSH_LIST, or
                                BUF_FLUSH_SINGLE_PAGE */
{
        buf_block_t*    block;
        ibool           locked;

        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
              || flush_type == BUF_FLUSH_SINGLE_PAGE);

        mutex_enter(&(buf_pool->mutex));

        block = buf_page_hash_get(space, offset);

        ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);

        if (!block) {
                mutex_exit(&(buf_pool->mutex));
                return(0);
        }

        mutex_enter(&block->mutex);

        /* The three branches below differ in WHEN the page s-latch is
        taken relative to releasing the mutexes; that ordering is the
        deadlock-avoidance logic, see the comments in each branch. */

        if (flush_type == BUF_FLUSH_LIST
            && buf_flush_ready_for_flush(block, flush_type)) {

                block->io_fix = BUF_IO_WRITE;

                /* If AWE is enabled and the page is not mapped to a frame,
                then map it */

                if (block->frame == NULL) {
                        ut_a(srv_use_awe);

                        /* We set second parameter TRUE because the block is
                        in the LRU list and we must put it to
                        awe_LRU_free_mapped list once mapped to a frame */

                        buf_awe_map_page_to_frame(block, TRUE);
                }

                block->flush_type = flush_type;

                /* First write of a new batch: reset the "no flush in
                progress" event so waiters block until the batch ends. */
                if (buf_pool->n_flush[flush_type] == 0) {

                        os_event_reset(buf_pool->no_flush[flush_type]);
                }

                (buf_pool->n_flush[flush_type])++;

                locked = FALSE;

                /* If the simulated aio thread is not running, we must
                not wait for any latch, as we may end up in a deadlock:
                if buf_fix_count == 0, then we know we need not wait */

                if (block->buf_fix_count == 0) {
                        rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

                        locked = TRUE;
                }

                mutex_exit(&block->mutex);
                mutex_exit(&(buf_pool->mutex));

                if (!locked) {
                        /* Flush pending buffered writes first so the aio
                        threads can release page latches, then it is safe
                        to wait for the s-latch. */
                        buf_flush_buffered_writes();

                        rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
                }

#ifdef UNIV_DEBUG
                if (buf_debug_prints) {
                        fprintf(stderr,
                                "Flushing page space %lu, page no %lu \n",
                                (ulong) block->space, (ulong) block->offset);
                }
#endif /* UNIV_DEBUG */

                buf_flush_write_block_low(block);

                return(1);

        } else if (flush_type == BUF_FLUSH_LRU
                   && buf_flush_ready_for_flush(block, flush_type)) {

                /* VERY IMPORTANT:
                Because any thread may call the LRU flush, even when owning
                locks on pages, to avoid deadlocks, we must make sure that the
                s-lock is acquired on the page without waiting: this is
                accomplished because in the if-condition above we require
                the page not to be bufferfixed (in function
                ..._ready_for_flush). */

                block->io_fix = BUF_IO_WRITE;

                /* If AWE is enabled and the page is not mapped to a frame,
                then map it */

                if (block->frame == NULL) {
                        ut_a(srv_use_awe);

                        /* We set second parameter TRUE because the block is
                        in the LRU list and we must put it to
                        awe_LRU_free_mapped list once mapped to a frame */

                        buf_awe_map_page_to_frame(block, TRUE);
                }

                block->flush_type = flush_type;

                if (buf_pool->n_flush[flush_type] == 0) {

                        os_event_reset(buf_pool->no_flush[flush_type]);
                }

                (buf_pool->n_flush[flush_type])++;

                rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

                /* Note that the s-latch is acquired before releasing the
                buf_pool mutex: this ensures that the latch is acquired
                immediately. */

                mutex_exit(&block->mutex);
                mutex_exit(&(buf_pool->mutex));

                buf_flush_write_block_low(block);

                return(1);

        } else if (flush_type == BUF_FLUSH_SINGLE_PAGE
                   && buf_flush_ready_for_flush(block, flush_type)) {

                block->io_fix = BUF_IO_WRITE;

                /* If AWE is enabled and the page is not mapped to a frame,
                then map it */

                if (block->frame == NULL) {
                        ut_a(srv_use_awe);

                        /* We set second parameter TRUE because the block is
                        in the LRU list and we must put it to
                        awe_LRU_free_mapped list once mapped to a frame */

                        buf_awe_map_page_to_frame(block, TRUE);
                }

                block->flush_type = flush_type;

                if (buf_pool->n_flush[block->flush_type] == 0) {

                        os_event_reset(buf_pool->no_flush[block->flush_type]);
                }

                (buf_pool->n_flush[flush_type])++;

                mutex_exit(&block->mutex);
                mutex_exit(&(buf_pool->mutex));

                /* Here the s-latch is taken only after releasing the
                mutexes; the single-page caller may block on it. */
                rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

#ifdef UNIV_DEBUG
                if (buf_debug_prints) {
                        fprintf(stderr,
                                "Flushing single page space %lu,"
                                " page no %lu \n",
                                (ulong) block->space,
                                (ulong) block->offset);
                }
#endif /* UNIV_DEBUG */

                buf_flush_write_block_low(block);

                return(1);
        }

        /* Page found but not flushable for this flush type. */
        mutex_exit(&block->mutex);
        mutex_exit(&(buf_pool->mutex));

        return(0);
}
722
 
 
723
 
/***************************************************************
Flushes to disk all flushable pages within the flush area around
(space, offset), i.e. the aligned BUF_FLUSH_AREA-sized neighborhood
of the page.  Called while the caller does NOT hold buf_pool->mutex;
this function acquires and releases it internally. */
static
ulint
buf_flush_try_neighbors(
/*====================*/
				/* out: number of pages flushed */
	ulint	space,		/* in: space id */
	ulint	offset,		/* in: page offset */
	ulint	flush_type)	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	buf_block_t*	block;
	ulint		low, high;
	ulint		count		= 0;
	ulint		i;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	/* [low, high) is the BUF_FLUSH_AREA-aligned neighborhood that
	contains 'offset'. */
	low = (offset / BUF_FLUSH_AREA) * BUF_FLUSH_AREA;
	high = (offset / BUF_FLUSH_AREA + 1) * BUF_FLUSH_AREA;

	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
		/* If there is little space, it is better not to flush any
		block except from the end of the LRU list */

		low = offset;
		high = offset + 1;
	}

	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

	/* Do not scan past the end of the tablespace. */
	if (high > fil_space_get_size(space)) {
		high = fil_space_get_size(space);
	}

	mutex_enter(&(buf_pool->mutex));

	for (i = low; i < high; i++) {

		block = buf_page_hash_get(space, i);
		ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);

		if (!block) {

			continue;

		} else if (flush_type == BUF_FLUSH_LRU && i != offset
			   && !block->old) {

			/* We avoid flushing 'non-old' blocks in an LRU flush,
			because the flushed blocks are soon freed */

			continue;
		} else {

			/* Lock order: block->mutex is taken while
			buf_pool->mutex is held; both are released before
			calling buf_flush_try_page below. */
			mutex_enter(&block->mutex);

			if (buf_flush_ready_for_flush(block, flush_type)
			    && (i == offset || block->buf_fix_count == 0)) {
				/* We only try to flush those
				neighbors != offset where the buf fix count is
				zero, as we then know that we probably can
				latch the page without a semaphore wait.
				Semaphore waits are expensive because we must
				flush the doublewrite buffer before we start
				waiting. */

				mutex_exit(&block->mutex);

				mutex_exit(&(buf_pool->mutex));

				/* Note: as we release the buf_pool mutex
				above, in buf_flush_try_page we cannot be sure
				the page is still in a flushable state:
				therefore we check it again inside that
				function. */

				count += buf_flush_try_page(space, i,
							    flush_type);

				mutex_enter(&(buf_pool->mutex));
			} else {
				mutex_exit(&block->mutex);
			}
		}
	}

	mutex_exit(&(buf_pool->mutex));

	return(count);
}
814
 
 
815
 
/***********************************************************************
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages! */

ulint
buf_flush_batch(
/*============*/
				/* out: number of blocks for which the write
				request was queued; ULINT_UNDEFINED if there
				was a flush of the same type already running */
	ulint	flush_type,	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
				BUF_FLUSH_LIST, then the caller must not own
				any latches on pages */
	ulint	min_n,		/* in: wished minimum number of blocks flushed
				(it is not guaranteed that the actual number
				is that big, though) */
	dulint	lsn_limit)	/* in the case BUF_FLUSH_LIST all blocks whose
				oldest_modification is smaller than this
				should be flushed (if their number does not
				exceed min_n), otherwise ignored */
{
	buf_block_t*	block;
	ulint		page_count	= 0;
	ulint		old_page_count;
	ulint		space;
	ulint		offset;
	ibool		found;

	ut_ad((flush_type == BUF_FLUSH_LRU)
	      || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
	ut_ad((flush_type != BUF_FLUSH_LIST)
	      || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
	mutex_enter(&(buf_pool->mutex));

	/* Only one batch of a given type may run at a time; init_flush
	marks "batch being started", n_flush counts writes in flight. */
	if ((buf_pool->n_flush[flush_type] > 0)
	    || (buf_pool->init_flush[flush_type] == TRUE)) {

		/* There is already a flush batch of the same type running */

		mutex_exit(&(buf_pool->mutex));

		return(ULINT_UNDEFINED);
	}

	(buf_pool->init_flush)[flush_type] = TRUE;

	for (;;) {
		/* If we have flushed enough, leave the loop */
		if (page_count >= min_n) {

			break;
		}

		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		if (flush_type == BUF_FLUSH_LRU) {
			block = UT_LIST_GET_LAST(buf_pool->LRU);
		} else {
			ut_ad(flush_type == BUF_FLUSH_LIST);

			/* flush_list is ordered by oldest_modification:
			once the tail is already >= lsn_limit we are done. */
			block = UT_LIST_GET_LAST(buf_pool->flush_list);
			if (!block
			    || (ut_dulint_cmp(block->oldest_modification,
					      lsn_limit) >= 0)) {
				/* We have flushed enough */

				break;
			}
		}

		found = FALSE;

		/* Note that after finding a single flushable page, we try to
		flush also all its neighbors, and after that start from the
		END of the LRU list or flush list again: the list may change
		during the flushing and we cannot safely preserve within this
		function a pointer to a block in the list! */

		while ((block != NULL) && !found) {
			ut_a(block->state == BUF_BLOCK_FILE_PAGE);

			mutex_enter(&block->mutex);

			if (buf_flush_ready_for_flush(block, flush_type)) {

				found = TRUE;
				/* Copy the page id before dropping the
				mutexes: the block may be evicted meanwhile. */
				space = block->space;
				offset = block->offset;

				mutex_exit(&block->mutex);
				mutex_exit(&(buf_pool->mutex));

				old_page_count = page_count;

				/* Try to flush also all the neighbors */
				page_count += buf_flush_try_neighbors(
					space, offset, flush_type);
				/* fprintf(stderr,
				"Flush type %lu, page no %lu, neighb %lu\n",
				flush_type, offset,
				page_count - old_page_count); */

				mutex_enter(&(buf_pool->mutex));

			} else if (flush_type == BUF_FLUSH_LRU) {

				mutex_exit(&block->mutex);

				block = UT_LIST_GET_PREV(LRU, block);
			} else {
				ut_ad(flush_type == BUF_FLUSH_LIST);

				mutex_exit(&block->mutex);

				block = UT_LIST_GET_PREV(flush_list, block);
			}
		}

		/* If we could not find anything to flush, leave the loop */

		if (!found) {
			break;
		}
	}

	(buf_pool->init_flush)[flush_type] = FALSE;

	if ((buf_pool->n_flush[flush_type] == 0)
	    && (buf_pool->init_flush[flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	mutex_exit(&(buf_pool->mutex));

	/* Push the queued writes through the doublewrite buffer to disk. */
	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		ut_a(flush_type == BUF_FLUSH_LRU
		     || flush_type == BUF_FLUSH_LIST);
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += page_count;

	return(page_count);
}
975
 
 
976
 
/**********************************************************************
977
 
Waits until a flush batch of the given type ends */
978
 
 
979
 
void
980
 
buf_flush_wait_batch_end(
981
 
/*=====================*/
982
 
        ulint   type)   /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
983
 
{
984
 
        ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));
985
 
 
986
 
        os_event_wait(buf_pool->no_flush[type]);
987
 
}
988
 
 
989
 
/**********************************************************************
990
 
Gives a recommendation of how many blocks should be flushed to establish
991
 
a big enough margin of replaceable blocks near the end of the LRU list
992
 
and in the free list. */
993
 
static
994
 
ulint
995
 
buf_flush_LRU_recommendation(void)
996
 
/*==============================*/
997
 
                        /* out: number of blocks which should be flushed
998
 
                        from the end of the LRU list */
999
 
{
1000
 
        buf_block_t*    block;
1001
 
        ulint           n_replaceable;
1002
 
        ulint           distance        = 0;
1003
 
 
1004
 
        mutex_enter(&(buf_pool->mutex));
1005
 
 
1006
 
        n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
1007
 
 
1008
 
        block = UT_LIST_GET_LAST(buf_pool->LRU);
1009
 
 
1010
 
        while ((block != NULL)
1011
 
               && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
1012
 
                   + BUF_FLUSH_EXTRA_MARGIN)
1013
 
               && (distance < BUF_LRU_FREE_SEARCH_LEN)) {
1014
 
 
1015
 
                mutex_enter(&block->mutex);
1016
 
 
1017
 
                if (buf_flush_ready_for_replace(block)) {
1018
 
                        n_replaceable++;
1019
 
                }
1020
 
 
1021
 
                mutex_exit(&block->mutex);
1022
 
 
1023
 
                distance++;
1024
 
 
1025
 
                block = UT_LIST_GET_PREV(LRU, block);
1026
 
        }
1027
 
 
1028
 
        mutex_exit(&(buf_pool->mutex));
1029
 
 
1030
 
        if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {
1031
 
 
1032
 
                return(0);
1033
 
        }
1034
 
 
1035
 
        return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
1036
 
               - n_replaceable);
1037
 
}
1038
 
 
1039
 
/*************************************************************************
1040
 
Flushes pages from the end of the LRU list if there is too small a margin
1041
 
of replaceable pages there or in the free list. VERY IMPORTANT: this function
1042
 
is called also by threads which have locks on pages. To avoid deadlocks, we
1043
 
flush only pages such that the s-lock required for flushing can be acquired
1044
 
immediately, without waiting. */
1045
 
 
1046
 
void
1047
 
buf_flush_free_margin(void)
1048
 
/*=======================*/
1049
 
{
1050
 
        ulint   n_to_flush;
1051
 
        ulint   n_flushed;
1052
 
 
1053
 
        n_to_flush = buf_flush_LRU_recommendation();
1054
 
 
1055
 
        if (n_to_flush > 0) {
1056
 
                n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush,
1057
 
                                            ut_dulint_zero);
1058
 
                if (n_flushed == ULINT_UNDEFINED) {
1059
 
                        /* There was an LRU type flush batch already running;
1060
 
                        let us wait for it to end */
1061
 
 
1062
 
                        buf_flush_wait_batch_end(BUF_FLUSH_LRU);
1063
 
                }
1064
 
        }
1065
 
}
1066
 
 
1067
 
/**********************************************************************
1068
 
Validates the flush list. */
1069
 
static
1070
 
ibool
1071
 
buf_flush_validate_low(void)
1072
 
/*========================*/
1073
 
                /* out: TRUE if ok */
1074
 
{
1075
 
        buf_block_t*    block;
1076
 
        dulint          om;
1077
 
 
1078
 
        UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list);
1079
 
 
1080
 
        block = UT_LIST_GET_FIRST(buf_pool->flush_list);
1081
 
 
1082
 
        while (block != NULL) {
1083
 
                om = block->oldest_modification;
1084
 
                ut_a(block->state == BUF_BLOCK_FILE_PAGE);
1085
 
                ut_a(ut_dulint_cmp(om, ut_dulint_zero) > 0);
1086
 
 
1087
 
                block = UT_LIST_GET_NEXT(flush_list, block);
1088
 
 
1089
 
                if (block) {
1090
 
                        ut_a(ut_dulint_cmp(om, block->oldest_modification)
1091
 
                             >= 0);
1092
 
                }
1093
 
        }
1094
 
 
1095
 
        return(TRUE);
1096
 
}
1097
 
 
1098
 
/**********************************************************************
1099
 
Validates the flush list. */
1100
 
 
1101
 
ibool
1102
 
buf_flush_validate(void)
1103
 
/*====================*/
1104
 
                /* out: TRUE if ok */
1105
 
{
1106
 
        ibool   ret;
1107
 
 
1108
 
        mutex_enter(&(buf_pool->mutex));
1109
 
 
1110
 
        ret = buf_flush_validate_low();
1111
 
 
1112
 
        mutex_exit(&(buf_pool->mutex));
1113
 
 
1114
 
        return(ret);
1115
 
}