1
/******************************************************
2
The database buffer read
6
Created 11/5/1995 Heikki Tuuri
7
*******************************************************/
17
#include "ibuf0ibuf.h"
21
#include "srv0start.h"
23
/* The size in blocks of the area where the random read-ahead algorithm counts
24
the accessed pages when deciding whether to read-ahead */
25
/* Alias of BUF_READ_AHEAD_AREA, which is defined outside the code shown
here. buf_read_ahead_random rounds the requested page number down and up to
multiples of this value to obtain the [low, high) bounds of the area whose
pages it examines. */
#define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA
27
/* There must be at least this many pages in buf_pool in the area to start
28
a random read-ahead */
29
/* Integer arithmetic: 5 plus one eighth (rounded down) of the random
read-ahead area size. buf_read_ahead_random compares its count of recently
accessed blocks in the area (recent_blocks) against this value. */
#define BUF_READ_AHEAD_RANDOM_THRESHOLD (5 + BUF_READ_AHEAD_RANDOM_AREA / 8)
31
/* The linear read-ahead area size */
32
/* Alias of BUF_READ_AHEAD_AREA, which is defined outside the code shown
here. buf_read_ahead_linear rounds page numbers down and up to multiples of
this value to obtain the [low, high) bounds of a linear read-ahead area, and
only proceeds when the page is the first or the last page of such an area. */
#define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA
34
/* The linear read-ahead threshold */
35
/* Evaluated left to right in integer arithmetic, i.e. (3 * area) / 8 rounded
down. buf_read_ahead_linear gives up when its fail_count exceeds
BUF_READ_AHEAD_LINEAR_AREA - BUF_READ_AHEAD_LINEAR_THRESHOLD. */
#define BUF_READ_AHEAD_LINEAR_THRESHOLD (3 * BUF_READ_AHEAD_LINEAR_AREA / 8)
37
/* If more than buf_pool->curr_size divided by the number below reads are pending, then
38
read-ahead is not done: this is to prevent flooding the buffer pool with
39
i/o-fixed buffer blocks */
40
/* Used as a divisor: the read-ahead functions compare
buf_pool->n_pend_reads against
buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT, and
buf_read_ibuf_merge_pages sleeps in a loop while the number of pending reads
is above that same bound. */
#define BUF_READ_AHEAD_PEND_LIMIT 2
42
/************************************************************************
43
Low-level function which reads a page asynchronously from a file to the
44
buffer buf_pool if it is not already there, in which case does nothing.
45
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
46
flag is cleared and the x-lock released by an i/o-handler thread. */
51
/* out: 1 if a read request was queued, 0 if the page
52
already resided in buf_pool, or if the page is in
53
the doublewrite buffer blocks in which case it is never
54
read into the pool, or if the tablespace does not
55
exist or is being dropped */
56
ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
57
trying to read from a non-existent tablespace, or a
58
tablespace which is just now being dropped */
59
ibool sync, /* in: TRUE if synchronous aio is desired */
60
ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ...,
61
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
62
at read-ahead functions) */
63
ulint space, /* in: space id */
64
ib_longlong tablespace_version, /* in: if the space memory object has
65
this timestamp different from what we are giving here,
66
treat the tablespace as dropped; this is a timestamp we
67
use to stop dangling page reads from a tablespace
68
which we have DISCARDed + IMPORTed back */
69
ulint offset) /* in: page number */
76
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
77
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
79
if (trx_doublewrite && space == TRX_SYS_SPACE
80
&& ( (offset >= trx_doublewrite->block1
81
&& offset < trx_doublewrite->block1
82
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
83
|| (offset >= trx_doublewrite->block2
84
&& offset < trx_doublewrite->block2
85
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
86
ut_print_timestamp(stderr);
88
" InnoDB: Warning: trying to read"
89
" doublewrite buffer page %lu\n",
95
if (ibuf_bitmap_page(offset) || trx_sys_hdr_page(space, offset)) {
97
/* Trx sys header is so low in the latching order that we play
98
safe and do not leave the i/o-completion to an asynchronous
99
i/o-thread. Ibuf bitmap pages must always be read with
100
synchronous i/o, to make sure they do not get involved in
106
/* The following call will also check if the tablespace does not exist
107
or is being dropped; if we succeed in initing the page in the buffer
108
pool for read, then DISCARD cannot proceed until the read has
110
block = buf_page_init_for_read(err, mode, space, tablespace_version,
118
if (buf_debug_prints) {
120
"Posting read request for page %lu, sync %lu\n",
126
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
128
*err = fil_io(OS_FILE_READ | wake_later,
130
offset, 0, UNIV_PAGE_SIZE,
131
(void*)block->frame, (void*)block);
132
ut_a(*err == DB_SUCCESS);
135
/* The i/o is already completed when we arrive from
137
buf_page_io_complete(block);
143
/************************************************************************
144
Applies a random read-ahead in buf_pool if there are at least a threshold
145
value of accessed pages from the random read-ahead area. Does not read any
146
page, not even the one at the position (space, offset), if the read-ahead
147
mechanism is not activated. NOTE 1: the calling thread may own latches on
148
pages: to avoid deadlocks this function must be written such that it cannot
149
end up waiting for these latches! NOTE 2: the calling thread must want
150
access to the page given: this rule is set to prevent unintended read-aheads
151
performed by ibuf routines, a situation which could result in a deadlock if
152
the OS does not support asynchronous i/o. */
155
buf_read_ahead_random(
156
/*==================*/
157
/* out: number of page read requests issued; NOTE
158
that if we read ibuf pages, it may happen that
159
the page at the given page number does not get
160
read even if we return a value > 0! */
161
ulint space, /* in: space id */
162
ulint offset) /* in: page number of a page which the current thread
165
ib_longlong tablespace_version;
167
ulint recent_blocks = 0;
169
ulint LRU_recent_limit;
175
if (srv_startup_is_before_trx_rollback_phase) {
176
/* No read-ahead to avoid thread deadlocks */
180
if (ibuf_bitmap_page(offset) || trx_sys_hdr_page(space, offset)) {
182
/* If it is an ibuf bitmap page or trx sys hdr, we do
183
no read-ahead, as that could break the ibuf page access
189
/* Remember the tablespace version before we ask the tablespace size
190
below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
191
do not try to read outside the bounds of the tablespace! */
193
tablespace_version = fil_space_get_version(space);
195
low = (offset / BUF_READ_AHEAD_RANDOM_AREA)
196
* BUF_READ_AHEAD_RANDOM_AREA;
197
high = (offset / BUF_READ_AHEAD_RANDOM_AREA + 1)
198
* BUF_READ_AHEAD_RANDOM_AREA;
199
if (high > fil_space_get_size(space)) {
201
high = fil_space_get_size(space);
204
/* Get the minimum LRU_position field value for an initial segment
205
of the LRU list, to determine which blocks have recently been added
206
to the start of the list. */
208
LRU_recent_limit = buf_LRU_get_recent_limit();
210
mutex_enter(&(buf_pool->mutex));
212
if (buf_pool->n_pend_reads
213
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
214
mutex_exit(&(buf_pool->mutex));
219
/* Count how many blocks in the area have been recently accessed,
220
that is, reside near the start of the LRU list. */
222
for (i = low; i < high; i++) {
223
block = buf_page_hash_get(space, i);
226
&& (block->LRU_position > LRU_recent_limit)
227
&& block->accessed) {
233
mutex_exit(&(buf_pool->mutex));
235
if (recent_blocks < BUF_READ_AHEAD_RANDOM_THRESHOLD) {
241
/* Read all the suitable blocks within the area */
244
ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
246
ibuf_mode = BUF_READ_ANY_PAGE;
251
for (i = low; i < high; i++) {
252
/* It is only sensible to do read-ahead in the non-sync aio
253
mode: hence FALSE as the sync parameter */
255
if (!ibuf_bitmap_page(i)) {
256
count += buf_read_page_low(
258
ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
259
space, tablespace_version, i);
260
if (err == DB_TABLESPACE_DELETED) {
261
ut_print_timestamp(stderr);
263
" InnoDB: Warning: in random"
264
" readahead trying to access\n"
265
"InnoDB: tablespace %lu page %lu,\n"
266
"InnoDB: but the tablespace does not"
267
" exist or is just being dropped.\n",
268
(ulong) space, (ulong) i);
273
/* In simulated aio we wake the aio handler threads only after
274
queuing all aio requests, in native aio the following call does
277
os_aio_simulated_wake_handler_threads();
280
if (buf_debug_prints && (count > 0)) {
282
"Random read-ahead space %lu offset %lu pages %lu\n",
283
(ulong) space, (ulong) offset,
286
#endif /* UNIV_DEBUG */
288
++srv_read_ahead_rnd;
292
/************************************************************************
293
High-level function which reads a page asynchronously from a file to the
294
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
295
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
296
released by the i/o-handler thread. Does a random read-ahead if it seems
302
/* out: number of page read requests issued: this can
303
be > 1 if read-ahead occurred */
304
ulint space, /* in: space id */
305
ulint offset) /* in: page number */
307
ib_longlong tablespace_version;
312
tablespace_version = fil_space_get_version(space);
314
count = buf_read_ahead_random(space, offset);
316
/* We do the i/o in the synchronous aio mode to save thread
317
switches: hence TRUE */
319
count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
320
tablespace_version, offset);
321
srv_buf_pool_reads+= count2;
322
if (err == DB_TABLESPACE_DELETED) {
323
ut_print_timestamp(stderr);
325
" InnoDB: Error: trying to access"
326
" tablespace %lu page no. %lu,\n"
327
"InnoDB: but the tablespace does not exist"
328
" or is just being dropped.\n",
329
(ulong) space, (ulong) offset);
332
/* Flush pages from the end of the LRU list if necessary */
333
buf_flush_free_margin();
335
return(count + count2);
338
/************************************************************************
339
Applies linear read-ahead if in the buf_pool the page is a border page of
340
a linear read-ahead area and all the pages in the area have been accessed.
341
Does not read any page if the read-ahead mechanism is not activated. Note
342
that the algorithm looks at the 'natural' adjacent successor and
343
predecessor of the page, which on the leaf level of a B-tree are the next
344
and previous page in the chain of leaves. To know these, the page specified
345
in (space, offset) must already be present in the buf_pool. Thus, the
346
natural way to use this function is to call it when a page in the buf_pool
347
is accessed the first time, calling this function just after it has been
349
NOTE 1: as this function looks at the natural predecessor and successor
350
fields on the page, what happens, if these are not initialized to any
351
sensible value? No problem, before applying read-ahead we check that the
352
area to read is within the span of the space, if not, read-ahead is not
353
applied. An uninitialized value may result in a useless read operation, but
354
only very improbably.
355
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
356
function must be written such that it cannot end up waiting for these
358
NOTE 3: the calling thread must want access to the page given: this rule is
359
set to prevent unintended read-aheads performed by ibuf routines, a situation
360
which could result in a deadlock if the OS does not support asynchronous io. */
363
buf_read_ahead_linear(
364
/*==================*/
365
/* out: number of page read requests issued */
366
ulint space, /* in: space id */
367
ulint offset) /* in: page number of a page; NOTE: the current thread
368
must want access to this page (see NOTE 3 above) */
370
ib_longlong tablespace_version;
373
buf_block_t* pred_block = NULL;
385
if (srv_startup_is_before_trx_rollback_phase) {
386
/* No read-ahead to avoid thread deadlocks */
390
if (ibuf_bitmap_page(offset) || trx_sys_hdr_page(space, offset)) {
392
/* If it is an ibuf bitmap page or trx sys hdr, we do
393
no read-ahead, as that could break the ibuf page access
399
low = (offset / BUF_READ_AHEAD_LINEAR_AREA)
400
* BUF_READ_AHEAD_LINEAR_AREA;
401
high = (offset / BUF_READ_AHEAD_LINEAR_AREA + 1)
402
* BUF_READ_AHEAD_LINEAR_AREA;
404
if ((offset != low) && (offset != high - 1)) {
405
/* This is not a border page of the area: return */
410
/* Remember the tablespace version before we ask the tablespace size
411
below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
412
do not try to read outside the bounds of the tablespace! */
414
tablespace_version = fil_space_get_version(space);
416
mutex_enter(&(buf_pool->mutex));
418
if (high > fil_space_get_size(space)) {
419
mutex_exit(&(buf_pool->mutex));
420
/* The area is not whole, return */
425
if (buf_pool->n_pend_reads
426
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
427
mutex_exit(&(buf_pool->mutex));
432
/* Check that almost all pages in the area have been accessed; if
433
offset == low, the accesses must be in a descending order, otherwise,
434
in an ascending order. */
444
for (i = low; i < high; i++) {
445
block = buf_page_hash_get(space, i);
447
if ((block == NULL) || !block->accessed) {
451
} else if (pred_block
452
&& (ut_ulint_cmp(block->LRU_position,
453
pred_block->LRU_position)
455
/* Accesses not in the right order */
462
if (fail_count > BUF_READ_AHEAD_LINEAR_AREA
463
- BUF_READ_AHEAD_LINEAR_THRESHOLD) {
464
/* Too many failures: return */
466
mutex_exit(&(buf_pool->mutex));
471
/* If we got this far, we know that enough pages in the area have
472
been accessed in the right order: linear read-ahead can be sensible */
474
block = buf_page_hash_get(space, offset);
477
mutex_exit(&(buf_pool->mutex));
482
frame = block->frame;
484
/* Read the natural predecessor and successor page addresses from
485
the page; NOTE that because the calling thread may have an x-latch
486
on the page, we do not acquire an s-latch on the page, this is to
487
prevent deadlocks. Even if we read values which are nonsense, the
488
algorithm will work. */
490
pred_offset = fil_page_get_prev(frame);
491
succ_offset = fil_page_get_next(frame);
493
mutex_exit(&(buf_pool->mutex));
495
if ((offset == low) && (succ_offset == offset + 1)) {
497
/* This is ok, we can continue */
498
new_offset = pred_offset;
500
} else if ((offset == high - 1) && (pred_offset == offset - 1)) {
502
/* This is ok, we can continue */
503
new_offset = succ_offset;
505
/* Successor or predecessor not in the right order */
510
low = (new_offset / BUF_READ_AHEAD_LINEAR_AREA)
511
* BUF_READ_AHEAD_LINEAR_AREA;
512
high = (new_offset / BUF_READ_AHEAD_LINEAR_AREA + 1)
513
* BUF_READ_AHEAD_LINEAR_AREA;
515
if ((new_offset != low) && (new_offset != high - 1)) {
516
/* This is not a border page of the area: return */
521
if (high > fil_space_get_size(space)) {
522
/* The area is not whole, return */
527
/* If we got this far, read-ahead can be sensible: do it */
530
ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
532
ibuf_mode = BUF_READ_ANY_PAGE;
537
/* Since Windows XP seems to schedule the i/o handler thread
538
very eagerly, and consequently it does not wait for the
539
full read batch to be posted, we use special heuristics here */
541
os_aio_simulated_put_read_threads_to_sleep();
543
for (i = low; i < high; i++) {
544
/* It is only sensible to do read-ahead in the non-sync
545
aio mode: hence FALSE as the sync parameter */
547
if (!ibuf_bitmap_page(i)) {
548
count += buf_read_page_low(
550
ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
551
space, tablespace_version, i);
552
if (err == DB_TABLESPACE_DELETED) {
553
ut_print_timestamp(stderr);
555
" InnoDB: Warning: in"
556
" linear readahead trying to access\n"
557
"InnoDB: tablespace %lu page %lu,\n"
558
"InnoDB: but the tablespace does not"
559
" exist or is just being dropped.\n",
560
(ulong) space, (ulong) i);
565
/* In simulated aio we wake the aio handler threads only after
566
queuing all aio requests, in native aio the following call does
569
os_aio_simulated_wake_handler_threads();
571
/* Flush pages from the end of the LRU list if necessary */
572
buf_flush_free_margin();
575
if (buf_debug_prints && (count > 0)) {
577
"LINEAR read-ahead space %lu offset %lu pages %lu\n",
578
(ulong) space, (ulong) offset, (ulong) count);
580
#endif /* UNIV_DEBUG */
582
++srv_read_ahead_seq;
586
/************************************************************************
587
Issues read requests for pages which the ibuf module wants to read in, in
588
order to contract the insert buffer tree. Technically, this function is like
589
a read-ahead function. */
592
buf_read_ibuf_merge_pages(
593
/*======================*/
594
ibool sync, /* in: TRUE if the caller wants this function
595
to wait for the highest address page to get
596
read in, before this function returns */
597
ulint* space_ids, /* in: array of space ids */
598
ib_longlong* space_versions,/* in: the spaces must have this version
599
number (timestamp), otherwise we discard the
600
read; we use this to cancel reads if
601
DISCARD + IMPORT may have changed the
603
ulint* page_nos, /* in: array of page numbers to read, with the
604
highest page number the last in the array */
605
ulint n_stored) /* in: number of page numbers in the array */
610
ut_ad(!ibuf_inside());
611
#ifdef UNIV_IBUF_DEBUG
612
ut_a(n_stored < UNIV_PAGE_SIZE);
614
while (buf_pool->n_pend_reads
615
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
616
os_thread_sleep(500000);
619
for (i = 0; i < n_stored; i++) {
620
buf_read_page_low(&err,
621
(i + 1 == n_stored) && sync,
623
space_ids[i], space_versions[i],
626
if (err == DB_TABLESPACE_DELETED) {
627
/* We have deleted or are deleting the single-table
628
tablespace: remove the entries for that page */
630
ibuf_merge_or_delete_for_page(NULL, space_ids[i],
635
os_aio_simulated_wake_handler_threads();
637
/* Flush pages from the end of the LRU list if necessary */
638
buf_flush_free_margin();
641
if (buf_debug_prints) {
643
"Ibuf merge read-ahead space %lu pages %lu\n",
644
(ulong) space_ids[0], (ulong) n_stored);
646
#endif /* UNIV_DEBUG */
649
/************************************************************************
650
Issues read requests for pages which recovery wants to read in. */
655
ibool sync, /* in: TRUE if the caller wants this function
656
to wait for the highest address page to get
657
read in, before this function returns */
658
ulint space, /* in: space id */
659
ulint* page_nos, /* in: array of page numbers to read, with the
660
highest page number the last in the array */
661
ulint n_stored) /* in: number of page numbers in the array */
663
ib_longlong tablespace_version;
668
tablespace_version = fil_space_get_version(space);
670
for (i = 0; i < n_stored; i++) {
674
os_aio_print_debug = FALSE;
676
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
678
os_aio_simulated_wake_handler_threads();
679
os_thread_sleep(500000);
685
"InnoDB: Error: InnoDB has waited for"
686
" 50 seconds for pending\n"
687
"InnoDB: reads to the buffer pool to"
689
"InnoDB: Number of pending reads %lu,"
690
" pending pread calls %lu\n",
691
(ulong) buf_pool->n_pend_reads,
692
(ulong)os_file_n_pending_preads);
694
os_aio_print_debug = TRUE;
698
os_aio_print_debug = FALSE;
700
if ((i + 1 == n_stored) && sync) {
701
buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE,
702
space, tablespace_version,
705
buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
706
| OS_AIO_SIMULATED_WAKE_LATER,
707
space, tablespace_version,
712
os_aio_simulated_wake_handler_threads();
714
/* Flush pages from the end of the LRU list if necessary */
715
buf_flush_free_margin();
718
if (buf_debug_prints) {
720
"Recovery applies read-ahead pages %lu\n",
723
#endif /* UNIV_DEBUG */