1
/******************************************************
2
The database buffer read
6
Created 11/5/1995 Heikki Tuuri
7
*******************************************************/
17
#include "ibuf0ibuf.h"
21
#include "srv0start.h"
23
extern ulint srv_read_ahead_rnd;
24
extern ulint srv_read_ahead_seq;
25
extern ulint srv_buf_pool_reads;
27
/* The size in blocks of the area where the random read-ahead algorithm counts
28
the accessed pages when deciding whether to read-ahead */
29
#define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA
31
/* There must be at least this many pages in buf_pool in the area to start
32
a random read-ahead */
33
#define BUF_READ_AHEAD_RANDOM_THRESHOLD (5 + BUF_READ_AHEAD_RANDOM_AREA / 8)
35
/* The linear read-ahead area size */
36
#define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA
38
/* The linear read-ahead threshold */
39
#define BUF_READ_AHEAD_LINEAR_THRESHOLD (3 * BUF_READ_AHEAD_LINEAR_AREA / 8)
41
/* If more than buf_pool->curr_size / (the number below) reads are pending, then
42
read-ahead is not done: this is to prevent flooding the buffer pool with
43
i/o-fixed buffer blocks */
44
#define BUF_READ_AHEAD_PEND_LIMIT 2
46
/************************************************************************
47
Low-level function which reads a page asynchronously from a file to the
48
buffer buf_pool if it is not already there, in which case does nothing.
49
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
50
flag is cleared and the x-lock released by an i/o-handler thread. */
55
/* out: 1 if a read request was queued, 0 if the page
56
already resided in buf_pool, or if the page is in
57
the doublewrite buffer blocks in which case it is never
58
read into the pool, or if the tablespace does not
59
exist or is being dropped */
60
ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
61
trying to read from a non-existent tablespace, or a
62
tablespace which is just now being dropped */
63
ibool sync, /* in: TRUE if synchronous aio is desired */
64
ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ...,
65
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
66
at read-ahead functions) */
67
ulint space, /* in: space id */
68
ib_longlong tablespace_version, /* in: if the space memory object has
69
this timestamp different from what we are giving here,
70
treat the tablespace as dropped; this is a timestamp we
71
use to stop dangling page reads from a tablespace
72
which we have DISCARDed + IMPORTed back */
73
ulint offset) /* in: page number */
80
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
81
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
83
if (trx_doublewrite && space == TRX_SYS_SPACE
84
&& ( (offset >= trx_doublewrite->block1
85
&& offset < trx_doublewrite->block1
86
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
87
|| (offset >= trx_doublewrite->block2
88
&& offset < trx_doublewrite->block2
89
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
90
ut_print_timestamp(stderr);
92
" InnoDB: Warning: trying to read"
93
" doublewrite buffer page %lu\n",
99
if (ibuf_bitmap_page(offset) || trx_sys_hdr_page(space, offset)) {
101
/* Trx sys header is so low in the latching order that we play
102
safe and do not leave the i/o-completion to an asynchronous
103
i/o-thread. Ibuf bitmap pages must always be read with
104
synchronous i/o, to make sure they do not get involved in
110
/* The following call will also check if the tablespace does not exist
111
or is being dropped; if we succeed in initing the page in the buffer
112
pool for read, then DISCARD cannot proceed until the read has
114
block = buf_page_init_for_read(err, mode, space, tablespace_version,
122
if (buf_debug_prints) {
124
"Posting read request for page %lu, sync %lu\n",
130
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
132
*err = fil_io(OS_FILE_READ | wake_later,
134
offset, 0, UNIV_PAGE_SIZE,
135
(void*)block->frame, (void*)block);
136
ut_a(*err == DB_SUCCESS);
139
/* The i/o is already completed when we arrive from
141
buf_page_io_complete(block);
147
/************************************************************************
148
Applies a random read-ahead in buf_pool if there are at least a threshold
149
value of accessed pages from the random read-ahead area. Does not read any
150
page, not even the one at the position (space, offset), if the read-ahead
151
mechanism is not activated. NOTE 1: the calling thread may own latches on
152
pages: to avoid deadlocks this function must be written such that it cannot
153
end up waiting for these latches! NOTE 2: the calling thread must want
154
access to the page given: this rule is set to prevent unintended read-aheads
155
performed by ibuf routines, a situation which could result in a deadlock if
156
the OS does not support asynchronous i/o. */
159
buf_read_ahead_random(
160
/*==================*/
161
/* out: number of page read requests issued; NOTE
162
that if we read ibuf pages, it may happen that
163
the page at the given page number does not get
164
read even if we return a value > 0! */
165
ulint space, /* in: space id */
166
ulint offset) /* in: page number of a page which the current thread
169
ib_longlong tablespace_version;
171
ulint recent_blocks = 0;
173
ulint LRU_recent_limit;
179
if (srv_startup_is_before_trx_rollback_phase) {
180
/* No read-ahead to avoid thread deadlocks */
184
if (ibuf_bitmap_page(offset) || trx_sys_hdr_page(space, offset)) {
186
/* If it is an ibuf bitmap page or trx sys hdr, we do
187
no read-ahead, as that could break the ibuf page access
193
/* Remember the tablespace version before we ask the tablespace size
194
below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
195
do not try to read outside the bounds of the tablespace! */
197
tablespace_version = fil_space_get_version(space);
199
low = (offset / BUF_READ_AHEAD_RANDOM_AREA)
200
* BUF_READ_AHEAD_RANDOM_AREA;
201
high = (offset / BUF_READ_AHEAD_RANDOM_AREA + 1)
202
* BUF_READ_AHEAD_RANDOM_AREA;
203
if (high > fil_space_get_size(space)) {
205
high = fil_space_get_size(space);
208
/* Get the minimum LRU_position field value for an initial segment
209
of the LRU list, to determine which blocks have recently been added
210
to the start of the list. */
212
LRU_recent_limit = buf_LRU_get_recent_limit();
214
mutex_enter(&(buf_pool->mutex));
216
if (buf_pool->n_pend_reads
217
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
218
mutex_exit(&(buf_pool->mutex));
223
/* Count how many blocks in the area have been recently accessed,
224
that is, reside near the start of the LRU list. */
226
for (i = low; i < high; i++) {
227
block = buf_page_hash_get(space, i);
230
&& (block->LRU_position > LRU_recent_limit)
231
&& block->accessed) {
237
mutex_exit(&(buf_pool->mutex));
239
if (recent_blocks < BUF_READ_AHEAD_RANDOM_THRESHOLD) {
245
/* Read all the suitable blocks within the area */
248
ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
250
ibuf_mode = BUF_READ_ANY_PAGE;
255
for (i = low; i < high; i++) {
256
/* It is only sensible to do read-ahead in the non-sync aio
257
mode: hence FALSE as the sync parameter */
259
if (!ibuf_bitmap_page(i)) {
260
count += buf_read_page_low(
262
ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
263
space, tablespace_version, i);
264
if (err == DB_TABLESPACE_DELETED) {
265
ut_print_timestamp(stderr);
267
" InnoDB: Warning: in random"
268
" readahead trying to access\n"
269
"InnoDB: tablespace %lu page %lu,\n"
270
"InnoDB: but the tablespace does not"
271
" exist or is just being dropped.\n",
272
(ulong) space, (ulong) i);
277
/* In simulated aio we wake the aio handler threads only after
278
queuing all aio requests, in native aio the following call does
281
os_aio_simulated_wake_handler_threads();
284
if (buf_debug_prints && (count > 0)) {
286
"Random read-ahead space %lu offset %lu pages %lu\n",
287
(ulong) space, (ulong) offset,
290
#endif /* UNIV_DEBUG */
292
++srv_read_ahead_rnd;
296
/************************************************************************
297
High-level function which reads a page asynchronously from a file to the
298
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
299
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
300
released by the i/o-handler thread. Does a random read-ahead if it seems
306
/* out: number of page read requests issued: this can
307
be > 1 if read-ahead occurred */
308
ulint space, /* in: space id */
309
ulint offset) /* in: page number */
311
ib_longlong tablespace_version;
316
tablespace_version = fil_space_get_version(space);
318
count = buf_read_ahead_random(space, offset);
320
/* We do the i/o in the synchronous aio mode to save thread
321
switches: hence TRUE */
323
count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
324
tablespace_version, offset);
325
srv_buf_pool_reads+= count2;
326
if (err == DB_TABLESPACE_DELETED) {
327
ut_print_timestamp(stderr);
329
" InnoDB: Error: trying to access"
330
" tablespace %lu page no. %lu,\n"
331
"InnoDB: but the tablespace does not exist"
332
" or is just being dropped.\n",
333
(ulong) space, (ulong) offset);
336
/* Flush pages from the end of the LRU list if necessary */
337
buf_flush_free_margin();
339
return(count + count2);
342
/************************************************************************
343
Applies linear read-ahead if in the buf_pool the page is a border page of
344
a linear read-ahead area and all the pages in the area have been accessed.
345
Does not read any page if the read-ahead mechanism is not activated. Note
346
that the algorithm looks at the 'natural' adjacent successor and
347
predecessor of the page, which on the leaf level of a B-tree are the next
348
and previous page in the chain of leaves. To know these, the page specified
349
in (space, offset) must already be present in the buf_pool. Thus, the
350
natural way to use this function is to call it when a page in the buf_pool
351
is accessed the first time, calling this function just after it has been
353
NOTE 1: as this function looks at the natural predecessor and successor
354
fields on the page, what happens, if these are not initialized to any
355
sensible value? No problem, before applying read-ahead we check that the
356
area to read is within the span of the space, if not, read-ahead is not
357
applied. An uninitialized value may result in a useless read operation, but
358
only very improbably.
359
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
360
function must be written such that it cannot end up waiting for these
362
NOTE 3: the calling thread must want access to the page given: this rule is
363
set to prevent unintended read-aheads performed by ibuf routines, a situation
364
which could result in a deadlock if the OS does not support asynchronous io. */
367
buf_read_ahead_linear(
368
/*==================*/
369
/* out: number of page read requests issued */
370
ulint space, /* in: space id */
371
ulint offset) /* in: page number of a page; NOTE: the current thread
372
must want access to this page (see NOTE 3 above) */
374
ib_longlong tablespace_version;
377
buf_block_t* pred_block = NULL;
389
if (srv_startup_is_before_trx_rollback_phase) {
390
/* No read-ahead to avoid thread deadlocks */
394
if (ibuf_bitmap_page(offset) || trx_sys_hdr_page(space, offset)) {
396
/* If it is an ibuf bitmap page or trx sys hdr, we do
397
no read-ahead, as that could break the ibuf page access
403
low = (offset / BUF_READ_AHEAD_LINEAR_AREA)
404
* BUF_READ_AHEAD_LINEAR_AREA;
405
high = (offset / BUF_READ_AHEAD_LINEAR_AREA + 1)
406
* BUF_READ_AHEAD_LINEAR_AREA;
408
if ((offset != low) && (offset != high - 1)) {
409
/* This is not a border page of the area: return */
414
/* Remember the tablespace version before we ask the tablespace size
415
below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
416
do not try to read outside the bounds of the tablespace! */
418
tablespace_version = fil_space_get_version(space);
420
mutex_enter(&(buf_pool->mutex));
422
if (high > fil_space_get_size(space)) {
423
mutex_exit(&(buf_pool->mutex));
424
/* The area is not whole, return */
429
if (buf_pool->n_pend_reads
430
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
431
mutex_exit(&(buf_pool->mutex));
436
/* Check that almost all pages in the area have been accessed; if
437
offset == low, the accesses must be in a descending order, otherwise,
438
in an ascending order. */
448
for (i = low; i < high; i++) {
449
block = buf_page_hash_get(space, i);
451
if ((block == NULL) || !block->accessed) {
455
} else if (pred_block
456
&& (ut_ulint_cmp(block->LRU_position,
457
pred_block->LRU_position)
459
/* Accesses not in the right order */
466
if (fail_count > BUF_READ_AHEAD_LINEAR_AREA
467
- BUF_READ_AHEAD_LINEAR_THRESHOLD) {
468
/* Too many failures: return */
470
mutex_exit(&(buf_pool->mutex));
475
/* If we got this far, we know that enough pages in the area have
476
been accessed in the right order: linear read-ahead can be sensible */
478
block = buf_page_hash_get(space, offset);
481
mutex_exit(&(buf_pool->mutex));
486
frame = block->frame;
488
/* Read the natural predecessor and successor page addresses from
489
the page; NOTE that because the calling thread may have an x-latch
490
on the page, we do not acquire an s-latch on the page, this is to
491
prevent deadlocks. Even if we read values which are nonsense, the
492
algorithm will work. */
494
pred_offset = fil_page_get_prev(frame);
495
succ_offset = fil_page_get_next(frame);
497
mutex_exit(&(buf_pool->mutex));
499
if ((offset == low) && (succ_offset == offset + 1)) {
501
/* This is ok, we can continue */
502
new_offset = pred_offset;
504
} else if ((offset == high - 1) && (pred_offset == offset - 1)) {
506
/* This is ok, we can continue */
507
new_offset = succ_offset;
509
/* Successor or predecessor not in the right order */
514
low = (new_offset / BUF_READ_AHEAD_LINEAR_AREA)
515
* BUF_READ_AHEAD_LINEAR_AREA;
516
high = (new_offset / BUF_READ_AHEAD_LINEAR_AREA + 1)
517
* BUF_READ_AHEAD_LINEAR_AREA;
519
if ((new_offset != low) && (new_offset != high - 1)) {
520
/* This is not a border page of the area: return */
525
if (high > fil_space_get_size(space)) {
526
/* The area is not whole, return */
531
/* If we got this far, read-ahead can be sensible: do it */
534
ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
536
ibuf_mode = BUF_READ_ANY_PAGE;
541
/* Since Windows XP seems to schedule the i/o handler thread
542
very eagerly, and consequently it does not wait for the
543
full read batch to be posted, we use special heuristics here */
545
os_aio_simulated_put_read_threads_to_sleep();
547
for (i = low; i < high; i++) {
548
/* It is only sensible to do read-ahead in the non-sync
549
aio mode: hence FALSE as the sync parameter */
551
if (!ibuf_bitmap_page(i)) {
552
count += buf_read_page_low(
554
ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
555
space, tablespace_version, i);
556
if (err == DB_TABLESPACE_DELETED) {
557
ut_print_timestamp(stderr);
559
" InnoDB: Warning: in"
560
" linear readahead trying to access\n"
561
"InnoDB: tablespace %lu page %lu,\n"
562
"InnoDB: but the tablespace does not"
563
" exist or is just being dropped.\n",
564
(ulong) space, (ulong) i);
569
/* In simulated aio we wake the aio handler threads only after
570
queuing all aio requests, in native aio the following call does
573
os_aio_simulated_wake_handler_threads();
575
/* Flush pages from the end of the LRU list if necessary */
576
buf_flush_free_margin();
579
if (buf_debug_prints && (count > 0)) {
581
"LINEAR read-ahead space %lu offset %lu pages %lu\n",
582
(ulong) space, (ulong) offset, (ulong) count);
584
#endif /* UNIV_DEBUG */
586
++srv_read_ahead_seq;
590
/************************************************************************
591
Issues read requests for pages which the ibuf module wants to read in, in
592
order to contract the insert buffer tree. Technically, this function is like
593
a read-ahead function. */
596
buf_read_ibuf_merge_pages(
597
/*======================*/
598
ibool sync, /* in: TRUE if the caller wants this function
599
to wait for the highest address page to get
600
read in, before this function returns */
601
ulint* space_ids, /* in: array of space ids */
602
ib_longlong* space_versions,/* in: the spaces must have this version
603
number (timestamp), otherwise we discard the
604
read; we use this to cancel reads if
605
DISCARD + IMPORT may have changed the
607
ulint* page_nos, /* in: array of page numbers to read, with the
608
highest page number the last in the array */
609
ulint n_stored) /* in: number of page numbers in the array */
614
ut_ad(!ibuf_inside());
615
#ifdef UNIV_IBUF_DEBUG
616
ut_a(n_stored < UNIV_PAGE_SIZE);
618
while (buf_pool->n_pend_reads
619
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
620
os_thread_sleep(500000);
623
for (i = 0; i < n_stored; i++) {
624
buf_read_page_low(&err,
625
(i + 1 == n_stored) && sync,
627
space_ids[i], space_versions[i],
630
if (err == DB_TABLESPACE_DELETED) {
631
/* We have deleted or are deleting the single-table
632
tablespace: remove the entries for that page */
634
ibuf_merge_or_delete_for_page(NULL, space_ids[i],
639
os_aio_simulated_wake_handler_threads();
641
/* Flush pages from the end of the LRU list if necessary */
642
buf_flush_free_margin();
645
if (buf_debug_prints) {
647
"Ibuf merge read-ahead space %lu pages %lu\n",
648
(ulong) space_ids[0], (ulong) n_stored);
650
#endif /* UNIV_DEBUG */
653
/************************************************************************
654
Issues read requests for pages which recovery wants to read in. */
659
ibool sync, /* in: TRUE if the caller wants this function
660
to wait for the highest address page to get
661
read in, before this function returns */
662
ulint space, /* in: space id */
663
ulint* page_nos, /* in: array of page numbers to read, with the
664
highest page number the last in the array */
665
ulint n_stored) /* in: number of page numbers in the array */
667
ib_longlong tablespace_version;
672
tablespace_version = fil_space_get_version(space);
674
for (i = 0; i < n_stored; i++) {
678
os_aio_print_debug = FALSE;
680
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
682
os_aio_simulated_wake_handler_threads();
683
os_thread_sleep(500000);
689
"InnoDB: Error: InnoDB has waited for"
690
" 50 seconds for pending\n"
691
"InnoDB: reads to the buffer pool to"
693
"InnoDB: Number of pending reads %lu,"
694
" pending pread calls %lu\n",
695
(ulong) buf_pool->n_pend_reads,
696
(ulong)os_file_n_pending_preads);
698
os_aio_print_debug = TRUE;
702
os_aio_print_debug = FALSE;
704
if ((i + 1 == n_stored) && sync) {
705
buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE,
706
space, tablespace_version,
709
buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
710
| OS_AIO_SIMULATED_WAKE_LATER,
711
space, tablespace_version,
716
os_aio_simulated_wake_handler_threads();
718
/* Flush pages from the end of the LRU list if necessary */
719
buf_flush_free_margin();
722
if (buf_debug_prints) {
724
"Recovery applies read-ahead pages %lu\n",
727
#endif /* UNIV_DEBUG */