1
/******************************************************
2
The database buffer read
6
Created 11/5/1995 Heikki Tuuri
7
*******************************************************/
17
#include "ibuf0ibuf.h"
21
#include "srv0start.h"
24
/* The size, in blocks (pages), of the aligned area in which the random
read-ahead algorithm counts the recently accessed pages when deciding whether
to read ahead. Defined as an alias of BUF_READ_AHEAD_AREA. */
26
#define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA
28
/* There must be at least this many recently accessed pages in buf_pool
within the random read-ahead area to start a random read-ahead.
NOTE: the expansion refers to buf_read_ahead_random_area, which is not a
macro: a variable of that name must be in scope wherever this is used. */
30
#define BUF_READ_AHEAD_RANDOM_THRESHOLD (5 + buf_read_ahead_random_area / 8)
32
/* The linear read-ahead area size, in pages: linear read-ahead works on
areas of this size, aligned to a multiple of the size. Defined as an alias
of BUF_READ_AHEAD_AREA. */
33
#define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA
35
/* The linear read-ahead threshold, as a fraction of the area size: linear
read-ahead is abandoned when more than this fraction of the pages in the
area fail the access check.
NOTE: the expansion has to stay unparenthesized. It is used as
`area * LINEAR_AREA_THRESHOLD_COEF', which evaluates as (area * 5) / 8;
written as (5 / 8), integer division would make the coefficient 0. */
36
#define LINEAR_AREA_THRESHOLD_COEF 5 / 8
38
/* Read-ahead is not done while the number of pending reads exceeds
buf_pool->curr_size divided by the number below: this is to prevent
flooding the buffer pool with i/o-fixed buffer blocks */
41
#define BUF_READ_AHEAD_PEND_LIMIT 2
43
/************************************************************************
44
Low-level function which reads a page asynchronously from a file to the
45
buffer buf_pool if it is not already there, in which case does nothing.
46
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
47
flag is cleared and the x-lock released by an i/o-handler thread. */
52
/* out: 1 if a read request was queued, 0 if the page
53
already resided in buf_pool, or if the page is in
54
the doublewrite buffer blocks in which case it is never
55
read into the pool, or if the tablespace does not
56
exist or is being dropped */
57
ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
58
trying to read from a non-existent tablespace, or a
59
tablespace which is just now being dropped */
60
ibool sync, /* in: TRUE if synchronous aio is desired */
61
ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ...,
62
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
63
at read-ahead functions) */
64
ulint space, /* in: space id */
65
ulint zip_size,/* in: compressed page size, or 0 */
66
ibool unzip, /* in: TRUE=request uncompressed page */
67
ib_int64_t tablespace_version, /* in: if the space memory object has
68
this timestamp different from what we are giving here,
69
treat the tablespace as dropped; this is a timestamp we
70
use to stop dangling page reads from a tablespace
71
which we have DISCARDed + IMPORTed back */
72
ulint offset) /* in: page number */
79
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
80
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
82
if (trx_doublewrite && space == TRX_SYS_SPACE
83
&& ( (offset >= trx_doublewrite->block1
84
&& offset < trx_doublewrite->block1
85
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
86
|| (offset >= trx_doublewrite->block2
87
&& offset < trx_doublewrite->block2
88
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
89
ut_print_timestamp(stderr);
91
" InnoDB: Warning: trying to read"
92
" doublewrite buffer page %lu\n",
98
if (ibuf_bitmap_page(zip_size, offset)
99
|| trx_sys_hdr_page(space, offset)) {
101
/* Trx sys header is so low in the latching order that we play
102
safe and do not leave the i/o-completion to an asynchronous
103
i/o-thread. Ibuf bitmap pages must always be read with
104
synchronous i/o, to make sure they do not get involved in
110
/* The following call will also check if the tablespace does not exist
111
or is being dropped; if we succeed in initing the page in the buffer
112
pool for read, then DISCARD cannot proceed until the read has
114
bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
115
tablespace_version, offset);
122
if (buf_debug_prints) {
124
"Posting read request for page %lu, sync %lu\n",
130
ut_ad(buf_page_in_file(bpage));
133
*err = fil_io(OS_FILE_READ | wake_later,
134
sync, space, zip_size, offset, 0, zip_size,
135
bpage->zip.data, bpage);
137
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
139
*err = fil_io(OS_FILE_READ | wake_later,
140
sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
141
((buf_block_t*) bpage)->frame, bpage);
143
ut_a(*err == DB_SUCCESS);
146
/* The i/o is already completed when we arrive from
148
buf_page_io_complete(bpage);
154
/************************************************************************
155
Applies a random read-ahead in buf_pool if there are at least a threshold
156
value of accessed pages from the random read-ahead area. Does not read any
157
page, not even the one at the position (space, offset), if the read-ahead
158
mechanism is not activated. NOTE 1: the calling thread may own latches on
159
pages: to avoid deadlocks this function must be written such that it cannot
160
end up waiting for these latches! NOTE 2: the calling thread must want
161
access to the page given: this rule is set to prevent unintended read-aheads
162
performed by ibuf routines, a situation which could result in a deadlock if
163
the OS does not support asynchronous i/o. */
166
buf_read_ahead_random(
167
/*==================*/
168
/* out: number of page read requests issued; NOTE
169
that if we read ibuf pages, it may happen that
170
the page at the given page number does not get
171
read even if we return a value > 0! */
172
ulint space, /* in: space id */
173
ulint zip_size,/* in: compressed page size in bytes, or 0 */
174
ulint offset) /* in: page number of a page which the current thread
177
ib_int64_t tablespace_version;
178
ulint recent_blocks = 0;
180
ulint LRU_recent_limit;
185
ulint buf_read_ahead_random_area;
187
if (srv_startup_is_before_trx_rollback_phase) {
188
/* No read-ahead to avoid thread deadlocks */
192
if (ibuf_bitmap_page(zip_size, offset)
193
|| trx_sys_hdr_page(space, offset)) {
195
/* If it is an ibuf bitmap page or trx sys hdr, we do
196
no read-ahead, as that could break the ibuf page access
202
/* Remember the tablespace version before we ask the tablespace size
203
below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
204
do not try to read outside the bounds of the tablespace! */
206
tablespace_version = fil_space_get_version(space);
208
buf_read_ahead_random_area = BUF_READ_AHEAD_RANDOM_AREA;
210
low = (offset / buf_read_ahead_random_area)
211
* buf_read_ahead_random_area;
212
high = (offset / buf_read_ahead_random_area + 1)
213
* buf_read_ahead_random_area;
214
if (high > fil_space_get_size(space)) {
216
high = fil_space_get_size(space);
219
/* Get the minimum LRU_position field value for an initial segment
220
of the LRU list, to determine which blocks have recently been added
221
to the start of the list. */
223
LRU_recent_limit = buf_LRU_get_recent_limit();
225
buf_pool_mutex_enter();
227
if (buf_pool->n_pend_reads
228
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
229
buf_pool_mutex_exit();
234
/* Count how many blocks in the area have been recently accessed,
235
that is, reside near the start of the LRU list. */
237
for (i = low; i < high; i++) {
238
const buf_page_t* bpage = buf_page_hash_get(space, i);
241
&& buf_page_is_accessed(bpage)
242
&& (buf_page_get_LRU_position(bpage) > LRU_recent_limit)) {
246
if (recent_blocks >= BUF_READ_AHEAD_RANDOM_THRESHOLD) {
248
buf_pool_mutex_exit();
254
buf_pool_mutex_exit();
259
/* Read all the suitable blocks within the area */
262
ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
264
ibuf_mode = BUF_READ_ANY_PAGE;
269
for (i = low; i < high; i++) {
270
/* It is only sensible to do read-ahead in the non-sync aio
271
mode: hence FALSE as the sync parameter */
273
if (!ibuf_bitmap_page(zip_size, i)) {
274
count += buf_read_page_low(
276
ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
277
space, zip_size, FALSE,
278
tablespace_version, i);
279
if (err == DB_TABLESPACE_DELETED) {
280
ut_print_timestamp(stderr);
282
" InnoDB: Warning: in random"
283
" readahead trying to access\n"
284
"InnoDB: tablespace %lu page %lu,\n"
285
"InnoDB: but the tablespace does not"
286
" exist or is just being dropped.\n",
287
(ulong) space, (ulong) i);
292
/* In simulated aio we wake the aio handler threads only after
293
queuing all aio requests, in native aio the following call does
296
os_aio_simulated_wake_handler_threads();
299
if (buf_debug_prints && (count > 0)) {
301
"Random read-ahead space %lu offset %lu pages %lu\n",
302
(ulong) space, (ulong) offset,
305
#endif /* UNIV_DEBUG */
307
++srv_read_ahead_rnd;
311
/************************************************************************
312
High-level function which reads a page asynchronously from a file to the
313
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
314
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
315
released by the i/o-handler thread. Does a random read-ahead if it seems
321
/* out: number of page read requests issued: this can
322
be > 1 if read-ahead occurred */
323
ulint space, /* in: space id */
324
ulint zip_size,/* in: compressed page size in bytes, or 0 */
325
ulint offset) /* in: page number */
327
ib_int64_t tablespace_version;
332
tablespace_version = fil_space_get_version(space);
334
count = buf_read_ahead_random(space, zip_size, offset);
336
/* We do the i/o in the synchronous aio mode to save thread
337
switches: hence TRUE */
339
count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
341
tablespace_version, offset);
342
srv_buf_pool_reads+= count2;
343
if (err == DB_TABLESPACE_DELETED) {
344
ut_print_timestamp(stderr);
346
" InnoDB: Error: trying to access"
347
" tablespace %lu page no. %lu,\n"
348
"InnoDB: but the tablespace does not exist"
349
" or is just being dropped.\n",
350
(ulong) space, (ulong) offset);
353
/* Flush pages from the end of the LRU list if necessary */
354
buf_flush_free_margin();
356
/* Increment number of I/O operations used for LRU policy. */
357
buf_LRU_stat_inc_io();
359
return(count + count2);
362
/************************************************************************
363
Applies linear read-ahead if in the buf_pool the page is a border page of
364
a linear read-ahead area and all the pages in the area have been accessed.
365
Does not read any page if the read-ahead mechanism is not activated. Note
366
that the algorithm looks at the 'natural' adjacent successor and
367
predecessor of the page, which on the leaf level of a B-tree are the next
368
and previous page in the chain of leaves. To know these, the page specified
369
in (space, offset) must already be present in the buf_pool. Thus, the
370
natural way to use this function is to call it when a page in the buf_pool
371
is accessed the first time, calling this function just after it has been
373
NOTE 1: as this function looks at the natural predecessor and successor
374
fields on the page, what happens, if these are not initialized to any
375
sensible value? No problem, before applying read-ahead we check that the
376
area to read is within the span of the space, if not, read-ahead is not
377
applied. An uninitialized value may result in a useless read operation, but
378
only very improbably.
379
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
380
function must be written such that it cannot end up waiting for these
382
NOTE 3: the calling thread must want access to the page given: this rule is
383
set to prevent unintended read-aheads performed by ibuf routines, a situation
384
which could result in a deadlock if the OS does not support asynchronous io. */
387
buf_read_ahead_linear(
388
/*==================*/
389
/* out: number of page read requests issued */
390
ulint space, /* in: space id */
391
ulint zip_size,/* in: compressed page size in bytes, or 0 */
392
ulint offset) /* in: page number of a page; NOTE: the current thread
393
must want access to this page (see NOTE 3 above) */
395
ib_int64_t tablespace_version;
398
buf_page_t* pred_bpage = NULL;
409
const ulint buf_read_ahead_linear_area
410
= BUF_READ_AHEAD_LINEAR_AREA;
412
if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
413
/* No read-ahead to avoid thread deadlocks */
417
low = (offset / buf_read_ahead_linear_area)
418
* buf_read_ahead_linear_area;
419
high = (offset / buf_read_ahead_linear_area + 1)
420
* buf_read_ahead_linear_area;
422
if ((offset != low) && (offset != high - 1)) {
423
/* This is not a border page of the area: return */
428
if (ibuf_bitmap_page(zip_size, offset)
429
|| trx_sys_hdr_page(space, offset)) {
431
/* If it is an ibuf bitmap page or trx sys hdr, we do
432
no read-ahead, as that could break the ibuf page access
438
/* Remember the tablespace version before we ask the tablespace size
439
below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
440
do not try to read outside the bounds of the tablespace! */
442
tablespace_version = fil_space_get_version(space);
444
buf_pool_mutex_enter();
446
if (high > fil_space_get_size(space)) {
447
buf_pool_mutex_exit();
448
/* The area is not whole, return */
453
if (buf_pool->n_pend_reads
454
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
455
buf_pool_mutex_exit();
460
/* Check that almost all pages in the area have been accessed; if
461
offset == low, the accesses must be in a descending order, otherwise,
462
in an ascending order. */
472
for (i = low; i < high; i++) {
473
bpage = buf_page_hash_get(space, i);
475
if ((bpage == NULL) || !buf_page_is_accessed(bpage)) {
479
} else if (pred_bpage
481
buf_page_get_LRU_position(bpage),
482
buf_page_get_LRU_position(pred_bpage))
484
/* Accesses not in the right order */
491
if (fail_count > buf_read_ahead_linear_area
492
* LINEAR_AREA_THRESHOLD_COEF) {
493
/* Too many failures: return */
495
buf_pool_mutex_exit();
500
/* If we got this far, we know that enough pages in the area have
501
been accessed in the right order: linear read-ahead can be sensible */
503
bpage = buf_page_hash_get(space, offset);
506
buf_pool_mutex_exit();
511
switch (buf_page_get_state(bpage)) {
512
case BUF_BLOCK_ZIP_PAGE:
513
frame = bpage->zip.data;
515
case BUF_BLOCK_FILE_PAGE:
516
frame = ((buf_block_t*) bpage)->frame;
523
/* Read the natural predecessor and successor page addresses from
524
the page; NOTE that because the calling thread may have an x-latch
525
on the page, we do not acquire an s-latch on the page, this is to
526
prevent deadlocks. Even if we read values which are nonsense, the
527
algorithm will work. */
529
pred_offset = fil_page_get_prev(frame);
530
succ_offset = fil_page_get_next(frame);
532
buf_pool_mutex_exit();
534
if ((offset == low) && (succ_offset == offset + 1)) {
536
/* This is ok, we can continue */
537
new_offset = pred_offset;
539
} else if ((offset == high - 1) && (pred_offset == offset - 1)) {
541
/* This is ok, we can continue */
542
new_offset = succ_offset;
544
/* Successor or predecessor not in the right order */
549
low = (new_offset / buf_read_ahead_linear_area)
550
* buf_read_ahead_linear_area;
551
high = (new_offset / buf_read_ahead_linear_area + 1)
552
* buf_read_ahead_linear_area;
554
if ((new_offset != low) && (new_offset != high - 1)) {
555
/* This is not a border page of the area: return */
560
if (high > fil_space_get_size(space)) {
561
/* The area is not whole, return */
566
/* If we got this far, read-ahead can be sensible: do it */
569
ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
571
ibuf_mode = BUF_READ_ANY_PAGE;
576
/* Since Windows XP seems to schedule the i/o handler thread
577
very eagerly, and consequently it does not wait for the
578
full read batch to be posted, we use special heuristics here */
580
os_aio_simulated_put_read_threads_to_sleep();
582
for (i = low; i < high; i++) {
583
/* It is only sensible to do read-ahead in the non-sync
584
aio mode: hence FALSE as the sync parameter */
586
if (!ibuf_bitmap_page(zip_size, i)) {
587
count += buf_read_page_low(
589
ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
590
space, zip_size, FALSE, tablespace_version, i);
591
if (err == DB_TABLESPACE_DELETED) {
592
ut_print_timestamp(stderr);
594
" InnoDB: Warning: in"
595
" linear readahead trying to access\n"
596
"InnoDB: tablespace %lu page %lu,\n"
597
"InnoDB: but the tablespace does not"
598
" exist or is just being dropped.\n",
599
(ulong) space, (ulong) i);
604
/* In simulated aio we wake the aio handler threads only after
605
queuing all aio requests, in native aio the following call does
608
os_aio_simulated_wake_handler_threads();
610
/* Flush pages from the end of the LRU list if necessary */
611
buf_flush_free_margin();
614
if (buf_debug_prints && (count > 0)) {
616
"LINEAR read-ahead space %lu offset %lu pages %lu\n",
617
(ulong) space, (ulong) offset, (ulong) count);
619
#endif /* UNIV_DEBUG */
621
/* Read ahead is considered one I/O operation for the purpose of
622
LRU policy decision. */
623
buf_LRU_stat_inc_io();
625
++srv_read_ahead_seq;
629
/************************************************************************
630
Issues read requests for pages which the ibuf module wants to read in, in
631
order to contract the insert buffer tree. Technically, this function is like
632
a read-ahead function. */
635
buf_read_ibuf_merge_pages(
636
/*======================*/
637
ibool sync, /* in: TRUE if the caller
638
wants this function to wait
639
for the highest address page
640
to get read in, before this
642
const ulint* space_ids, /* in: array of space ids */
643
const ib_int64_t* space_versions,/* in: the spaces must have
645
(timestamp), otherwise we
646
discard the read; we use this
647
to cancel reads if DISCARD +
648
IMPORT may have changed the
650
const ulint* page_nos, /* in: array of page numbers
651
to read, with the highest page
652
number the last in the
654
ulint n_stored) /* in: number of elements
659
ut_ad(!ibuf_inside());
660
#ifdef UNIV_IBUF_DEBUG
661
ut_a(n_stored < UNIV_PAGE_SIZE);
663
while (buf_pool->n_pend_reads
664
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
665
os_thread_sleep(500000);
668
for (i = 0; i < n_stored; i++) {
669
ulint zip_size = fil_space_get_zip_size(space_ids[i]);
672
if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
674
goto tablespace_deleted;
677
buf_read_page_low(&err, sync && (i + 1 == n_stored),
678
BUF_READ_ANY_PAGE, space_ids[i],
679
zip_size, TRUE, space_versions[i],
682
if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
684
/* We have deleted or are deleting the single-table
685
tablespace: remove the entries for that page */
687
ibuf_merge_or_delete_for_page(NULL, space_ids[i],
693
os_aio_simulated_wake_handler_threads();
695
/* Flush pages from the end of the LRU list if necessary */
696
buf_flush_free_margin();
699
if (buf_debug_prints) {
701
"Ibuf merge read-ahead space %lu pages %lu\n",
702
(ulong) space_ids[0], (ulong) n_stored);
704
#endif /* UNIV_DEBUG */
707
/************************************************************************
708
Issues read requests for pages which recovery wants to read in. */
713
ibool sync, /* in: TRUE if the caller
714
wants this function to wait
715
for the highest address page
716
to get read in, before this
718
ulint space, /* in: space id */
719
ulint zip_size, /* in: compressed page size in
721
const ulint* page_nos, /* in: array of page numbers
722
to read, with the highest page
723
number the last in the
725
ulint n_stored) /* in: number of page numbers
728
ib_int64_t tablespace_version;
733
zip_size = fil_space_get_zip_size(space);
734
tablespace_version = fil_space_get_version(space);
736
for (i = 0; i < n_stored; i++) {
740
os_aio_print_debug = FALSE;
742
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
744
os_aio_simulated_wake_handler_threads();
745
os_thread_sleep(500000);
751
"InnoDB: Error: InnoDB has waited for"
752
" 50 seconds for pending\n"
753
"InnoDB: reads to the buffer pool to"
755
"InnoDB: Number of pending reads %lu,"
756
" pending pread calls %lu\n",
757
(ulong) buf_pool->n_pend_reads,
758
(ulong)os_file_n_pending_preads);
760
os_aio_print_debug = TRUE;
764
os_aio_print_debug = FALSE;
766
if ((i + 1 == n_stored) && sync) {
767
buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
768
zip_size, TRUE, tablespace_version,
771
buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
772
| OS_AIO_SIMULATED_WAKE_LATER,
773
space, zip_size, TRUE,
774
tablespace_version, page_nos[i]);
778
os_aio_simulated_wake_handler_threads();
780
/* Flush pages from the end of the LRU list if necessary */
781
buf_flush_free_margin();
784
if (buf_debug_prints) {
786
"Recovery applies read-ahead pages %lu\n",
789
#endif /* UNIV_DEBUG */