1
/*****************************************************************************
3
Copyright (C) 1995, 2010, Innobase Oy. All Rights Reserved.
5
This program is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free Software
7
Foundation; version 2 of the License.
9
This program is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
You should have received a copy of the GNU General Public License along with
14
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
15
St, Fifth Floor, Boston, MA 02110-1301 USA
17
*****************************************************************************/
19
/**************************************************//**
21
The database buffer read
23
Created 11/5/1995 Heikki Tuuri
24
*******************************************************/
34
#include "ibuf0ibuf.h"
38
#include "srv0start.h"
41
/** The linear read-ahead area size */
42
#define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA
44
/** If more than buf_pool->curr_size divided by the number below reads are pending, then
45
read-ahead is not done: this is to prevent flooding the buffer pool with
46
i/o-fixed buffer blocks */
47
#define BUF_READ_AHEAD_PEND_LIMIT 2
49
/********************************************************************//**
50
Low-level function which reads a page asynchronously from a file to the
51
buffer buf_pool if it is not already there, in which case does nothing.
52
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
53
flag is cleared and the x-lock released by an i/o-handler thread.
54
@return 1 if a read request was queued, 0 if the page already resided
55
in buf_pool, or if the page is in the doublewrite buffer blocks in
56
which case it is never read into the pool, or if the tablespace does
57
not exist or is being dropped
58
@return 1 if read request is issued. 0 if it is not */
63
ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
64
trying to read from a non-existent tablespace, or a
65
tablespace which is just now being dropped */
66
ibool sync, /*!< in: TRUE if synchronous aio is desired */
67
ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
68
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
69
at read-ahead functions) */
70
ulint space, /*!< in: space id */
71
ulint zip_size,/*!< in: compressed page size, or 0 */
72
ibool unzip, /*!< in: TRUE=request uncompressed page */
73
ib_int64_t tablespace_version, /*!< in: if the space memory object has
74
this timestamp different from what we are giving here,
75
treat the tablespace as dropped; this is a timestamp we
76
use to stop dangling page reads from a tablespace
77
which we have DISCARDed + IMPORTed back */
78
ulint offset) /*!< in: page number */
85
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
86
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
88
if (trx_doublewrite && space == TRX_SYS_SPACE
89
&& ( (offset >= trx_doublewrite->block1
90
&& offset < trx_doublewrite->block1
91
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
92
|| (offset >= trx_doublewrite->block2
93
&& offset < trx_doublewrite->block2
94
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
95
ut_print_timestamp(stderr);
97
" InnoDB: Warning: trying to read"
98
" doublewrite buffer page %lu\n",
104
if (ibuf_bitmap_page(zip_size, offset)
105
|| trx_sys_hdr_page(space, offset)) {
107
/* Trx sys header is so low in the latching order that we play
108
safe and do not leave the i/o-completion to an asynchronous
109
i/o-thread. Ibuf bitmap pages must always be read with
110
synchronous i/o, to make sure they do not get involved in
116
/* The following call will also check if the tablespace does not exist
117
or is being dropped; if we succeed in initing the page in the buffer
118
pool for read, then DISCARD cannot proceed until the read has
120
bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
121
tablespace_version, offset);
128
if (buf_debug_prints) {
130
"Posting read request for page %lu, sync %lu\n",
136
ut_ad(buf_page_in_file(bpage));
139
*err = fil_io(OS_FILE_READ | wake_later,
140
sync, space, zip_size, offset, 0, zip_size,
141
bpage->zip.data, bpage);
143
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
145
*err = fil_io(OS_FILE_READ | wake_later,
146
sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
147
((buf_block_t*) bpage)->frame, bpage);
149
ut_a(*err == DB_SUCCESS);
152
/* The i/o is already completed when we arrive from
154
buf_page_io_complete(bpage);
160
/********************************************************************//**
161
High-level function which reads a page asynchronously from a file to the
162
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
163
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
164
released by the i/o-handler thread.
165
@return TRUE if page has been read in, FALSE in case of failure */
170
ulint space, /*!< in: space id */
171
ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
172
ulint offset) /*!< in: page number */
174
buf_pool_t* buf_pool = buf_pool_get(space, offset);
175
ib_int64_t tablespace_version;
179
tablespace_version = fil_space_get_version(space);
181
/* We do the i/o in the synchronous aio mode to save thread
182
switches: hence TRUE */
184
count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
186
tablespace_version, offset);
187
srv_buf_pool_reads += count;
188
if (err == DB_TABLESPACE_DELETED) {
189
ut_print_timestamp(stderr);
191
" InnoDB: Error: trying to access"
192
" tablespace %lu page no. %lu,\n"
193
"InnoDB: but the tablespace does not exist"
194
" or is just being dropped.\n",
195
(ulong) space, (ulong) offset);
198
/* Flush pages from the end of the LRU list if necessary */
199
buf_flush_free_margin(buf_pool);
201
/* Increment number of I/O operations used for LRU policy. */
202
buf_LRU_stat_inc_io();
207
/********************************************************************//**
208
Applies linear read-ahead if in the buf_pool the page is a border page of
209
a linear read-ahead area and all the pages in the area have been accessed.
210
Does not read any page if the read-ahead mechanism is not activated. Note
211
that the algorithm looks at the 'natural' adjacent successor and
212
predecessor of the page, which on the leaf level of a B-tree are the next
213
and previous page in the chain of leaves. To know these, the page specified
214
in (space, offset) must already be present in the buf_pool. Thus, the
215
natural way to use this function is to call it when a page in the buf_pool
216
is accessed the first time, calling this function just after it has been
218
NOTE 1: as this function looks at the natural predecessor and successor
219
fields on the page, what happens, if these are not initialized to any
220
sensible value? No problem, before applying read-ahead we check that the
221
area to read is within the span of the space, if not, read-ahead is not
222
applied. An uninitialized value may result in a useless read operation, but
223
only very improbably.
224
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
225
function must be written such that it cannot end up waiting for these
227
NOTE 3: the calling thread must want access to the page given: this rule is
228
set to prevent unintended read-aheads performed by ibuf routines, a situation
229
which could result in a deadlock if the OS does not support asynchronous io.
230
@return number of page read requests issued */
233
buf_read_ahead_linear(
234
/*==================*/
235
ulint space, /*!< in: space id */
236
ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
237
ulint offset) /*!< in: page number of a page; NOTE: the current thread
238
must want access to this page (see NOTE 3 above) */
240
buf_pool_t* buf_pool = buf_pool_get(space, offset);
241
ib_int64_t tablespace_version;
244
buf_page_t* pred_bpage = NULL;
255
const ulint buf_read_ahead_linear_area
256
= BUF_READ_AHEAD_LINEAR_AREA(buf_pool);
259
if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
260
/* No read-ahead to avoid thread deadlocks */
264
low = (offset / buf_read_ahead_linear_area)
265
* buf_read_ahead_linear_area;
266
high = (offset / buf_read_ahead_linear_area + 1)
267
* buf_read_ahead_linear_area;
269
if ((offset != low) && (offset != high - 1)) {
270
/* This is not a border page of the area: return */
275
if (ibuf_bitmap_page(zip_size, offset)
276
|| trx_sys_hdr_page(space, offset)) {
278
/* If it is an ibuf bitmap page or trx sys hdr, we do
279
no read-ahead, as that could break the ibuf page access
285
/* Remember the tablespace version before we ask the tablespace size
286
below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
287
do not try to read outside the bounds of the tablespace! */
289
tablespace_version = fil_space_get_version(space);
291
buf_pool_mutex_enter(buf_pool);
293
if (high > fil_space_get_size(space)) {
294
buf_pool_mutex_exit(buf_pool);
295
/* The area is not whole, return */
300
if (buf_pool->n_pend_reads
301
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
302
buf_pool_mutex_exit(buf_pool);
307
/* Check that almost all pages in the area have been accessed; if
308
offset == low, the accesses must be in a descending order, otherwise,
309
in an ascending order. */
317
/* How many out of order accessed pages can we ignore
318
when working out the access pattern for linear readahead */
319
threshold = ut_min((64 - srv_read_ahead_threshold),
320
BUF_READ_AHEAD_AREA(buf_pool));
324
for (i = low; i < high; i++) {
325
bpage = buf_page_hash_get(buf_pool, space, i);
327
if (bpage == NULL || !buf_page_is_accessed(bpage)) {
331
} else if (pred_bpage) {
332
/* Note that buf_page_is_accessed() returns
333
the time of the first access. If some blocks
334
of the extent existed in the buffer pool at
335
the time of a linear access pattern, the first
336
access times may be nonmonotonic, even though
337
the latest access times were linear. The
338
threshold (srv_read_ahead_threshold) should help
339
a little against this. */
340
int res = ut_ulint_cmp(
341
buf_page_is_accessed(bpage),
342
buf_page_is_accessed(pred_bpage));
343
/* Accesses not in the right order */
344
if (res != 0 && res != asc_or_desc) {
349
if (fail_count > threshold) {
350
/* Too many failures: return */
351
buf_pool_mutex_exit(buf_pool);
355
if (bpage && buf_page_is_accessed(bpage)) {
360
/* If we got this far, we know that enough pages in the area have
361
been accessed in the right order: linear read-ahead can be sensible */
363
bpage = buf_page_hash_get(buf_pool, space, offset);
366
buf_pool_mutex_exit(buf_pool);
371
switch (buf_page_get_state(bpage)) {
372
case BUF_BLOCK_ZIP_PAGE:
373
frame = bpage->zip.data;
375
case BUF_BLOCK_FILE_PAGE:
376
frame = ((buf_block_t*) bpage)->frame;
383
/* Read the natural predecessor and successor page addresses from
384
the page; NOTE that because the calling thread may have an x-latch
385
on the page, we do not acquire an s-latch on the page, this is to
386
prevent deadlocks. Even if we read values which are nonsense, the
387
algorithm will work. */
389
pred_offset = fil_page_get_prev(frame);
390
succ_offset = fil_page_get_next(frame);
392
buf_pool_mutex_exit(buf_pool);
394
if ((offset == low) && (succ_offset == offset + 1)) {
396
/* This is ok, we can continue */
397
new_offset = pred_offset;
399
} else if ((offset == high - 1) && (pred_offset == offset - 1)) {
401
/* This is ok, we can continue */
402
new_offset = succ_offset;
404
/* Successor or predecessor not in the right order */
409
low = (new_offset / buf_read_ahead_linear_area)
410
* buf_read_ahead_linear_area;
411
high = (new_offset / buf_read_ahead_linear_area + 1)
412
* buf_read_ahead_linear_area;
414
if ((new_offset != low) && (new_offset != high - 1)) {
415
/* This is not a border page of the area: return */
420
if (high > fil_space_get_size(space)) {
421
/* The area is not whole, return */
426
/* If we got this far, read-ahead can be sensible: do it */
429
ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
431
ibuf_mode = BUF_READ_ANY_PAGE;
436
/* Since Windows XP seems to schedule the i/o handler thread
437
very eagerly, and consequently it does not wait for the
438
full read batch to be posted, we use special heuristics here */
440
os_aio_simulated_put_read_threads_to_sleep();
442
for (i = low; i < high; i++) {
443
/* It is only sensible to do read-ahead in the non-sync
444
aio mode: hence FALSE as the first parameter */
446
if (!ibuf_bitmap_page(zip_size, i)) {
447
count += buf_read_page_low(
449
ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
450
space, zip_size, FALSE, tablespace_version, i);
451
if (err == DB_TABLESPACE_DELETED) {
452
ut_print_timestamp(stderr);
454
" InnoDB: Warning: in"
455
" linear readahead trying to access\n"
456
"InnoDB: tablespace %lu page %lu,\n"
457
"InnoDB: but the tablespace does not"
458
" exist or is just being dropped.\n",
459
(ulong) space, (ulong) i);
464
/* In simulated aio we wake the aio handler threads only after
465
queuing all aio requests, in native aio the following call does
468
os_aio_simulated_wake_handler_threads();
470
/* Flush pages from the end of the LRU list if necessary */
471
buf_flush_free_margin(buf_pool);
474
if (buf_debug_prints && (count > 0)) {
476
"LINEAR read-ahead space %lu offset %lu pages %lu\n",
477
(ulong) space, (ulong) offset, (ulong) count);
479
#endif /* UNIV_DEBUG */
481
/* Read ahead is considered one I/O operation for the purpose of
482
LRU policy decision. */
483
buf_LRU_stat_inc_io();
485
buf_pool->stat.n_ra_pages_read += count;
489
/********************************************************************//**
490
Issues read requests for pages which the ibuf module wants to read in, in
491
order to contract the insert buffer tree. Technically, this function is like
492
a read-ahead function. Before each request is posted, the function sleeps
while the buffer pool instance chosen by buf_pool_get() for the
(space id, page number) pair being read has more than
curr_size / BUF_READ_AHEAD_PEND_LIMIT reads pending. */
495
buf_read_ibuf_merge_pages(
496
/*======================*/
497
ibool sync, /*!< in: TRUE if the caller
498
wants this function to wait
499
for the highest address page
500
to get read in, before this
502
const ulint* space_ids, /*!< in: array of space ids */
503
const ib_int64_t* space_versions,/*!< in: the spaces must have
505
(timestamp), otherwise we
506
discard the read; we use this
507
to cancel reads if DISCARD +
508
IMPORT may have changed the
510
const ulint* page_nos, /*!< in: array of page numbers
511
to read, with the highest page
512
number the last in the
514
ulint n_stored) /*!< in: number of elements
519
ut_ad(!ibuf_inside());
520
#ifdef UNIV_IBUF_DEBUG
521
ut_a(n_stored < UNIV_PAGE_SIZE);
524
for (i = 0; i < n_stored; i++) {
526
buf_pool_t* buf_pool;
527
ulint zip_size = fil_space_get_zip_size(space_ids[i]);
529
buf_pool = buf_pool_get(space_ids[i], page_nos[i]);
531
while (buf_pool->n_pend_reads
532
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
533
os_thread_sleep(500000);
536
if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
538
goto tablespace_deleted;
541
buf_read_page_low(&err, sync && (i + 1 == n_stored),
542
BUF_READ_ANY_PAGE, space_ids[i],
543
zip_size, TRUE, space_versions[i],
546
if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
548
/* We have deleted or are deleting the single-table
549
tablespace: remove the entries for that page */
551
ibuf_merge_or_delete_for_page(NULL, space_ids[i],
557
os_aio_simulated_wake_handler_threads();
559
/* Flush pages from the end of all the LRU lists if necessary */
560
buf_flush_free_margins();
563
if (buf_debug_prints) {
565
"Ibuf merge read-ahead space %lu pages %lu\n",
566
(ulong) space_ids[0], (ulong) n_stored);
568
#endif /* UNIV_DEBUG */
571
/********************************************************************//**
572
Issues read requests for pages which recovery wants to read in. */
577
ibool sync, /*!< in: TRUE if the caller
578
wants this function to wait
579
for the highest address page
580
to get read in, before this
582
ulint space, /*!< in: space id */
583
ulint zip_size, /*!< in: compressed page size in
585
const ulint* page_nos, /*!< in: array of page numbers
586
to read, with the highest page
587
number the last in the
589
ulint n_stored) /*!< in: number of page numbers
592
ib_int64_t tablespace_version;
597
zip_size = fil_space_get_zip_size(space);
599
if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
600
/* It is a single table tablespace and the .ibd file is
601
missing: do nothing */
606
tablespace_version = fil_space_get_version(space);
608
for (i = 0; i < n_stored; i++) {
609
buf_pool_t* buf_pool;
613
os_aio_print_debug = FALSE;
614
buf_pool = buf_pool_get(space, page_nos[i]);
615
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
617
os_aio_simulated_wake_handler_threads();
618
os_thread_sleep(10000);
624
"InnoDB: Error: InnoDB has waited for"
625
" 10 seconds for pending\n"
626
"InnoDB: reads to the buffer pool to"
628
"InnoDB: Number of pending reads %lu,"
629
" pending pread calls %lu\n",
630
(ulong) buf_pool->n_pend_reads,
631
(ulong)os_file_n_pending_preads);
633
os_aio_print_debug = TRUE;
637
os_aio_print_debug = FALSE;
639
if ((i + 1 == n_stored) && sync) {
640
buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
641
zip_size, TRUE, tablespace_version,
644
buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
645
| OS_AIO_SIMULATED_WAKE_LATER,
646
space, zip_size, TRUE,
647
tablespace_version, page_nos[i]);
651
os_aio_simulated_wake_handler_threads();
653
/* Flush pages from the end of all the LRU lists if necessary */
654
buf_flush_free_margins();
657
if (buf_debug_prints) {
659
"Recovery applies read-ahead pages %lu\n",
662
#endif /* UNIV_DEBUG */