1
/* Innobase relational database engine; Copyright (C) 2001 Innobase Oy
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License 2
5
as published by the Free Software Foundation in June 1991.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License 2
13
along with this program (in file COPYING); if not, write to the Free
14
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
15
/******************************************************
16
The database buffer buf_pool
20
Created 11/5/1995 Heikki Tuuri
21
*******************************************************/
32
#include "lock0lock.h"
34
#include "ibuf0ibuf.h"
35
#include "dict0dict.h"
42
IMPLEMENTATION OF THE BUFFER POOL
43
=================================
45
Performance improvement:
46
------------------------
47
Thread scheduling in NT may be so slow that the OS wait mechanism should
48
not be used even in waiting for disk reads to complete.
49
Rather, we should put waiting query threads to the queue of
50
waiting jobs, and let the OS thread do something useful while the i/o
51
is processed. In this way we could remove most OS thread switches in
52
an i/o-intensive benchmark like TPC-C.
54
A possibility is to put a user space thread library between the database
55
and NT. User space thread libraries might be very fast.
57
SQL Server 7.0 can be configured to use 'fibers' which are lightweight
58
threads in NT. These should be studied.
60
Buffer frames and blocks
61
------------------------
62
Following the terminology of Gray and Reuter, we call the memory
63
blocks where file pages are loaded buffer frames. For each buffer
64
frame there is a control block, or shortly, a block, in the buffer
65
control array. The control info which does not need to be stored
66
in the file along with the file page, resides in the control block.
70
The buffer buf_pool contains a single mutex which protects all the
71
control data structures of the buf_pool. The content of a buffer frame is
72
protected by a separate read-write lock in its control block, though.
73
These locks can be locked and unlocked without owning the buf_pool mutex.
74
The OS events in the buf_pool struct can be waited for without owning the
77
The buf_pool mutex is a hot-spot in main memory, causing a lot of
78
memory bus traffic on multiprocessor systems when processors
79
alternately access the mutex. On our Pentium, the mutex is accessed
80
maybe every 10 microseconds. We gave up the solution to have mutexes
81
for each control block, for instance, because it seemed to be
84
A solution to reduce mutex contention of the buf_pool mutex is to
85
create a separate mutex for the page hash table. On Pentium,
86
accessing the hash table takes 2 microseconds, about half
87
of the total buf_pool mutex hold time.
92
The control block contains, for instance, the bufferfix count
93
which is incremented when a thread wants a file page to be fixed
94
in a buffer frame. The bufferfix operation does not lock the
95
contents of the frame, however. For this purpose, the control
96
block contains a read-write lock.
98
The buffer frames have to be aligned so that the start memory
99
address of a frame is divisible by the universal page size, which
102
We intend to make the buffer buf_pool size on-line reconfigurable,
103
that is, the buf_pool size can be changed without closing the database.
104
Then the database administrator may adjust it to be bigger
105
at night, for example. The control block array must
106
contain enough control blocks for the maximum buffer buf_pool size
107
which is used in the particular database.
108
If the buf_pool size is cut, we exploit the virtual memory mechanism of
109
the OS, and just refrain from using frames at high addresses. Then the OS
110
can swap them to disk.
112
The control blocks containing file pages are put to a hash table
113
according to the file address of the page.
114
We could speed up the access to an individual page by using
115
"pointer swizzling": we could replace the page references on
116
non-leaf index pages by direct pointers to the page, if it exists
117
in the buf_pool. We could make a separate hash table where we could
118
chain all the page references in non-leaf pages residing in the buf_pool,
119
using the page reference as the hash key,
120
and at the time of reading of a page update the pointers accordingly.
121
Drawbacks of this solution are added complexity and,
122
possibly, extra space required on non-leaf pages for memory pointers.
123
A simpler solution is just to speed up the hash table mechanism
124
in the database, using tables whose size is a power of 2.
129
There are several lists of control blocks. The free list contains
130
blocks which are currently not used.
132
The LRU-list contains all the blocks holding a file page
133
except those for which the bufferfix count is non-zero.
134
The pages are in the LRU list roughly in the order of the last
135
access to the page, so that the oldest pages are at the end of the
136
list. We also keep a pointer to near the end of the LRU list,
137
which we can use when we want to artificially age a page in the
138
buf_pool. This is used if we know that some page is not needed
139
again for some time: we insert the block right after the pointer,
140
causing it to be replaced sooner than would normally be the case.
141
Currently this aging mechanism is used for read-ahead mechanism
142
of pages, and it can also be used when there is a scan of a full
143
table which cannot fit in the memory. Putting the pages near the
144
end of the LRU list, we make sure that most of the buf_pool stays in the
145
main memory, undisturbed.
147
The chain of modified blocks contains the blocks
148
holding file pages that have been modified in the memory
149
but not written to disk yet. The block with the oldest modification
150
which has not yet been written to disk is at the end of the chain.
155
First, a victim block for replacement has to be found in the
156
buf_pool. It is taken from the free list or searched for from the
157
end of the LRU-list. An exclusive lock is reserved for the frame,
158
the io_fix field is set in the block fixing the block in buf_pool,
159
and the io-operation for loading the page is queued. The io-handler thread
160
releases the X-lock on the frame and resets the io_fix field
161
when the io operation completes.
163
A thread may request the above operation using the function
164
buf_page_get(). It may then continue to request a lock on the frame.
165
The lock is granted when the io-handler releases the x-lock.
170
The read-ahead mechanism is intended to be intelligent and
171
isolated from the semantically higher levels of the database
172
index management. From the higher level we only need the
173
information if a file page has a natural successor or
174
predecessor page. On the leaf level of a B-tree index,
175
these are the next and previous pages in the natural
178
Let us first explain the read-ahead mechanism when the leaves
179
of a B-tree are scanned in an ascending or descending order.
180
When a read page is referenced for the first time in the buf_pool,
181
the buffer manager checks if it is at the border of a so-called
182
linear read-ahead area. The tablespace is divided into these
183
areas of size 64 blocks, for example. So if the page is at the
184
border of such an area, the read-ahead mechanism checks if
185
all the other blocks in the area have been accessed in an
186
ascending or descending order. If this is the case, the system
187
looks at the natural successor or predecessor of the page,
188
checks if that is at the border of another area, and in this case
189
issues read-requests for all the pages in that area. Maybe
190
we could relax the condition that all the pages in the area
191
have to be accessed: if data is deleted from a table, there may
192
appear holes of unused pages in the area.
194
A different read-ahead mechanism is used when there appears
195
to be a random access pattern to a file.
196
If a new page is referenced in the buf_pool, and several pages
197
of its random access area (for instance, 32 consecutive pages
198
in a tablespace) have recently been referenced, we may predict
199
that the whole area may be needed in the near future, and issue
200
the read requests for the whole area.
205
By a 'block' we mean the buffer header of type buf_block_t. By a 'page'
206
we mean the physical 16 kB memory area allocated from RAM for that block.
207
By a 'frame' we mean a 16 kB area in the virtual address space of the
208
process, in the frame_mem of buf_pool.
210
We can map pages to the frames of the buffer pool.
212
1) A buffer block allocated to use as a non-data page, e.g., to the lock
213
table, is always mapped to a frame.
214
2) A bufferfixed or io-fixed data page is always mapped to a frame.
215
3) When we need to map a block to frame, we look from the list
216
awe_LRU_free_mapped and try to unmap its last block, but note that
217
bufferfixed or io-fixed pages cannot be unmapped.
218
4) For every frame in the buffer pool there is always a block whose page is
219
mapped to it. When we create the buffer pool, we map the first elements
220
in the free list to the frames.
221
5) When we have AWE enabled, we disable adaptive hash indexes.
224
/* Value in microseconds */
225
static const int WAIT_FOR_READ = 20000;
227
buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */
230
ulint buf_dbg_counter = 0; /* This is used to insert validation
231
operations in execution in the
233
ibool buf_debug_prints = FALSE; /* If this is set TRUE,
234
the program prints info whenever
235
read-ahead or flush occurs */
236
#endif /* UNIV_DEBUG */
237
/************************************************************************
238
Calculates a page checksum which is stored to the page when it is written
239
to a file. Note that we must be careful to calculate the same value on
240
32-bit and 64-bit architectures. */
243
buf_calc_page_new_checksum(
244
/*=======================*/
246
byte* page) /* in: buffer page */
250
/* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
251
..._ARCH_LOG_NO, are written outside the buffer pool to the first
252
pages of data files, we have to skip them in the page checksum
254
We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
255
checksum is stored, and also the last 8 bytes of page because
256
there we store the old formula checksum. */
258
checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
259
FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
260
+ ut_fold_binary(page + FIL_PAGE_DATA,
261
UNIV_PAGE_SIZE - FIL_PAGE_DATA
262
- FIL_PAGE_END_LSN_OLD_CHKSUM);
263
checksum = checksum & 0xFFFFFFFFUL;
268
/************************************************************************
269
In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
270
looked at the first few bytes of the page. This calculates that old
272
NOTE: we must first store the new formula checksum to
273
FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
274
because this takes that field as an input! */
277
buf_calc_page_old_checksum(
278
/*=======================*/
280
byte* page) /* in: buffer page */
284
checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
286
checksum = checksum & 0xFFFFFFFFUL;
291
/************************************************************************
292
Checks if a page is corrupt. */
295
buf_page_is_corrupted(
296
/*==================*/
297
/* out: TRUE if corrupted */
298
byte* read_buf) /* in: a database page */
302
ulint checksum_field;
303
ulint old_checksum_field;
304
#ifndef UNIV_HOTBACKUP
307
if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
308
!= mach_read_from_4(read_buf + UNIV_PAGE_SIZE
309
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
311
/* Stored log sequence numbers at the start and the end
312
of page do not match */
317
#ifndef UNIV_HOTBACKUP
318
if (recv_lsn_checks_on && log_peek_lsn(&current_lsn)) {
319
if (ut_dulint_cmp(current_lsn,
320
mach_read_from_8(read_buf + FIL_PAGE_LSN))
322
ut_print_timestamp(stderr);
325
" InnoDB: Error: page %lu log sequence number"
327
"InnoDB: is in the future! Current system "
328
"log sequence number %lu %lu.\n"
329
"InnoDB: Your database may be corrupt or "
330
"you may have copied the InnoDB\n"
331
"InnoDB: tablespace but not the InnoDB "
333
"InnoDB: http://dev.mysql.com/doc/refman/"
334
"5.1/en/forcing-recovery.html\n"
335
"InnoDB: for more information.\n",
336
(ulong) mach_read_from_4(read_buf
338
(ulong) ut_dulint_get_high
339
(mach_read_from_8(read_buf + FIL_PAGE_LSN)),
340
(ulong) ut_dulint_get_low
341
(mach_read_from_8(read_buf + FIL_PAGE_LSN)),
342
(ulong) ut_dulint_get_high(current_lsn),
343
(ulong) ut_dulint_get_low(current_lsn));
348
/* If we use checksums validation, make additional check before
349
returning TRUE to ensure that the checksum is not equal to
350
BUF_NO_CHECKSUM_MAGIC which might be stored by InnoDB with checksums
351
disabled. Otherwise, skip checksum calculation and return FALSE */
353
if (srv_use_checksums) {
354
old_checksum = buf_calc_page_old_checksum(read_buf);
356
old_checksum_field = mach_read_from_4(
357
read_buf + UNIV_PAGE_SIZE
358
- FIL_PAGE_END_LSN_OLD_CHKSUM);
360
/* There are 2 valid formulas for old_checksum_field:
362
1. Very old versions of InnoDB only stored 8 byte lsn to the
363
start and the end of the page.
365
2. Newer InnoDB versions store the old formula checksum
368
if (old_checksum_field != mach_read_from_4(read_buf
370
&& old_checksum_field != old_checksum
371
&& old_checksum_field != BUF_NO_CHECKSUM_MAGIC) {
376
checksum = buf_calc_page_new_checksum(read_buf);
377
checksum_field = mach_read_from_4(read_buf
378
+ FIL_PAGE_SPACE_OR_CHKSUM);
380
/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
381
(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
383
if (checksum_field != 0 && checksum_field != checksum
384
&& checksum_field != BUF_NO_CHECKSUM_MAGIC) {
393
/************************************************************************
394
Prints a page to stderr. */
399
byte* read_buf) /* in: a database page */
405
ut_print_timestamp(stderr);
406
fprintf(stderr, " InnoDB: Page dump in ascii and hex (%lu bytes):\n",
407
(ulint)UNIV_PAGE_SIZE);
408
ut_print_buf(stderr, read_buf, UNIV_PAGE_SIZE);
409
fputs("InnoDB: End of page dump\n", stderr);
411
checksum = srv_use_checksums
412
? buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
413
old_checksum = srv_use_checksums
414
? buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
416
ut_print_timestamp(stderr);
418
" InnoDB: Page checksum %lu, prior-to-4.0.14-form"
420
"InnoDB: stored checksum %lu, prior-to-4.0.14-form"
421
" stored checksum %lu\n"
422
"InnoDB: Page lsn %lu %lu, low 4 bytes of lsn"
424
"InnoDB: Page number (if stored to page already) %lu,\n"
425
"InnoDB: space id (if created with >= MySQL-4.1.1"
426
" and stored already) %lu\n",
427
(ulong) checksum, (ulong) old_checksum,
428
(ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
429
(ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
430
- FIL_PAGE_END_LSN_OLD_CHKSUM),
431
(ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN),
432
(ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
433
(ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
434
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
435
(ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
436
(ulong) mach_read_from_4(read_buf
437
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
439
if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
440
== TRX_UNDO_INSERT) {
442
"InnoDB: Page may be an insert undo log page\n");
443
} else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR
444
+ TRX_UNDO_PAGE_TYPE)
445
== TRX_UNDO_UPDATE) {
447
"InnoDB: Page may be an update undo log page\n");
450
switch (fil_page_get_type(read_buf)) {
453
"InnoDB: Page may be an index page where"
454
" index id is %lu %lu\n",
455
(ulong) ut_dulint_get_high
456
(btr_page_get_index_id(read_buf)),
457
(ulong) ut_dulint_get_low
458
(btr_page_get_index_id(read_buf)));
460
/* If the code is in ibbackup, dict_sys may be uninitialized,
463
if (dict_sys != NULL) {
465
index = dict_index_find_on_id_low(
466
btr_page_get_index_id(read_buf));
468
fputs("InnoDB: (", stderr);
469
dict_index_name_print(stderr, NULL, index);
470
fputs(")\n", stderr);
475
fputs("InnoDB: Page may be an 'inode' page\n", stderr);
477
case FIL_PAGE_IBUF_FREE_LIST:
478
fputs("InnoDB: Page may be an insert buffer free list page\n",
481
case FIL_PAGE_TYPE_ALLOCATED:
482
fputs("InnoDB: Page may be a freshly allocated page\n",
485
case FIL_PAGE_IBUF_BITMAP:
486
fputs("InnoDB: Page may be an insert buffer bitmap page\n",
489
case FIL_PAGE_TYPE_SYS:
490
fputs("InnoDB: Page may be a system page\n",
493
case FIL_PAGE_TYPE_TRX_SYS:
494
fputs("InnoDB: Page may be a transaction system page\n",
497
case FIL_PAGE_TYPE_FSP_HDR:
498
fputs("InnoDB: Page may be a file space header page\n",
501
case FIL_PAGE_TYPE_XDES:
502
fputs("InnoDB: Page may be an extent descriptor page\n",
505
case FIL_PAGE_TYPE_BLOB:
506
fputs("InnoDB: Page may be a BLOB page\n",
512
/************************************************************************
513
Initializes a buffer control block when the buf_pool is created. */
518
buf_block_t* block, /* in: pointer to control block */
519
byte* frame) /* in: pointer to buffer frame, or NULL if in
520
the case of AWE there is no frame */
524
block->state = BUF_BLOCK_NOT_USED;
526
block->frame = frame;
528
block->awe_info = NULL;
530
block->buf_fix_count = 0;
533
block->modify_clock = ut_dulint_zero;
535
block->file_page_was_freed = FALSE;
537
block->check_index_page_at_flush = FALSE;
540
block->in_free_list = FALSE;
541
block->in_LRU_list = FALSE;
543
block->n_pointers = 0;
545
mutex_create(&block->mutex, SYNC_BUF_BLOCK);
547
rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
548
ut_ad(rw_lock_validate(&(block->lock)));
550
#ifdef UNIV_SYNC_DEBUG
551
rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK);
552
#endif /* UNIV_SYNC_DEBUG */
555
/************************************************************************
556
Creates the buffer pool. */
561
/* out, own: buf_pool object, NULL if not
562
enough memory or error */
563
ulint max_size, /* in: maximum size of the buf_pool in
565
ulint curr_size, /* in: current size to use, must be <=
566
max_size, currently must be equal to
568
ulint n_frames) /* in: number of frames; if AWE is used,
569
this is the size of the address space window
570
where physical memory pages are mapped; if
571
AWE is not used then this must be the same
578
ut_a(max_size == curr_size);
579
ut_a(srv_use_awe || n_frames == max_size);
581
if (n_frames > curr_size) {
583
"InnoDB: AWE: Error: you must specify in my.cnf"
584
" .._awe_mem_mb larger\n"
585
"InnoDB: than .._buffer_pool_size. Now the former"
587
"InnoDB: the latter %lu pages.\n",
588
(ulong) curr_size, (ulong) n_frames);
593
buf_pool = mem_alloc(sizeof(buf_pool_t));
595
/* 1. Initialize general fields
596
---------------------------- */
597
mutex_create(&buf_pool->mutex, SYNC_BUF_POOL);
599
mutex_enter(&(buf_pool->mutex));
602
/*----------------------------------------*/
603
/* Allocate the virtual address space window, i.e., the
604
buffer pool frames */
606
buf_pool->frame_mem = os_awe_allocate_virtual_mem_window(
607
UNIV_PAGE_SIZE * (n_frames + 1));
609
/* Allocate the physical memory for AWE and the AWE info array
612
if ((curr_size % ((1024 * 1024) / UNIV_PAGE_SIZE)) != 0) {
615
"InnoDB: AWE: Error: physical memory must be"
616
" allocated in full megabytes.\n"
617
"InnoDB: Trying to allocate %lu"
618
" database pages.\n",
624
if (!os_awe_allocate_physical_mem(&(buf_pool->awe_info),
627
/ UNIV_PAGE_SIZE))) {
631
/*----------------------------------------*/
633
buf_pool->frame_mem = os_mem_alloc_large(
634
UNIV_PAGE_SIZE * (n_frames + 1), TRUE, FALSE);
637
if (buf_pool->frame_mem == NULL) {
642
buf_pool->blocks = ut_malloc(sizeof(buf_block_t) * max_size);
644
if (buf_pool->blocks == NULL) {
649
buf_pool->max_size = max_size;
650
buf_pool->curr_size = curr_size;
652
buf_pool->n_frames = n_frames;
654
/* Align pointer to the first frame */
656
frame = ut_align(buf_pool->frame_mem, UNIV_PAGE_SIZE);
658
buf_pool->frame_zero = frame;
659
buf_pool->high_end = frame + UNIV_PAGE_SIZE * n_frames;
662
/*----------------------------------------*/
663
/* Map an initial part of the allocated physical memory to
666
os_awe_map_physical_mem_to_window(buf_pool->frame_zero,
669
/ OS_AWE_X86_PAGE_SIZE),
671
/*----------------------------------------*/
674
buf_pool->blocks_of_frames = ut_malloc(sizeof(void*) * n_frames);
676
if (buf_pool->blocks_of_frames == NULL) {
681
/* Init block structs and assign frames for them; in the case of
682
AWE there are fewer frames than blocks. Then we assign the frames
683
to the first blocks (we already mapped the memory above). We also
684
init the awe_info for every block. */
686
for (i = 0; i < max_size; i++) {
688
block = buf_pool_get_nth_block(buf_pool, i);
691
frame = buf_pool->frame_zero + i * UNIV_PAGE_SIZE;
692
*(buf_pool->blocks_of_frames + i) = block;
697
buf_block_init(block, frame);
700
/*----------------------------------------*/
701
block->awe_info = buf_pool->awe_info
702
+ i * (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE);
703
/*----------------------------------------*/
707
buf_pool->page_hash = hash_create(2 * max_size);
709
buf_pool->n_pend_reads = 0;
711
buf_pool->last_printout_time = time(NULL);
713
buf_pool->n_pages_read = 0;
714
buf_pool->n_pages_written = 0;
715
buf_pool->n_pages_created = 0;
716
buf_pool->n_pages_awe_remapped = 0;
718
buf_pool->n_page_gets = 0;
719
buf_pool->n_page_gets_old = 0;
720
buf_pool->n_pages_read_old = 0;
721
buf_pool->n_pages_written_old = 0;
722
buf_pool->n_pages_created_old = 0;
723
buf_pool->n_pages_awe_remapped_old = 0;
725
/* 2. Initialize flushing fields
726
---------------------------- */
727
UT_LIST_INIT(buf_pool->flush_list);
729
for (i = BUF_FLUSH_LRU; i <= BUF_FLUSH_LIST; i++) {
730
buf_pool->n_flush[i] = 0;
731
buf_pool->init_flush[i] = FALSE;
732
buf_pool->no_flush[i] = os_event_create(NULL);
735
buf_pool->LRU_flush_ended = 0;
737
buf_pool->ulint_clock = 1;
738
buf_pool->freed_page_clock = 0;
740
/* 3. Initialize LRU fields
741
---------------------------- */
742
UT_LIST_INIT(buf_pool->LRU);
744
buf_pool->LRU_old = NULL;
746
UT_LIST_INIT(buf_pool->awe_LRU_free_mapped);
748
/* Add control blocks to the free list */
749
UT_LIST_INIT(buf_pool->free);
751
for (i = 0; i < curr_size; i++) {
753
block = buf_pool_get_nth_block(buf_pool, i);
756
/* Wipe contents of frame to eliminate a Purify
760
memset(block->frame, '\0', UNIV_PAGE_SIZE);
763
/* Add to the list of blocks mapped to
766
UT_LIST_ADD_LAST(awe_LRU_free_mapped,
767
buf_pool->awe_LRU_free_mapped,
772
UT_LIST_ADD_LAST(free, buf_pool->free, block);
773
block->in_free_list = TRUE;
776
mutex_exit(&(buf_pool->mutex));
778
if (srv_use_adaptive_hash_indexes) {
779
btr_search_sys_create(curr_size * UNIV_PAGE_SIZE
780
/ sizeof(void*) / 64);
782
/* Create only a small dummy system */
783
btr_search_sys_create(1000);
789
/************************************************************************
790
Maps the page of block to a frame, if not mapped yet. Unmaps some page
791
from the end of the awe_LRU_free_mapped. */
794
buf_awe_map_page_to_frame(
795
/*======================*/
796
buf_block_t* block, /* in: block whose page should be
798
ibool add_to_mapped_list) /* in: TRUE if we in the case
799
we need to map the page should also
801
awe_LRU_free_mapped list */
805
ut_ad(mutex_own(&(buf_pool->mutex)));
813
/* Scan awe_LRU_free_mapped from the end and try to find a block
814
which is not bufferfixed or io-fixed */
816
bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped);
821
mutex_enter(&bck->mutex);
823
skip = (bck->state == BUF_BLOCK_FILE_PAGE
824
&& (bck->buf_fix_count != 0 || bck->io_fix != 0));
827
mutex_exit(&bck->mutex);
829
/* We have to skip this */
830
bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck);
832
/* We can map block to the frame of bck */
834
os_awe_map_physical_mem_to_window(
836
UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE,
839
block->frame = bck->frame;
841
*(buf_pool->blocks_of_frames
842
+ (((ulint)(block->frame
843
- buf_pool->frame_zero))
844
>> UNIV_PAGE_SIZE_SHIFT))
848
UT_LIST_REMOVE(awe_LRU_free_mapped,
849
buf_pool->awe_LRU_free_mapped,
852
if (add_to_mapped_list) {
855
buf_pool->awe_LRU_free_mapped,
859
buf_pool->n_pages_awe_remapped++;
861
mutex_exit(&bck->mutex);
868
"InnoDB: AWE: Fatal error: cannot find a page to unmap\n"
869
"InnoDB: awe_LRU_free_mapped list length %lu\n",
870
(ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
875
/************************************************************************
876
Allocates a buffer block. */
879
buf_block_alloc(void)
880
/*=================*/
881
/* out, own: the allocated block; also if AWE
882
is used it is guaranteed that the page is
887
block = buf_LRU_get_free_block();
892
/************************************************************************
893
Moves the block to the start of the LRU list if there is a danger
894
that the block would drift out of the buffer pool. */
897
buf_block_make_young(
898
/*=================*/
899
buf_block_t* block) /* in: block to make younger */
901
ut_ad(!mutex_own(&(buf_pool->mutex)));
903
/* Note that we read freed_page_clock's without holding any mutex:
904
this is allowed since the result is used only in heuristics */
906
if (buf_block_peek_if_too_old(block)) {
908
mutex_enter(&buf_pool->mutex);
909
/* There has been freeing activity in the LRU list:
910
best to move to the head of the LRU list */
912
buf_LRU_make_block_young(block);
913
mutex_exit(&buf_pool->mutex);
917
/************************************************************************
918
Moves a page to the start of the buffer pool LRU list. This high-level
919
function can be used to prevent an important page from slipping out of
925
buf_frame_t* frame) /* in: buffer frame of a file page */
929
mutex_enter(&(buf_pool->mutex));
931
block = buf_block_align(frame);
933
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
935
buf_LRU_make_block_young(block);
937
mutex_exit(&(buf_pool->mutex));
940
/************************************************************************
941
Frees a buffer block which does not contain a file page. */
946
buf_block_t* block) /* in, own: block to be freed */
948
mutex_enter(&(buf_pool->mutex));
950
mutex_enter(&block->mutex);
952
ut_a(block->state != BUF_BLOCK_FILE_PAGE);
954
buf_LRU_block_free_non_file_page(block);
956
mutex_exit(&block->mutex);
958
mutex_exit(&(buf_pool->mutex));
961
/*************************************************************************
962
Allocates a buffer frame. */
965
buf_frame_alloc(void)
966
/*=================*/
967
/* out: buffer frame */
969
return(buf_block_alloc()->frame);
972
/*************************************************************************
973
Frees a buffer frame which does not contain a file page. */
978
buf_frame_t* frame) /* in: buffer frame */
980
buf_block_free(buf_block_align(frame));
983
/************************************************************************
984
Returns the buffer control block if the page can be found in the buffer
985
pool. NOTE that it is possible that the page is not yet read
986
from disk, though. This is a very low-level function: use with care! */
991
/* out: control block if found from page hash table,
992
otherwise NULL; NOTE that the page is not necessarily
993
yet read from disk! */
994
ulint space, /* in: space id */
995
ulint offset) /* in: page number */
999
mutex_enter_fast(&(buf_pool->mutex));
1001
block = buf_page_hash_get(space, offset);
1003
mutex_exit(&(buf_pool->mutex));
1008
/************************************************************************
1009
Resets the check_index_page_at_flush field of a page if found in the buffer
1013
buf_reset_check_index_page_at_flush(
1014
/*================================*/
1015
ulint space, /* in: space id */
1016
ulint offset) /* in: page number */
1020
mutex_enter_fast(&(buf_pool->mutex));
1022
block = buf_page_hash_get(space, offset);
1025
block->check_index_page_at_flush = FALSE;
1028
mutex_exit(&(buf_pool->mutex));
1031
/************************************************************************
1032
Returns the current state of is_hashed of a page. FALSE if the page is
1033
not in the pool. NOTE that this operation does not fix the page in the
1034
pool if it is found there. */
1037
buf_page_peek_if_search_hashed(
1038
/*===========================*/
1039
/* out: TRUE if page hash index is built in search
1041
ulint space, /* in: space id */
1042
ulint offset) /* in: page number */
1047
mutex_enter_fast(&(buf_pool->mutex));
1049
block = buf_page_hash_get(space, offset);
1054
is_hashed = block->is_hashed;
1057
mutex_exit(&(buf_pool->mutex));
1062
/************************************************************************
1063
Returns TRUE if the page can be found in the buffer pool hash table. NOTE
1064
that it is possible that the page is not yet read from disk, though. */
1069
/* out: TRUE if found from page hash table,
1070
NOTE that the page is not necessarily yet read
1072
ulint space, /* in: space id */
1073
ulint offset) /* in: page number */
1075
if (buf_page_peek_block(space, offset)) {
1083
/************************************************************************
1084
Sets file_page_was_freed TRUE if the page is found in the buffer pool.
1085
This function should be called when we free a file page and want the
1086
debug version to check that it is not accessed any more unless
1090
buf_page_set_file_page_was_freed(
1091
/*=============================*/
1092
/* out: control block if found from page hash table,
1094
ulint space, /* in: space id */
1095
ulint offset) /* in: page number */
1099
mutex_enter_fast(&(buf_pool->mutex));
1101
block = buf_page_hash_get(space, offset);
1104
block->file_page_was_freed = TRUE;
1107
mutex_exit(&(buf_pool->mutex));
1112
/************************************************************************
1113
Sets file_page_was_freed FALSE if the page is found in the buffer pool.
1114
This function should be called when we free a file page and want the
1115
debug version to check that it is not accessed any more unless
1119
buf_page_reset_file_page_was_freed(
1120
/*===============================*/
1121
/* out: control block if found from page hash table,
1123
ulint space, /* in: space id */
1124
ulint offset) /* in: page number */
1128
mutex_enter_fast(&(buf_pool->mutex));
1130
block = buf_page_hash_get(space, offset);
1133
block->file_page_was_freed = FALSE;
1136
mutex_exit(&(buf_pool->mutex));
1141
/************************************************************************
1142
This is the general function used to get access to a database page. */
1147
/* out: pointer to the frame or NULL */
1148
ulint space, /* in: space id */
1149
ulint offset, /* in: page number */
1150
ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
1151
buf_frame_t* guess, /* in: guessed frame or NULL */
1152
ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL,
1153
BUF_GET_NO_LATCH, BUF_GET_NOWAIT */
1154
const char* file, /* in: file name */
1155
ulint line, /* in: line where called */
1156
mtr_t* mtr) /* in: mini-transaction */
1165
/* Debug assertions on the argument combination: the latch type and the
mode must be among the listed values, and BUF_GET_NO_LATCH may only be
used together with RW_NO_LATCH */
ut_ad((rw_latch == RW_S_LATCH)
1166
|| (rw_latch == RW_X_LATCH)
1167
|| (rw_latch == RW_NO_LATCH));
1168
ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
1169
ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
1170
|| (mode == BUF_GET_NO_LATCH) || (mode == BUF_GET_NOWAIT));
1171
#ifndef UNIV_LOG_DEBUG
1172
ut_ad(!ibuf_inside() || ibuf_page(space, offset));
1174
/* Statistics: the request is counted before the buffer pool mutex is
taken */
buf_pool->n_page_gets++;
1177
mutex_enter_fast(&(buf_pool->mutex));
1180
/* Start from the caller's guess: derive the control block from the
guessed frame */
block = buf_block_align(guess);
1182
/* The guess is compared with the requested space id and page number,
and the block must be in the file page state */
if ((offset != block->offset) || (space != block->space)
1183
|| (block->state != BUF_BLOCK_FILE_PAGE)) {
1189
if (block == NULL) {
1190
/* Look the page up in the page hash table */
block = buf_page_hash_get(space, offset);
1193
if (block == NULL) {
1194
/* Page not in buf_pool: needs to be read from file */
1196
mutex_exit(&(buf_pool->mutex));
1198
if (mode == BUF_GET_IF_IN_POOL) {
1203
buf_read_page(space, offset);
1208
/* Debug builds run buf_validate() whenever buf_dbg_counter is a
multiple of 37 */
if (buf_dbg_counter % 37 == 0) {
1209
ut_ad(buf_validate());
1215
/* block->state and block->io_fix are inspected under the block mutex */
mutex_enter(&block->mutex);
1217
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
1221
if (block->io_fix == BUF_IO_READ) {
1225
if (mode == BUF_GET_IF_IN_POOL) {
1226
/* The page is only being read to buffer */
1227
mutex_exit(&buf_pool->mutex);
1228
mutex_exit(&block->mutex);
1234
/* If AWE is enabled and the page is not mapped to a frame, then
1237
if (block->frame == NULL) {
1240
/* We set second parameter TRUE because the block is in the
1241
LRU list and we must put it to awe_LRU_free_mapped list once
1242
mapped to a frame */
1244
buf_awe_map_page_to_frame(block, TRUE);
1247
/* Buffer-fix the block; the UNIV_SYNC_DEBUG variant is also given the
caller's file and line. The count is decremented again further below in
the BUF_GET_NOWAIT branch */
#ifdef UNIV_SYNC_DEBUG
1248
buf_block_buf_fix_inc_debug(block, file, line);
1250
buf_block_buf_fix_inc(block);
1252
mutex_exit(&buf_pool->mutex);
1254
/* Check if this is the first access to the page */
1256
accessed = block->accessed;
1258
block->accessed = TRUE;
1260
mutex_exit(&block->mutex);
1262
/* Both the buffer pool mutex and the block mutex have been released
by now */
buf_block_make_young(block);
1264
#ifdef UNIV_DEBUG_FILE_ACCESSES
1265
ut_a(block->file_page_was_freed == FALSE);
1271
if (buf_dbg_counter % 5771 == 0) {
1272
ut_ad(buf_validate());
1275
ut_ad(block->buf_fix_count > 0);
1276
ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
1278
/* BUF_GET_NOWAIT: only non-blocking latch attempts are made; fix_type
records the memo type that is later passed to mtr_memo_push() */
if (mode == BUF_GET_NOWAIT) {
1279
if (rw_latch == RW_S_LATCH) {
1280
success = rw_lock_s_lock_func_nowait(&(block->lock),
1282
fix_type = MTR_MEMO_PAGE_S_FIX;
1284
ut_ad(rw_latch == RW_X_LATCH);
1285
success = rw_lock_x_lock_func_nowait(&(block->lock),
1287
fix_type = MTR_MEMO_PAGE_X_FIX;
1291
/* Take back the buffer-fix under the block mutex */
mutex_enter(&block->mutex);
1293
block->buf_fix_count--;
1295
mutex_exit(&block->mutex);
1296
#ifdef UNIV_SYNC_DEBUG
1297
rw_lock_s_unlock(&(block->debug_latch));
1302
} else if (rw_latch == RW_NO_LATCH) {
1305
/* Let us wait until the read operation
1309
mutex_enter(&block->mutex);
1311
if (block->io_fix == BUF_IO_READ) {
1313
mutex_exit(&block->mutex);
1315
os_thread_sleep(WAIT_FOR_READ);
1318
mutex_exit(&block->mutex);
1325
fix_type = MTR_MEMO_BUF_FIX;
1326
} else if (rw_latch == RW_S_LATCH) {
1328
rw_lock_s_lock_func(&(block->lock), 0, file, line);
1330
fix_type = MTR_MEMO_PAGE_S_FIX;
1332
rw_lock_x_lock_func(&(block->lock), 0, file, line);
1334
fix_type = MTR_MEMO_PAGE_X_FIX;
1337
mtr_memo_push(mtr, block, fix_type);
1340
/* In the case of a first access, try to apply linear
1343
buf_read_ahead_linear(space, offset);
1346
#ifdef UNIV_IBUF_DEBUG
1347
ut_a(ibuf_count_get(block->space, block->offset) == 0);
1349
return(block->frame);
1352
/************************************************************************
1353
This is the general function used to get optimistic access to a database
1357
buf_page_optimistic_get_func(
1358
/*=========================*/
1359
/* out: TRUE if success */
1360
ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
1361
buf_block_t* block, /* in: guessed buffer block */
1362
buf_frame_t* guess, /* in: guessed frame; note that AWE may move
1364
dulint modify_clock,/* in: modify clock value if mode is
1365
..._GUESS_ON_CLOCK */
1366
const char* file, /* in: file name */
1367
ulint line, /* in: line where called */
1368
mtr_t* mtr) /* in: mini-transaction */
1374
ut_ad(mtr && block);
1375
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
1377
/* If AWE is used, block may have a different frame now, e.g., NULL */
1379
/* The state and frame checks below are made under the block mutex */
mutex_enter(&block->mutex);
1381
if (UNIV_UNLIKELY(block->state != BUF_BLOCK_FILE_PAGE)
1382
|| UNIV_UNLIKELY(block->frame != guess)) {
1384
mutex_exit(&block->mutex);
1389
/* Buffer-fix the block; the UNIV_SYNC_DEBUG variant is also given the
caller's file and line */
#ifdef UNIV_SYNC_DEBUG
1390
buf_block_buf_fix_inc_debug(block, file, line);
1392
buf_block_buf_fix_inc(block);
1394
/* Remember the previous value of the accessed flag before setting it;
the old value is tested near the end of the function in connection with
linear read-ahead */
accessed = block->accessed;
1395
block->accessed = TRUE;
1397
mutex_exit(&block->mutex);
1399
buf_block_make_young(block);
1401
/* Check if this is the first access to the page */
1403
ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset));
1405
/* Try to take the requested latch without waiting */
if (rw_latch == RW_S_LATCH) {
1406
success = rw_lock_s_lock_func_nowait(&(block->lock),
1408
fix_type = MTR_MEMO_PAGE_S_FIX;
1410
success = rw_lock_x_lock_func_nowait(&(block->lock),
1412
fix_type = MTR_MEMO_PAGE_X_FIX;
1415
/* The latch was not obtained: take back the buffer-fix */
if (UNIV_UNLIKELY(!success)) {
1416
mutex_enter(&block->mutex);
1418
block->buf_fix_count--;
1420
mutex_exit(&block->mutex);
1422
#ifdef UNIV_SYNC_DEBUG
1423
rw_lock_s_unlock(&(block->debug_latch));
1428
/* The block's modify clock differs from the value supplied by the
caller: release the latch that was just taken and take back the
buffer-fix */
if (UNIV_UNLIKELY(!UT_DULINT_EQ(modify_clock, block->modify_clock))) {
1429
#ifdef UNIV_SYNC_DEBUG
1430
buf_page_dbg_add_level(block->frame, SYNC_NO_ORDER_CHECK);
1431
#endif /* UNIV_SYNC_DEBUG */
1432
if (rw_latch == RW_S_LATCH) {
1433
rw_lock_s_unlock(&(block->lock));
1435
rw_lock_x_unlock(&(block->lock));
1438
mutex_enter(&block->mutex);
1440
block->buf_fix_count--;
1442
mutex_exit(&block->mutex);
1444
#ifdef UNIV_SYNC_DEBUG
1445
rw_lock_s_unlock(&(block->debug_latch));
1450
/* Register the block and its fix type in the mini-transaction memo */
mtr_memo_push(mtr, block, fix_type);
1455
if (buf_dbg_counter % 5771 == 0) {
1456
ut_ad(buf_validate());
1459
ut_ad(block->buf_fix_count > 0);
1460
ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
1462
#ifdef UNIV_DEBUG_FILE_ACCESSES
1463
ut_a(block->file_page_was_freed == FALSE);
1465
if (UNIV_UNLIKELY(!accessed)) {
1466
/* In the case of a first access, try to apply linear
1469
buf_read_ahead_linear(buf_frame_get_space_id(guess),
1470
buf_frame_get_page_no(guess));
1473
#ifdef UNIV_IBUF_DEBUG
1474
ut_a(ibuf_count_get(block->space, block->offset) == 0);
1476
buf_pool->n_page_gets++;
1481
/************************************************************************
1482
This is used to get access to a known database page, when no waiting can be
1483
done. For example, if a search in an adaptive hash index leads us to this
1487
buf_page_get_known_nowait(
1488
/*======================*/
1489
/* out: TRUE if success */
1490
ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
1491
buf_frame_t* guess, /* in: the known page frame */
1492
ulint mode, /* in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
1493
const char* file, /* in: file name */
1494
ulint line, /* in: line where called */
1495
mtr_t* mtr) /* in: mini-transaction */
1502
ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
1504
/* Derive the control block from the known frame */
block = buf_block_align(guess);
1506
mutex_enter(&block->mutex);
1508
if (block->state == BUF_BLOCK_REMOVE_HASH) {
1509
/* Another thread is just freeing the block from the LRU list
1510
of the buffer pool: do not try to access this page; this
1511
attempt to access the page can only come through the hash
1512
index because when the buffer block state is ..._REMOVE_HASH,
1513
we have already removed it from the page address hash table
1514
of the buffer pool. */
1516
mutex_exit(&block->mutex);
1521
/* Apart from the REMOVE_HASH case handled above, the block is required
to be a file page */
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
1523
/* Buffer-fix the block; the UNIV_SYNC_DEBUG variant is also given the
caller's file and line */
#ifdef UNIV_SYNC_DEBUG
1524
buf_block_buf_fix_inc_debug(block, file, line);
1526
buf_block_buf_fix_inc(block);
1528
mutex_exit(&block->mutex);
1530
if (mode == BUF_MAKE_YOUNG) {
1531
buf_block_make_young(block);
1534
/* When ibuf_inside() holds, only BUF_KEEP_OLD is allowed */
ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD));
1536
/* Try to take the requested latch without waiting */
if (rw_latch == RW_S_LATCH) {
1537
success = rw_lock_s_lock_func_nowait(&(block->lock),
1539
fix_type = MTR_MEMO_PAGE_S_FIX;
1541
success = rw_lock_x_lock_func_nowait(&(block->lock),
1543
fix_type = MTR_MEMO_PAGE_X_FIX;
1547
/* Take back the buffer-fix under the block mutex */
mutex_enter(&block->mutex);
1549
block->buf_fix_count--;
1551
mutex_exit(&block->mutex);
1553
#ifdef UNIV_SYNC_DEBUG
1554
rw_lock_s_unlock(&(block->debug_latch));
1560
/* Register the block and its fix type in the mini-transaction memo */
mtr_memo_push(mtr, block, fix_type);
1565
if (buf_dbg_counter % 5771 == 0) {
1566
ut_ad(buf_validate());
1569
ut_ad(block->buf_fix_count > 0);
1570
ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
1571
#ifdef UNIV_DEBUG_FILE_ACCESSES
1572
ut_a(block->file_page_was_freed == FALSE);
1575
#ifdef UNIV_IBUF_DEBUG
1576
ut_a((mode == BUF_KEEP_OLD)
1577
|| (ibuf_count_get(block->space, block->offset) == 0));
1579
/* Statistics; NOTE(review): no buffer pool mutex acquisition is
visible in this function, so the counter appears to be updated without
it - confirm */
buf_pool->n_page_gets++;
1584
/************************************************************************
1585
Inits a page to the buffer buf_pool, for use in ibbackup --restore. */
1588
buf_page_init_for_backup_restore(
1589
/*=============================*/
1590
ulint space, /* in: space id */
1591
ulint offset, /* in: offset of the page within space
1592
in units of a page */
1593
buf_block_t* block) /* in: block to init */
1595
/* Set the state of the block */
1596
block->magic_n = BUF_BLOCK_MAGIC_N;
1598
block->state = BUF_BLOCK_FILE_PAGE;
1599
block->space = space;
1600
block->offset = offset;
1602
/* Here the lock hash value is simply zeroed, whereas the regular page
init in this file computes it with lock_rec_hash() */
block->lock_hash_val = 0;
1604
block->freed_page_clock = 0;
1606
/* No modification lsn is recorded for the block yet */
block->newest_modification = ut_dulint_zero;
1607
block->oldest_modification = ut_dulint_zero;
1609
/* The block starts out not accessed and not buffer-fixed */
block->accessed = FALSE;
1610
block->buf_fix_count = 0;
1613
/* Initial values for the hash bookkeeping fields (presumably used by
the adaptive hash index - confirm) */
block->n_hash_helps = 0;
1614
block->is_hashed = FALSE;
1615
block->n_fields = 1;
1617
block->left_side = TRUE;
1619
block->file_page_was_freed = FALSE;
1622
/************************************************************************
1623
Inits a page to the buffer buf_pool. */
1628
ulint space, /* in: space id */
1629
ulint offset, /* in: offset of the page within space
1630
in units of a page */
1631
buf_block_t* block) /* in: block to init */
1634
/* The caller must hold both the buffer pool mutex and the block mutex,
and the block must not already be in the file page state */
ut_ad(mutex_own(&(buf_pool->mutex)));
1635
ut_ad(mutex_own(&(block->mutex)));
1636
ut_a(block->state != BUF_BLOCK_FILE_PAGE);
1638
/* Set the state of the block */
1639
block->magic_n = BUF_BLOCK_MAGIC_N;
1641
block->state = BUF_BLOCK_FILE_PAGE;
1642
block->space = space;
1643
block->offset = offset;
1645
block->check_index_page_at_flush = FALSE;
1646
block->index = NULL;
1648
/* The lock hash value is derived from the page address */
block->lock_hash_val = lock_rec_hash(space, offset);
1650
#ifdef UNIV_DEBUG_VALGRIND
1652
/* Silence valid Valgrind warnings about uninitialized
1653
data being written to data files. There are some unused
1654
bytes on some pages that InnoDB does not initialize. */
1655
UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE);
1657
#endif /* UNIV_DEBUG_VALGRIND */
1659
/* Insert into the hash table of file pages */
1661
/* A page with this address already present in the hash table is
reported as an error */
if (buf_page_hash_get(space, offset)) {
1663
"InnoDB: Error: page %lu %lu already found"
1664
" in the hash table\n",
1672
#endif /* UNIV_DEBUG */
1676
/* The block is keyed by the fold value of (space, offset) */
HASH_INSERT(buf_block_t, hash, buf_pool->page_hash,
1677
buf_page_address_fold(space, offset), block);
1679
block->freed_page_clock = 0;
1681
/* No modification lsn is recorded for the block yet */
block->newest_modification = ut_dulint_zero;
1682
block->oldest_modification = ut_dulint_zero;
1684
/* The block starts out not accessed and not buffer-fixed */
block->accessed = FALSE;
1685
block->buf_fix_count = 0;
1688
/* Initial values for the hash bookkeeping fields (presumably used by
the adaptive hash index - confirm) */
block->n_hash_helps = 0;
1689
block->is_hashed = FALSE;
1690
block->n_fields = 1;
1692
block->left_side = TRUE;
1694
block->file_page_was_freed = FALSE;
1697
/************************************************************************
1698
Function which inits a page for read to the buffer buf_pool. If the page is
1699
(1) already in buf_pool, or
1700
(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
1701
(3) if the space is deleted or being deleted,
1702
then this function does nothing.
1703
Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
1704
on the buffer frame. The io-handler must take care that the flag is cleared
1705
and the lock released later. This is one of the functions which perform the
1706
state transition NOT_USED => FILE_PAGE on a block (the other is
1707
buf_page_create). */
1710
buf_page_init_for_read(
1711
/*===================*/
1712
/* out: pointer to the block or NULL */
1713
ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED */
1714
ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */
1715
ulint space, /* in: space id */
1716
ib_longlong tablespace_version,/* in: prevents reading from a wrong
1717
version of the tablespace in case we have done
1719
ulint offset) /* in: page number */
1728
if (mode == BUF_READ_IBUF_PAGES_ONLY) {
1729
/* It is a read-ahead within an ibuf routine */
1731
ut_ad(!ibuf_bitmap_page(offset));
1732
ut_ad(ibuf_inside());
1736
if (!ibuf_page_low(space, offset, &mtr)) {
1743
ut_ad(mode == BUF_READ_ANY_PAGE);
1746
/* The block is allocated before the mutexes are taken; it is handed
back with buf_block_free() below if the page turns out not to be
needed */
block = buf_block_alloc();
1750
mutex_enter(&(buf_pool->mutex));
1751
mutex_enter(&block->mutex);
1753
if (fil_tablespace_deleted_or_being_deleted_in_mem(
1754
space, tablespace_version)) {
1755
*err = DB_TABLESPACE_DELETED;
1758
if (*err == DB_TABLESPACE_DELETED
1759
|| NULL != buf_page_hash_get(space, offset)) {
1761
/* The page belongs to a space which has been
1762
deleted or is being deleted, or the page is
1763
already in buf_pool, return */
1765
mutex_exit(&block->mutex);
1766
mutex_exit(&(buf_pool->mutex));
1768
buf_block_free(block);
1770
if (mode == BUF_READ_IBUF_PAGES_ONLY) {
1780
buf_page_init(space, offset, block);
1782
/* The block must be put to the LRU list, to the old blocks */
1784
buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */
1786
/* Mark the read as pending, both on the block and in the pool-wide
counter */
block->io_fix = BUF_IO_READ;
1788
buf_pool->n_pend_reads++;
1790
/* We set a pass-type x-lock on the frame because then the same
1791
thread which called for the read operation (and is running now at
1792
this point of code) can wait for the read to complete by waiting
1793
for the x-lock on the frame; if the x-lock were recursive, the
1794
same thread would illegally get the x-lock before the page read
1795
is completed. The x-lock is cleared by the io-handler thread. */
1797
rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ);
1799
mutex_exit(&block->mutex);
1800
mutex_exit(&(buf_pool->mutex));
1802
if (mode == BUF_READ_IBUF_PAGES_ONLY) {
1810
/************************************************************************
1811
Initializes a page to the buffer buf_pool. The page is usually not read
1812
from a file even if it cannot be found in the buffer buf_pool. This is one
1813
of the functions which perform on a block the state transition NOT_USED =>
1814
FILE_PAGE (the other is buf_page_init_for_read above). */
1819
/* out: pointer to the frame, page bufferfixed */
1820
ulint space, /* in: space id */
1821
ulint offset, /* in: offset of the page within space in units of
1823
mtr_t* mtr) /* in: mini-transaction handle */
1827
buf_block_t* free_block = NULL;
1831
/* A free block is obtained before the buffer pool mutex is taken; it
is handed back with buf_block_free() below if the page is already in
the pool */
free_block = buf_LRU_get_free_block();
1833
mutex_enter(&(buf_pool->mutex));
1835
block = buf_page_hash_get(space, offset);
1837
if (block != NULL) {
1838
#ifdef UNIV_IBUF_DEBUG
1839
ut_a(ibuf_count_get(block->space, block->offset) == 0);
1841
/* Clear the freed mark on the block that was found */
block->file_page_was_freed = FALSE;
1843
/* Page can be found in buf_pool */
1844
mutex_exit(&(buf_pool->mutex));
1846
buf_block_free(free_block);
1848
frame = buf_page_get_with_no_latch(space, offset, mtr);
1853
/* If we get here, the page was not in buf_pool: init it there */
1856
if (buf_debug_prints) {
1857
fprintf(stderr, "Creating space %lu page %lu to buffer\n",
1858
(ulong) space, (ulong) offset);
1860
#endif /* UNIV_DEBUG */
1864
mutex_enter(&block->mutex);
1866
buf_page_init(space, offset, block);
1868
/* The block must be put to the LRU list */
1869
buf_LRU_add_block(block, FALSE);
1871
/* Buffer-fix the block; the UNIV_SYNC_DEBUG variant records this
source location */
#ifdef UNIV_SYNC_DEBUG
1872
buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__);
1874
buf_block_buf_fix_inc(block);
1876
buf_pool->n_pages_created++;
1878
mutex_exit(&(buf_pool->mutex));
1880
/* Register the buffer-fix in the mini-transaction memo */
mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
1882
block->accessed = TRUE;
1884
mutex_exit(&block->mutex);
1886
/* Delete possible entries for the page from the insert buffer:
1887
such can exist if the page belonged to an index which was dropped */
1889
ibuf_merge_or_delete_for_page(NULL, space, offset, TRUE);
1891
/* Flush pages from the end of the LRU list if necessary */
1892
buf_flush_free_margin();
1894
frame = block->frame;
1896
/* Fill the 4-byte prev and next page fields with 0xff bytes and stamp
the page type as FIL_PAGE_TYPE_ALLOCATED */
memset(frame + FIL_PAGE_PREV, 0xff, 4);
1897
memset(frame + FIL_PAGE_NEXT, 0xff, 4);
1898
mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
1900
/* Reset to zero the file flush lsn field in the page; if the first
1901
page of an ibdata file is 'created' in this function into the buffer
1902
pool then we lose the original contents of the file flush lsn stamp.
1903
Then InnoDB could in a crash recovery print a big, false, corruption
1904
warning if the stamp contains an lsn bigger than the ib_logfile lsn. */
1906
memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
1911
/* Debug builds run buf_validate() whenever buf_dbg_counter is a
multiple of 357 */
if (buf_dbg_counter % 357 == 0) {
1912
ut_ad(buf_validate());
1915
#ifdef UNIV_IBUF_DEBUG
1916
ut_a(ibuf_count_get(block->space, block->offset) == 0);
1921
/************************************************************************
1922
Completes an asynchronous read or write request of a file page to or from
1926
buf_page_io_complete(
1927
/*=================*/
1928
buf_block_t* block) /* in: pointer to the block in question */
1934
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
1936
/* We do not need to protect block->io_fix here with block->mutex to read
1937
it because this is the only function where we can change the value
1938
from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
1939
ensures that this is the only thread that handles the i/o for this
1942
io_type = block->io_fix;
1944
if (io_type == BUF_IO_READ) {
1945
/* If this page is not uninitialized and not in the
1946
doublewrite buffer, then the page number and space id
1947
should be the same as in block. */
1948
/* Read the page number and the space id stored in the page frame */
ulint read_page_no = mach_read_from_4(
1949
block->frame + FIL_PAGE_OFFSET);
1950
ulint read_space_id = mach_read_from_4(
1951
block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
1954
&& trx_doublewrite_page_inside(block->offset)) {
1956
ut_print_timestamp(stderr);
1958
" InnoDB: Error: reading page %lu\n"
1959
"InnoDB: which is in the"
1960
" doublewrite buffer!\n",
1961
(ulong) block->offset);
1962
} else if (!read_space_id && !read_page_no) {
1963
/* This is likely an uninitialized page. */
1964
} else if ((block->space && block->space != read_space_id)
1965
|| block->offset != read_page_no) {
1966
/* We did not compare space_id to read_space_id
1967
if block->space == 0, because the field on the
1968
page may contain garbage in MySQL < 4.1.1,
1969
which only supported block->space == 0. */
1971
ut_print_timestamp(stderr);
1973
" InnoDB: Error: space id and page n:o"
1974
" stored in the page\n"
1975
"InnoDB: read in are %lu:%lu,"
1976
" should be %lu:%lu!\n",
1977
(ulong) read_space_id, (ulong) read_page_no,
1978
(ulong) block->space, (ulong) block->offset);
1980
/* From version 3.23.38 up we store the page checksum
1981
to the 4 first bytes of the page end lsn field */
1983
if (buf_page_is_corrupted(block->frame)) {
1985
"InnoDB: Database page corruption on disk"
1987
"InnoDB: file read of page %lu.\n",
1988
(ulong) block->offset);
1990
fputs("InnoDB: You may have to recover"
1991
" from a backup.\n", stderr);
1993
buf_page_print(block->frame);
1996
"InnoDB: Database page corruption on disk"
1998
"InnoDB: file read of page %lu.\n",
1999
(ulong) block->offset);
2000
fputs("InnoDB: You may have to recover"
2001
" from a backup.\n", stderr);
2002
fputs("InnoDB: It is also possible that"
2004
"InnoDB: system has corrupted its"
2006
"InnoDB: and rebooting your computer"
2009
"InnoDB: If the corrupt page is an index page\n"
2010
"InnoDB: you can also try to"
2011
" fix the corruption\n"
2012
"InnoDB: by dumping, dropping,"
2013
" and reimporting\n"
2014
"InnoDB: the corrupt table."
2015
" You can use CHECK\n"
2016
"InnoDB: TABLE to scan your"
2017
" table for corruption.\n"
2019
" http://dev.mysql.com/doc/refman/5.1/en/"
2020
"forcing-recovery.html\n"
2021
"InnoDB: about forcing recovery.\n", stderr);
2023
/* Below the SRV_FORCE_IGNORE_CORRUPT level of srv_force_recovery, the
message about ending processing is printed */
if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
2024
fputs("InnoDB: Ending processing because of"
2025
" a corrupt database page.\n",
2031
if (recv_recovery_is_on()) {
2032
recv_recover_page(FALSE, TRUE, block->frame,
2033
block->space, block->offset);
2036
if (!recv_no_ibuf_operations) {
2037
ibuf_merge_or_delete_for_page(
2038
block->frame, block->space, block->offset,
2043
/* Both the buffer pool mutex and the block mutex are held for the
bookkeeping that follows */
mutex_enter(&(buf_pool->mutex));
2044
mutex_enter(&block->mutex);
2046
#ifdef UNIV_IBUF_DEBUG
2047
ut_a(ibuf_count_get(block->space, block->offset) == 0);
2049
/* Because this thread which does the unlocking is not the same that
2050
did the locking, we use a pass value != 0 in unlock, which simply
2051
removes the newest lock debug record, without checking the thread
2056
if (io_type == BUF_IO_READ) {
2057
/* NOTE that the call to ibuf may have moved the ownership of
2058
the x-latch to this OS thread: do not let this confuse you in
2061
ut_ad(buf_pool->n_pend_reads > 0);
2062
buf_pool->n_pend_reads--;
2063
buf_pool->n_pages_read++;
2065
rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ);
2068
if (buf_debug_prints) {
2069
fputs("Has read ", stderr);
2071
#endif /* UNIV_DEBUG */
2073
ut_ad(io_type == BUF_IO_WRITE);
2075
/* Write means a flush operation: call the completion
2076
routine in the flush system */
2078
buf_flush_write_complete(block);
2080
/* Release the s-latch on the block, passing BUF_IO_WRITE as the pass
value */
rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE);
2082
buf_pool->n_pages_written++;
2085
if (buf_debug_prints) {
2086
fputs("Has written ", stderr);
2088
#endif /* UNIV_DEBUG */
2091
mutex_exit(&block->mutex);
2092
mutex_exit(&(buf_pool->mutex));
2095
if (buf_debug_prints) {
2096
fprintf(stderr, "page space %lu page no %lu\n",
2097
(ulong) block->space, (ulong) block->offset);
2099
#endif /* UNIV_DEBUG */
2102
/*************************************************************************
2103
Invalidates the file pages in the buffer pool when an archive recovery is
2104
completed. All the file pages buffered must be in a replaceable state when
2105
this function is called: not latched and not modified. */
2108
buf_pool_invalidate(void)
2109
/*=====================*/
2113
/* Debug check of the precondition stated in the header */
ut_ad(buf_all_freed());
2118
/* Ask the LRU code to search for and free a block; the result is kept
in freed */
freed = buf_LRU_search_and_free_block(100);
2121
mutex_enter(&(buf_pool->mutex));
2123
/* At this point the LRU list is expected to be empty */
ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
2125
mutex_exit(&(buf_pool->mutex));
2129
/*************************************************************************
2130
Validates the buffer buf_pool data structure. */
2138
/* Tallies of blocks with a write pending, one per flush type */
ulint n_single_flush = 0;
2139
ulint n_lru_flush = 0;
2140
ulint n_list_flush = 0;
2148
mutex_enter(&(buf_pool->mutex));
2150
/* Visit every block of the pool, each under its own block mutex */
for (i = 0; i < buf_pool->curr_size; i++) {
2152
block = buf_pool_get_nth_block(buf_pool, i);
2154
mutex_enter(&block->mutex);
2156
if (block->state == BUF_BLOCK_FILE_PAGE) {
2158
/* A file page must be found in the page hash under its own
address */
ut_a(buf_page_hash_get(block->space,
2159
block->offset) == block);
2162
#ifdef UNIV_IBUF_DEBUG
2163
ut_a((block->io_fix == BUF_IO_READ)
2164
|| ibuf_count_get(block->space, block->offset)
2167
if (block->io_fix == BUF_IO_WRITE) {
2169
if (block->flush_type == BUF_FLUSH_LRU) {
2171
ut_a(rw_lock_is_locked(
2174
} else if (block->flush_type
2175
== BUF_FLUSH_LIST) {
2177
} else if (block->flush_type
2178
== BUF_FLUSH_SINGLE_PAGE) {
2184
} else if (block->io_fix == BUF_IO_READ) {
2186
ut_a(rw_lock_is_locked(&(block->lock),
2192
if (ut_dulint_cmp(block->oldest_modification,
2193
ut_dulint_zero) > 0) {
2197
} else if (block->state == BUF_BLOCK_NOT_USED) {
2201
mutex_exit(&block->mutex);
2204
/* Cross-check the tallies against the pool size, the list lengths and
the per-type flush counters */
if (n_lru + n_free > buf_pool->curr_size) {
2205
fprintf(stderr, "n LRU %lu, n free %lu\n",
2206
(ulong) n_lru, (ulong) n_free);
2210
ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
2211
if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
2212
fprintf(stderr, "Free list len %lu, free blocks %lu\n",
2213
(ulong) UT_LIST_GET_LEN(buf_pool->free),
2217
ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
2219
ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
2220
ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
2221
ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
2223
mutex_exit(&(buf_pool->mutex));
2225
/* The LRU and flush list validators are called after the buffer pool
mutex has been released */
ut_a(buf_LRU_validate());
2226
ut_a(buf_flush_validate());
2231
/*************************************************************************
2232
Prints info of the buffer buf_pool data structure. */
2246
dict_index_t* index;
2250
size = buf_pool->curr_size;
2252
/* Scratch arrays with one slot per buffer block */
index_ids = mem_alloc(sizeof(dulint) * size);
2253
counts = mem_alloc(sizeof(ulint) * size);
2255
mutex_enter(&(buf_pool->mutex));
2258
"buf_pool size %lu\n"
2259
"database pages %lu\n"
2261
"modified database pages %lu\n"
2262
"n pending reads %lu\n"
2263
"n pending flush LRU %lu list %lu single page %lu\n"
2264
"pages read %lu, created %lu, written %lu\n",
2266
(ulong) UT_LIST_GET_LEN(buf_pool->LRU),
2267
(ulong) UT_LIST_GET_LEN(buf_pool->free),
2268
(ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
2269
(ulong) buf_pool->n_pend_reads,
2270
(ulong) buf_pool->n_flush[BUF_FLUSH_LRU],
2271
(ulong) buf_pool->n_flush[BUF_FLUSH_LIST],
2272
(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE],
2273
/* NOTE(review): n_pages_created is passed without the (ulong) cast that
the neighbouring %lu arguments use */
(ulong) buf_pool->n_pages_read, buf_pool->n_pages_created,
2274
(ulong) buf_pool->n_pages_written);
2276
/* Count the number of blocks belonging to each index in the buffer */
2280
for (i = 0; i < size; i++) {
2281
frame = buf_pool_get_nth_block(buf_pool, i)->frame;
2283
/* Only frames whose page type is FIL_PAGE_INDEX are counted */
if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
2285
id = btr_page_get_index_id(frame);
2287
/* Look for the id in the index_ids array */
2290
while (j < n_found) {
2292
if (ut_dulint_cmp(index_ids[j], id) == 0) {
2308
mutex_exit(&(buf_pool->mutex));
2310
/* The per-index report is printed after the buffer pool mutex has been
released */
for (i = 0; i < n_found; i++) {
2311
index = dict_index_get_if_in_cache(index_ids[i]);
2314
"Block count for index %lu in buffer is about %lu",
2315
(ulong) ut_dulint_get_low(index_ids[i]),
2320
dict_index_name_print(stderr, NULL, index);
2326
mem_free(index_ids);
2329
ut_a(buf_validate());
2331
#endif /* UNIV_DEBUG */
2333
/*************************************************************************
2334
Returns the number of latched pages in the buffer pool. */
2337
buf_get_latched_pages_number(void)
2341
ulint fixed_pages_number = 0;
2343
mutex_enter(&(buf_pool->mutex));
2345
for (i = 0; i < buf_pool->curr_size; i++) {
2347
block = buf_pool_get_nth_block(buf_pool, i);
2349
/* Only blocks carrying the magic number are examined */
if (block->magic_n == BUF_BLOCK_MAGIC_N) {
2350
mutex_enter(&block->mutex);
2352
/* A block is counted when it is buffer-fixed or has a nonzero
io_fix */
if (block->buf_fix_count != 0 || block->io_fix != 0) {
2353
fixed_pages_number++;
2356
mutex_exit(&block->mutex);
2360
mutex_exit(&(buf_pool->mutex));
2362
return(fixed_pages_number);
2365
/*************************************************************************
2366
Returns the number of pending buf pool ios. */
2369
buf_get_n_pending_ios(void)
2370
/*=======================*/
2372
/* Sum of the pending reads and the n_flush counters of the three flush
types; NOTE(review): no buffer pool mutex acquisition is visible in this
function - confirm that an unlocked read is intended */
return(buf_pool->n_pend_reads
2373
+ buf_pool->n_flush[BUF_FLUSH_LRU]
2374
+ buf_pool->n_flush[BUF_FLUSH_LIST]
2375
+ buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
2378
/*************************************************************************
2379
Returns the ratio in percent of modified pages in the buffer pool /
2380
database pages in the buffer pool. */
2383
buf_get_modified_ratio_pct(void)
2384
/*============================*/
2388
mutex_enter(&(buf_pool->mutex));
2390
/* Numerator: 100 times the flush list length. Denominator: the LRU
list length plus the free list length, plus one */
ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
2391
/ (1 + UT_LIST_GET_LEN(buf_pool->LRU)
2392
+ UT_LIST_GET_LEN(buf_pool->free));
2394
/* 1 + is there to avoid division by zero */
2396
mutex_exit(&(buf_pool->mutex));
2401
/*************************************************************************
2402
Prints info of the buffer i/o. */
2407
FILE* file) /* in/out: file where to print */
2409
time_t current_time;
2410
double time_elapsed;
2414
size = buf_pool->curr_size;
2416
mutex_enter(&(buf_pool->mutex));
2420
"AWE: Buffer pool memory frames %lu\n",
2421
(ulong) buf_pool->n_frames);
2424
"AWE: Database pages and free buffers"
2425
" mapped in frames %lu\n",
2427
UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
2430
"Buffer pool size %lu\n"
2431
"Free buffers %lu\n"
2432
"Database pages %lu\n"
2433
"Modified db pages %lu\n"
2434
"Pending reads %lu\n"
2435
"Pending writes: LRU %lu, flush list %lu, single page %lu\n",
2437
(ulong) UT_LIST_GET_LEN(buf_pool->free),
2438
(ulong) UT_LIST_GET_LEN(buf_pool->LRU),
2439
(ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
2440
(ulong) buf_pool->n_pend_reads,
2441
(ulong) buf_pool->n_flush[BUF_FLUSH_LRU]
2442
+ buf_pool->init_flush[BUF_FLUSH_LRU],
2443
(ulong) buf_pool->n_flush[BUF_FLUSH_LIST]
2444
+ buf_pool->init_flush[BUF_FLUSH_LIST],
2445
(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
2447
/* The elapsed time is measured from the previous printout; 0.001 is
added to it (presumably so that it can safely be used as a divisor for
the per-second figures - confirm) */
current_time = time(NULL);
2448
time_elapsed = 0.001 + difftime(current_time,
2449
buf_pool->last_printout_time);
2450
buf_pool->last_printout_time = current_time;
2453
"Pages read %lu, created %lu, written %lu\n"
2454
"%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
2455
(ulong) buf_pool->n_pages_read,
2456
(ulong) buf_pool->n_pages_created,
2457
(ulong) buf_pool->n_pages_written,
2458
(buf_pool->n_pages_read - buf_pool->n_pages_read_old)
2460
(buf_pool->n_pages_created - buf_pool->n_pages_created_old)
2462
(buf_pool->n_pages_written - buf_pool->n_pages_written_old)
2466
fprintf(file, "AWE: %.2f page remaps/s\n",
2467
(buf_pool->n_pages_awe_remapped
2468
- buf_pool->n_pages_awe_remapped_old)
2472
/* The hit rate is reported per 1000 page gets: 1000 minus the pages
read per thousand page gets since the previous printout */
if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) {
2473
fprintf(file, "Buffer pool hit rate %lu / 1000\n",
2475
(1000 - ((1000 * (buf_pool->n_pages_read
2476
- buf_pool->n_pages_read_old))
2477
/ (buf_pool->n_page_gets
2478
- buf_pool->n_page_gets_old))));
2480
fputs("No buffer pool page gets since the last printout\n",
2484
/* Store the current counters as the baseline for the next printout */
buf_pool->n_page_gets_old = buf_pool->n_page_gets;
2485
buf_pool->n_pages_read_old = buf_pool->n_pages_read;
2486
buf_pool->n_pages_created_old = buf_pool->n_pages_created;
2487
buf_pool->n_pages_written_old = buf_pool->n_pages_written;
2488
buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
2490
mutex_exit(&(buf_pool->mutex));
2493
/**************************************************************************
2494
Refreshes the statistics used to print per-second averages. */
2497
buf_refresh_io_stats(void)
2498
/*======================*/
2500
/* Stamp the current time and copy each live counter into its _old
counterpart, so that later differences are measured from this moment */
buf_pool->last_printout_time = time(NULL);
2501
buf_pool->n_page_gets_old = buf_pool->n_page_gets;
2502
buf_pool->n_pages_read_old = buf_pool->n_pages_read;
2503
buf_pool->n_pages_created_old = buf_pool->n_pages_created;
2504
buf_pool->n_pages_written_old = buf_pool->n_pages_written;
2505
buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
2508
/*************************************************************************
2509
Checks that all file pages in the buffer are in a replaceable state. */
2520
mutex_enter(&(buf_pool->mutex));
2522
/* Visit every block of the pool, each under its own block mutex */
for (i = 0; i < buf_pool->curr_size; i++) {
2524
block = buf_pool_get_nth_block(buf_pool, i);
2526
mutex_enter(&block->mutex);
2528
if (block->state == BUF_BLOCK_FILE_PAGE) {
2530
/* A file page that the flush code does not consider ready for
replacement is reported with its space id and page number */
if (!buf_flush_ready_for_replace(block)) {
2533
"Page %lu %lu still fixed or dirty\n",
2534
(ulong) block->space,
2535
(ulong) block->offset);
2540
mutex_exit(&block->mutex);
2543
mutex_exit(&(buf_pool->mutex));
2548
/*************************************************************************
2549
Checks that there currently are no pending i/o-operations for the buffer
2553
buf_pool_check_no_pending_io(void)
2554
/*==============================*/
2555
/* out: TRUE if there is no pending i/o */
2559
mutex_enter(&(buf_pool->mutex));
2561
/* A nonzero sum means that at least one read or flush write of some
type is still pending */
if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU]
2562
+ buf_pool->n_flush[BUF_FLUSH_LIST]
2563
+ buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]) {
2569
mutex_exit(&(buf_pool->mutex));
2574
/*************************************************************************
2575
Gets the current length of the free list of buffer blocks. */
2578
buf_get_free_list_len(void)
2579
/*=======================*/
2583
mutex_enter(&(buf_pool->mutex));
2585
len = UT_LIST_GET_LEN(buf_pool->free);
2587
mutex_exit(&(buf_pool->mutex));