/*****************************************************************************

Copyright (C) 2006, 2010, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
St, Fifth Floor, Boston, MA 02110-1301 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0buddy.c
Binary buddy allocator for compressed pages

Created December 2006 by Marko Makela
*******************************************************/
#include "buf0buddy.h"
29
# include "buf0buddy.ic"
37
/**********************************************************************//**
38
Get the offset of the buddy of a compressed page frame.
39
@return the buddy relative of page */
44
byte* page, /*!< in: compressed page */
45
ulint size) /*!< in: page size in bytes */
47
ut_ad(ut_is_2pow(size));
48
ut_ad(size >= BUF_BUDDY_LOW);
49
ut_ad(size < BUF_BUDDY_HIGH);
50
ut_ad(!ut_align_offset(page, size));
52
if (((ulint) page) & size) {
59
/**********************************************************************//**
60
Add a block to the head of the appropriate buddy free list. */
63
buf_buddy_add_to_free(
64
/*==================*/
65
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
66
buf_page_t* bpage, /*!< in,own: block to be freed */
67
ulint i) /*!< in: index of
68
buf_pool->zip_free[] */
70
#ifdef UNIV_DEBUG_VALGRIND
71
buf_page_t* b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
73
if (b) UNIV_MEM_VALID(b, BUF_BUDDY_LOW << i);
74
#endif /* UNIV_DEBUG_VALGRIND */
76
ut_ad(buf_pool_mutex_own(buf_pool));
77
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
78
ut_ad(buf_pool->zip_free[i].start != bpage);
79
UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], bpage);
81
#ifdef UNIV_DEBUG_VALGRIND
82
if (b) UNIV_MEM_FREE(b, BUF_BUDDY_LOW << i);
83
UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i);
84
#endif /* UNIV_DEBUG_VALGRIND */
87
/**********************************************************************//**
88
Remove a block from the appropriate buddy free list. */
91
buf_buddy_remove_from_free(
92
/*=======================*/
93
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
94
buf_page_t* bpage, /*!< in: block to be removed */
95
ulint i) /*!< in: index of
96
buf_pool->zip_free[] */
98
#ifdef UNIV_DEBUG_VALGRIND
99
buf_page_t* prev = UT_LIST_GET_PREV(list, bpage);
100
buf_page_t* next = UT_LIST_GET_NEXT(list, bpage);
102
if (prev) UNIV_MEM_VALID(prev, BUF_BUDDY_LOW << i);
103
if (next) UNIV_MEM_VALID(next, BUF_BUDDY_LOW << i);
105
ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE);
106
ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE);
107
#endif /* UNIV_DEBUG_VALGRIND */
109
ut_ad(buf_pool_mutex_own(buf_pool));
110
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
111
UT_LIST_REMOVE(list, buf_pool->zip_free[i], bpage);
113
#ifdef UNIV_DEBUG_VALGRIND
114
if (prev) UNIV_MEM_FREE(prev, BUF_BUDDY_LOW << i);
115
if (next) UNIV_MEM_FREE(next, BUF_BUDDY_LOW << i);
116
#endif /* UNIV_DEBUG_VALGRIND */
119
/**********************************************************************//**
120
Try to allocate a block from buf_pool->zip_free[].
121
@return allocated block, or NULL if buf_pool->zip_free[] was empty */
126
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
127
ulint i) /*!< in: index of buf_pool->zip_free[] */
131
ut_ad(buf_pool_mutex_own(buf_pool));
132
ut_a(i < BUF_BUDDY_SIZES);
134
#ifndef UNIV_DEBUG_VALGRIND
135
/* Valgrind would complain about accessing free memory. */
136
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i],
137
ut_ad(buf_page_get_state(ut_list_node_313)
138
== BUF_BLOCK_ZIP_FREE)));
139
#endif /* !UNIV_DEBUG_VALGRIND */
140
bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
143
UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
144
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
146
buf_buddy_remove_from_free(buf_pool, bpage, i);
147
} else if (i + 1 < BUF_BUDDY_SIZES) {
148
/* Attempt to split. */
149
bpage = buf_buddy_alloc_zip(buf_pool, i + 1);
152
buf_page_t* buddy = (buf_page_t*)
153
(((char*) bpage) + (BUF_BUDDY_LOW << i));
155
ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
156
ut_d(memset(buddy, i, BUF_BUDDY_LOW << i));
157
buddy->state = BUF_BLOCK_ZIP_FREE;
158
buf_buddy_add_to_free(buf_pool, buddy, i);
164
memset(bpage, ~i, BUF_BUDDY_LOW << i);
166
#endif /* UNIV_DEBUG */
168
UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i);
173
/**********************************************************************//**
174
Deallocate a buffer frame of UNIV_PAGE_SIZE. */
177
buf_buddy_block_free(
178
/*=================*/
179
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
180
void* buf) /*!< in: buffer frame to deallocate */
182
const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
186
ut_ad(buf_pool_mutex_own(buf_pool));
187
ut_ad(!mutex_own(&buf_pool->zip_mutex));
188
ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE));
190
HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
191
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY
192
&& bpage->in_zip_hash && !bpage->in_page_hash),
193
((buf_block_t*) bpage)->frame == buf);
195
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY);
196
ut_ad(!bpage->in_page_hash);
197
ut_ad(bpage->in_zip_hash);
198
ut_d(bpage->in_zip_hash = FALSE);
199
HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage);
201
ut_d(memset(buf, 0, UNIV_PAGE_SIZE));
202
UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE);
204
block = (buf_block_t*) bpage;
205
mutex_enter(&block->mutex);
206
buf_LRU_block_free_non_file_page(block);
207
mutex_exit(&block->mutex);
209
ut_ad(buf_pool->buddy_n_frames > 0);
210
ut_d(buf_pool->buddy_n_frames--);
213
/**********************************************************************//**
214
Allocate a buffer block to the buddy allocator. */
217
buf_buddy_block_register(
218
/*=====================*/
219
buf_block_t* block) /*!< in: buffer frame to allocate */
221
buf_pool_t* buf_pool = buf_pool_from_block(block);
222
const ulint fold = BUF_POOL_ZIP_FOLD(block);
223
ut_ad(buf_pool_mutex_own(buf_pool));
224
ut_ad(!mutex_own(&buf_pool->zip_mutex));
225
ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
227
buf_block_set_state(block, BUF_BLOCK_MEMORY);
230
ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE));
232
ut_ad(!block->page.in_page_hash);
233
ut_ad(!block->page.in_zip_hash);
234
ut_d(block->page.in_zip_hash = TRUE);
235
HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
237
ut_d(buf_pool->buddy_n_frames++);
240
/**********************************************************************//**
241
Allocate a block from a bigger object.
242
@return allocated block */
245
buf_buddy_alloc_from(
246
/*=================*/
247
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
248
void* buf, /*!< in: a block that is free to use */
249
ulint i, /*!< in: index of
250
buf_pool->zip_free[] */
251
ulint j) /*!< in: size of buf as an index
252
of buf_pool->zip_free[] */
254
ulint offs = BUF_BUDDY_LOW << j;
255
ut_ad(j <= BUF_BUDDY_SIZES);
257
ut_ad(!ut_align_offset(buf, offs));
259
/* Add the unused parts of the block to the free lists. */
266
bpage = (buf_page_t*) ((byte*) buf + offs);
267
ut_d(memset(bpage, j, BUF_BUDDY_LOW << j));
268
bpage->state = BUF_BLOCK_ZIP_FREE;
269
#ifndef UNIV_DEBUG_VALGRIND
270
/* Valgrind would complain about accessing free memory. */
271
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i],
272
ut_ad(buf_page_get_state(
274
== BUF_BLOCK_ZIP_FREE)));
275
#endif /* !UNIV_DEBUG_VALGRIND */
276
buf_buddy_add_to_free(buf_pool, bpage, j);
282
/**********************************************************************//**
283
Allocate a block. The thread calling this function must hold
284
buf_pool->mutex and must not hold buf_pool_zip_mutex or any block->mutex.
285
The buf_pool->mutex may only be released and reacquired if lru != NULL.
286
@return allocated block, possibly NULL if lru==NULL */
291
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
292
ulint i, /*!< in: index of buf_pool->zip_free[],
293
or BUF_BUDDY_SIZES */
294
ibool* lru) /*!< in: pointer to a variable that
295
will be assigned TRUE if storage was
296
allocated from the LRU list and
297
buf_pool->mutex was temporarily
298
released, or NULL if the LRU list
299
should not be used */
303
ut_ad(buf_pool_mutex_own(buf_pool));
304
ut_ad(!mutex_own(&buf_pool->zip_mutex));
306
if (i < BUF_BUDDY_SIZES) {
307
/* Try to allocate from the buddy system. */
308
block = buf_buddy_alloc_zip(buf_pool, i);
315
/* Try allocating from the buf_pool->free list. */
316
block = buf_LRU_get_free_only(buf_pool);
328
/* Try replacing an uncompressed page in the buffer pool. */
329
buf_pool_mutex_exit(buf_pool);
330
block = buf_LRU_get_free_block(buf_pool, 0);
332
buf_pool_mutex_enter(buf_pool);
335
buf_buddy_block_register(block);
337
block = buf_buddy_alloc_from(
338
buf_pool, block->frame, i, BUF_BUDDY_SIZES);
341
buf_pool->buddy_stat[i].used++;
345
/**********************************************************************//**
346
Try to relocate the control block of a compressed page.
347
@return TRUE if relocated */
350
buf_buddy_relocate_block(
351
/*=====================*/
352
buf_page_t* bpage, /*!< in: block to relocate */
353
buf_page_t* dpage) /*!< in: free block to relocate to */
356
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
358
ut_ad(buf_pool_mutex_own(buf_pool));
360
switch (buf_page_get_state(bpage)) {
361
case BUF_BLOCK_ZIP_FREE:
362
case BUF_BLOCK_NOT_USED:
363
case BUF_BLOCK_READY_FOR_USE:
364
case BUF_BLOCK_FILE_PAGE:
365
case BUF_BLOCK_MEMORY:
366
case BUF_BLOCK_REMOVE_HASH:
368
case BUF_BLOCK_ZIP_DIRTY:
369
/* Cannot relocate dirty pages. */
372
case BUF_BLOCK_ZIP_PAGE:
376
mutex_enter(&buf_pool->zip_mutex);
378
if (!buf_page_can_relocate(bpage)) {
379
mutex_exit(&buf_pool->zip_mutex);
383
buf_relocate(bpage, dpage);
384
ut_d(bpage->state = BUF_BLOCK_ZIP_FREE);
386
/* relocate buf_pool->zip_clean */
387
b = UT_LIST_GET_PREV(list, dpage);
388
UT_LIST_REMOVE(list, buf_pool->zip_clean, dpage);
391
UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, dpage);
393
UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage);
396
UNIV_MEM_INVALID(bpage, sizeof *bpage);
398
mutex_exit(&buf_pool->zip_mutex);
402
/**********************************************************************//**
403
Try to relocate a block.
404
@return TRUE if relocated */
409
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
410
void* src, /*!< in: block to relocate */
411
void* dst, /*!< in: free block to relocate to */
412
ulint i) /*!< in: index of
413
buf_pool->zip_free[] */
418
const ulint size = BUF_BUDDY_LOW << i;
419
ullint usec = ut_time_us(NULL);
421
ut_ad(buf_pool_mutex_own(buf_pool));
422
ut_ad(!mutex_own(&buf_pool->zip_mutex));
423
ut_ad(!ut_align_offset(src, size));
424
ut_ad(!ut_align_offset(dst, size));
425
UNIV_MEM_ASSERT_W(dst, size);
427
/* We assume that all memory from buf_buddy_alloc()
428
is used for either compressed pages or buf_page_t
429
objects covering compressed pages. */
431
/* We look inside the allocated objects returned by
432
buf_buddy_alloc() and assume that anything of
433
PAGE_ZIP_MIN_SIZE or larger is a compressed page that contains
434
a valid space_id and page_no in the page header. Should the
435
fields be invalid, we will be unable to relocate the block.
436
We also assume that anything that fits sizeof(buf_page_t)
437
actually is a properly initialized buf_page_t object. */
439
if (size >= PAGE_ZIP_MIN_SIZE) {
440
/* This is a compressed page. */
443
/* The src block may be split into smaller blocks,
444
some of which may be free. Thus, the
445
mach_read_from_4() calls below may attempt to read
446
from free memory. The memory is "owned" by the buddy
447
allocator (and it has been allocated from the buffer
448
pool), so there is nothing wrong about this. The
449
mach_read_from_4() calls here will only trigger bogus
450
Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */
451
space = mach_read_from_4(
452
(const byte*) src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
453
page_no = mach_read_from_4(
454
(const byte*) src + FIL_PAGE_OFFSET);
455
/* Suppress Valgrind warnings about conditional jump
456
on uninitialized value. */
457
UNIV_MEM_VALID(&space, sizeof space);
458
UNIV_MEM_VALID(&page_no, sizeof page_no);
459
bpage = buf_page_hash_get(buf_pool, space, page_no);
461
if (!bpage || bpage->zip.data != src) {
462
/* The block has probably been freshly
463
allocated by buf_LRU_get_free_block() but not
464
added to buf_pool->page_hash yet. Obviously,
465
it cannot be relocated. */
470
ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
472
if (page_zip_get_size(&bpage->zip) != size) {
473
/* The block is of different size. We would
474
have to relocate all blocks covered by src.
475
For the sake of simplicity, give up. */
476
ut_ad(page_zip_get_size(&bpage->zip) < size);
481
/* The block must have been allocated, but it may
482
contain uninitialized data. */
483
UNIV_MEM_ASSERT_W(src, size);
485
mutex = buf_page_get_mutex(bpage);
489
if (buf_page_can_relocate(bpage)) {
490
/* Relocate the compressed page. */
491
ut_a(bpage->zip.data == src);
492
memcpy(dst, src, size);
493
bpage->zip.data = dst;
496
UNIV_MEM_INVALID(src, size);
498
buf_buddy_stat_t* buddy_stat
499
= &buf_pool->buddy_stat[i];
500
buddy_stat->relocated++;
501
buddy_stat->relocated_usec
502
+= ut_time_us(NULL) - usec;
508
} else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) {
509
/* This must be a buf_page_t object. */
510
#if UNIV_WORD_SIZE == 4
511
/* On 32-bit systems, there is no padding in
512
buf_page_t. On other systems, Valgrind could complain
513
about uninitialized pad bytes. */
514
UNIV_MEM_ASSERT_RW(src, size);
516
if (buf_buddy_relocate_block(src, dst)) {
525
/**********************************************************************//**
526
Deallocate a block. */
531
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
532
void* buf, /*!< in: block to be freed, must not be
533
pointed to by the buffer pool */
534
ulint i) /*!< in: index of buf_pool->zip_free[],
535
or BUF_BUDDY_SIZES */
540
ut_ad(buf_pool_mutex_own(buf_pool));
541
ut_ad(!mutex_own(&buf_pool->zip_mutex));
542
ut_ad(i <= BUF_BUDDY_SIZES);
543
ut_ad(buf_pool->buddy_stat[i].used > 0);
545
buf_pool->buddy_stat[i].used--;
547
UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
548
ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE);
550
if (i == BUF_BUDDY_SIZES) {
551
buf_buddy_block_free(buf_pool, buf);
555
ut_ad(i < BUF_BUDDY_SIZES);
556
ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
557
ut_ad(!buf_pool_contains_zip(buf_pool, buf));
559
/* Try to combine adjacent blocks. */
561
buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i);
563
#ifndef UNIV_DEBUG_VALGRIND
564
/* Valgrind would complain about accessing free memory. */
566
if (buddy->state != BUF_BLOCK_ZIP_FREE) {
571
/* The field buddy->state can only be trusted for free blocks.
572
If buddy->state == BUF_BLOCK_ZIP_FREE, the block is free if
573
it is in the free list. */
574
#endif /* !UNIV_DEBUG_VALGRIND */
576
for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) {
577
UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
578
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
580
if (bpage == buddy) {
582
/* The buddy is free: recombine */
583
buf_buddy_remove_from_free(buf_pool, bpage, i);
585
ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE);
586
ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
588
buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
596
buf_page_t* next = UT_LIST_GET_NEXT(list, bpage);
597
UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i);
602
#ifndef UNIV_DEBUG_VALGRIND
604
/* Valgrind would complain about accessing free memory. */
605
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i],
606
ut_ad(buf_page_get_state(ut_list_node_313)
607
== BUF_BLOCK_ZIP_FREE)));
608
#endif /* UNIV_DEBUG_VALGRIND */
610
/* The buddy is not free. Is there a free block of this size? */
611
bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
614
/* Remove the block from the free list, because a successful
615
buf_buddy_relocate() will overwrite bpage->list. */
617
UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
618
buf_buddy_remove_from_free(buf_pool, bpage, i);
620
/* Try to relocate the buddy of buf to the free block. */
621
if (buf_buddy_relocate(buf_pool, buddy, bpage, i)) {
623
ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
627
buf_buddy_add_to_free(buf_pool, bpage, i);
629
/* Try to relocate the buddy of the free block to buf. */
630
buddy = (buf_page_t*) buf_buddy_get(((byte*) bpage),
633
#ifndef UNIV_DEBUG_VALGRIND
634
/* Valgrind would complain about accessing free memory. */
636
/* The buddy must not be (completely) free, because we
637
always recombine adjacent free blocks.
639
(Parts of the buddy can be free in
640
buf_pool->zip_free[j] with j < i.) */
641
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i],
642
ut_ad(buf_page_get_state(
644
== BUF_BLOCK_ZIP_FREE
645
&& ut_list_node_313 != buddy)));
646
#endif /* !UNIV_DEBUG_VALGRIND */
648
if (buf_buddy_relocate(buf_pool, buddy, buf, i)) {
651
UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
652
ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
657
/* Free the block to the buddy list. */
660
if (i < buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)) {
661
/* This area has most likely been allocated for at
662
least one compressed-only block descriptor. Check
663
that there are no live objects in the area. This is
664
not a complete check: it may yield false positives as
665
well as false negatives. Also, due to buddy blocks
666
being recombined, it is possible (although unlikely)
667
that this branch is never reached. */
671
# ifndef UNIV_DEBUG_VALGRIND
672
/* Valgrind would complain about accessing
673
uninitialized memory. Besides, Valgrind performs a
674
more exhaustive check, at every memory access. */
675
const buf_page_t* b = buf;
676
const buf_page_t* const b_end = (buf_page_t*)
677
((char*) b + (BUF_BUDDY_LOW << i));
679
for (; b < b_end; b++) {
680
/* Avoid false positives (and cause false
681
negatives) by checking for b->space < 1000. */
683
if ((b->state == BUF_BLOCK_ZIP_PAGE
684
|| b->state == BUF_BLOCK_ZIP_DIRTY)
685
&& b->space > 0 && b->space < 1000) {
687
"buddy dirty %p %u (%u,%u) %p,%lu\n",
689
b->state, b->space, b->offset,
693
# endif /* !UNIV_DEBUG_VALGRIND */
695
/* Scramble the block. This should make any pointers
696
invalid and trigger a segmentation violation. Because
697
the scrambling can be reversed, it may be possible to
698
track down the object pointing to the freed data by
699
dereferencing the unscrambled bpage->LRU or
700
bpage->list pointers. */
701
for (c = (char*) buf + (BUF_BUDDY_LOW << i);
702
c-- > (char*) buf; ) {
706
/* Fill large blocks with a constant pattern. */
707
memset(bpage, i, BUF_BUDDY_LOW << i);
709
#endif /* UNIV_DEBUG */
710
bpage->state = BUF_BLOCK_ZIP_FREE;
711
buf_buddy_add_to_free(buf_pool, bpage, i);