641.1.2
by Monty Taylor
Imported 1.0.1 with clean - with no changes. |
1 |
/******************************************************
|
2 |
The database buffer buf_pool flush algorithm
|
|
3 |
||
4 |
(c) 1995-2001 Innobase Oy
|
|
5 |
||
6 |
Created 11/11/1995 Heikki Tuuri
|
|
7 |
*******************************************************/
|
|
8 |
||
9 |
#include "buf0flu.h" |
|
10 |
||
11 |
#ifdef UNIV_NONINL
|
|
12 |
#include "buf0flu.ic" |
|
13 |
#include "trx0sys.h" |
|
14 |
#endif
|
|
15 |
||
16 |
#include "ut0byte.h" |
|
17 |
#include "ut0lst.h" |
|
18 |
#include "page0page.h" |
|
19 |
#include "page0zip.h" |
|
20 |
#include "fil0fil.h" |
|
21 |
#include "buf0buf.h" |
|
22 |
#include "buf0lru.h" |
|
23 |
#include "buf0rea.h" |
|
24 |
#include "ibuf0ibuf.h" |
|
25 |
#include "log0log.h" |
|
26 |
#include "os0file.h" |
|
27 |
#include "trx0sys.h" |
|
28 |
#include "srv0srv.h" |
|
29 |
||
30 |
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
|
|
31 |
/**********************************************************************
|
|
32 |
Validates the flush list. */
|
|
33 |
static
|
|
34 |
ibool
|
|
35 |
buf_flush_validate_low(void); |
|
36 |
/*========================*/
|
|
37 |
/* out: TRUE if ok */
|
|
38 |
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
|
39 |
||
40 |
/************************************************************************
Inserts a modified block into the flush list. The caller must hold the
buf_pool mutex, and the block's oldest_modification must be at least as
new as the current head of the flush list (the list is kept ordered by
descending oldest_modification). */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
	buf_page_t*	bpage)	/* in: block which is modified */
{
	ut_ad(buf_pool_mutex_own());
	/* The flush list is ordered by oldest_modification, newest first;
	adding at the front must preserve that order. */
	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
		  <= bpage->oldest_modification));

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		/* A clean compressed-only page becomes dirty: promote it
		to BUF_BLOCK_ZIP_DIRTY and take it off the zip_clean list. */
		mutex_enter(&buf_pool_zip_mutex);
		buf_page_set_state(bpage, BUF_BLOCK_ZIP_DIRTY);
		mutex_exit(&buf_pool_zip_mutex);
		UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage);
		/* fall through */
	case BUF_BLOCK_ZIP_DIRTY:
	case BUF_BLOCK_FILE_PAGE:
		ut_ad(bpage->in_LRU_list);
		ut_ad(bpage->in_page_hash);
		ut_ad(!bpage->in_zip_hash);
		ut_ad(!bpage->in_flush_list);
		ut_d(bpage->in_flush_list = TRUE);
		UT_LIST_ADD_FIRST(list, buf_pool->flush_list, bpage);
		break;
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		/* These states cannot carry modifications; reaching here
		is a bug. */
		ut_error;
		return;
	}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
|
|
82 |
||
83 |
/************************************************************************
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. The caller must hold the buf_pool
mutex. */
UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
	buf_page_t*	bpage)	/* in: block which is modified */
{
	buf_page_t*	prev_b;	/* node after which bpage is inserted */
	buf_page_t*	b;	/* list scan cursor */

	ut_ad(buf_pool_mutex_own());

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		/* A clean compressed-only page becomes dirty: promote it
		to BUF_BLOCK_ZIP_DIRTY and take it off the zip_clean list. */
		mutex_enter(&buf_pool_zip_mutex);
		buf_page_set_state(bpage, BUF_BLOCK_ZIP_DIRTY);
		mutex_exit(&buf_pool_zip_mutex);
		UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage);
		/* fall through */
	case BUF_BLOCK_ZIP_DIRTY:
	case BUF_BLOCK_FILE_PAGE:
		ut_ad(bpage->in_LRU_list);
		ut_ad(bpage->in_page_hash);
		ut_ad(!bpage->in_zip_hash);
		ut_ad(!bpage->in_flush_list);
		ut_d(bpage->in_flush_list = TRUE);
		break;
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		/* These states cannot carry modifications; reaching here
		is a bug. */
		ut_error;
		return;
	}

	/* Linear scan for the insertion point: the list is ordered by
	descending oldest_modification, so stop at the first node that is
	not newer than bpage. */
	prev_b = NULL;
	b = UT_LIST_GET_FIRST(buf_pool->flush_list);

	while (b && b->oldest_modification > bpage->oldest_modification) {
		ut_ad(b->in_flush_list);
		prev_b = b;
		b = UT_LIST_GET_NEXT(list, b);
	}

	if (prev_b == NULL) {
		UT_LIST_ADD_FIRST(list, buf_pool->flush_list, bpage);
	} else {
		UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
				     prev_b, bpage);
	}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
|
|
142 |
||
143 |
/************************************************************************
|
|
144 |
Returns TRUE if the file page block is immediately suitable for replacement,
|
|
145 |
i.e., the transition FILE_PAGE => NOT_USED allowed. */
|
|
146 |
UNIV_INTERN
|
|
147 |
ibool
|
|
148 |
buf_flush_ready_for_replace( |
|
149 |
/*========================*/
|
|
150 |
/* out: TRUE if can replace immediately */
|
|
151 |
buf_page_t* bpage) /* in: buffer control block, must be |
|
152 |
buf_page_in_file(bpage) and in the LRU list */
|
|
153 |
{
|
|
154 |
ut_ad(buf_pool_mutex_own()); |
|
155 |
ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
|
156 |
ut_ad(bpage->in_LRU_list); |
|
157 |
||
158 |
if (UNIV_LIKELY(buf_page_in_file(bpage))) { |
|
159 |
||
160 |
return(bpage->oldest_modification == 0 |
|
161 |
&& buf_page_get_io_fix(bpage) == BUF_IO_NONE |
|
162 |
&& bpage->buf_fix_count == 0); |
|
163 |
}
|
|
164 |
||
165 |
ut_print_timestamp(stderr); |
|
166 |
fprintf(stderr, |
|
167 |
" InnoDB: Error: buffer block state %lu"
|
|
168 |
" in the LRU list!\n", |
|
169 |
(ulong) buf_page_get_state(bpage)); |
|
170 |
ut_print_buf(stderr, bpage, sizeof(buf_page_t)); |
|
641.2.1
by Monty Taylor
InnoDB Plugin 1.0.2 |
171 |
putc('\n', stderr); |
641.1.2
by Monty Taylor
Imported 1.0.1 with clean - with no changes. |
172 |
|
173 |
return(FALSE); |
|
174 |
}
|
|
175 |
||
176 |
/************************************************************************
|
|
177 |
Returns TRUE if the block is modified and ready for flushing. */
|
|
178 |
UNIV_INLINE
|
|
179 |
ibool
|
|
180 |
buf_flush_ready_for_flush( |
|
181 |
/*======================*/
|
|
182 |
/* out: TRUE if can flush immediately */
|
|
183 |
buf_page_t* bpage, /* in: buffer control block, must be |
|
184 |
buf_page_in_file(bpage) */
|
|
185 |
enum buf_flush flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ |
|
186 |
{
|
|
187 |
ut_a(buf_page_in_file(bpage)); |
|
188 |
ut_ad(buf_pool_mutex_own()); |
|
189 |
ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
|
190 |
||
191 |
if (bpage->oldest_modification != 0 |
|
192 |
&& buf_page_get_io_fix(bpage) == BUF_IO_NONE) { |
|
193 |
ut_ad(bpage->in_flush_list); |
|
194 |
||
195 |
if (flush_type != BUF_FLUSH_LRU) { |
|
196 |
||
197 |
return(TRUE); |
|
198 |
||
199 |
} else if (bpage->buf_fix_count == 0) { |
|
200 |
||
201 |
/* If we are flushing the LRU list, to avoid deadlocks
|
|
202 |
we require the block not to be bufferfixed, and hence
|
|
203 |
not latched. */
|
|
204 |
||
205 |
return(TRUE); |
|
206 |
}
|
|
207 |
}
|
|
208 |
||
209 |
return(FALSE); |
|
210 |
}
|
|
211 |
||
212 |
/************************************************************************
Remove a block from the flush list of modified blocks. The caller must
hold the buf_pool mutex and the block mutex. */
UNIV_INTERN
void
buf_flush_remove(
/*=============*/
	buf_page_t*	bpage)	/* in: pointer to the block in question */
{
	ut_ad(buf_pool_mutex_own());
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(bpage->in_flush_list);
	ut_d(bpage->in_flush_list = FALSE);

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		/* clean compressed pages should not be on the flush list */
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		ut_error;
		return;
	case BUF_BLOCK_ZIP_DIRTY:
		/* The compressed-only page becomes clean again: demote it
		to BUF_BLOCK_ZIP_PAGE and move it to the zip_clean list. */
		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
		buf_LRU_insert_zip_clean(bpage);
		break;
	case BUF_BLOCK_FILE_PAGE:
		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
		break;
	}

	bpage->oldest_modification = 0;

	ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list));
}
|
|
249 |
||
250 |
/************************************************************************
Updates the flush system data structures when a write is completed:
removes the page from the flush list, decrements the pending-flush
counter for its batch type, and signals waiters when the batch ends. */
UNIV_INTERN
void
buf_flush_write_complete(
/*=====================*/
	buf_page_t*	bpage)	/* in: pointer to the block in question */
{
	enum buf_flush	flush_type;

	ut_ad(bpage);

	buf_flush_remove(bpage);

	flush_type = buf_page_get_flush_type(bpage);
	buf_pool->n_flush[flush_type]--;

	if (flush_type == BUF_FLUSH_LRU) {
		/* Put the block to the end of the LRU list to wait to be
		moved to the free list */

		buf_LRU_make_block_old(bpage);

		buf_pool->LRU_flush_ended++;
	}

	/* fprintf(stderr, "n pending flush %lu\n",
	buf_pool->n_flush[flush_type]); */

	if ((buf_pool->n_flush[flush_type] == 0)
	    && (buf_pool->init_flush[flush_type] == FALSE)) {

		/* The running flush batch has ended: wake up any thread
		waiting on the no_flush event for this flush type. */

		os_event_set(buf_pool->no_flush[flush_type]);
	}
}
|
|
287 |
||
288 |
/************************************************************************
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
	byte*	write_buf;	/* start of the doublewrite area to write */
	ulint	len;		/* bytes to write in the current block */
	ulint	len2;		/* byte offset within the current block */
	ulint	i;		/* index into buf_block_arr */

	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
		/* Doublewrite disabled or not yet created: just wake the
		simulated aio handler threads. */
		os_aio_simulated_wake_handler_threads();

		return;
	}

	mutex_enter(&(trx_doublewrite->mutex));

	/* Write first to doublewrite buffer blocks. We use synchronous
	aio and thus know that file write has been completed when the
	control returns. */

	if (trx_doublewrite->first_free == 0) {

		mutex_exit(&(trx_doublewrite->mutex));

		return;
	}

	/* Sanity-check the buffered uncompressed pages before posting
	them to the doublewrite buffer. */
	for (i = 0; i < trx_doublewrite->first_free; i++) {

		const buf_block_t*	block;

		block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];

		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
		    || block->page.zip.data) {
			/* No simple validate for compressed pages exists. */
			continue;
		}

		/* Compare the first 4 bytes of the header LSN with the
		trailer LSN copy; a mismatch indicates a torn/corrupt page. */
		if (UNIV_UNLIKELY
		    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
			    block->frame + (UNIV_PAGE_SIZE
					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
			    4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				" InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: before posting to the"
				" doublewrite buffer.\n");
		}

		if (!block->check_index_page_at_flush) {
			/* Validation not requested for this page. */
		} else if (page_is_comp(block->frame)) {
			if (UNIV_UNLIKELY
			    (!page_simple_validate_new(block->frame))) {
corrupted_page:
				buf_page_print(block->frame, 0);

				ut_print_timestamp(stderr);
				fprintf(stderr,
					" InnoDB: Apparent corruption of an"
					" index page n:o %lu in space %lu\n"
					"InnoDB: to be written to data file."
					" We intentionally crash server\n"
					"InnoDB: to prevent corrupt data"
					" from ending up in data\n"
					"InnoDB: files.\n",
					(ulong) buf_block_get_page_no(block),
					(ulong) buf_block_get_space(block));

				ut_error;
			}
		} else if (UNIV_UNLIKELY
			   (!page_simple_validate_old(block->frame))) {

			goto corrupted_page;
		}
	}

	/* increment the doublewrite flushed pages counter */
	srv_dblwr_pages_written+= trx_doublewrite->first_free;
	srv_dblwr_writes++;

	/* First doublewrite block: at most TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
	pages. */
	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
		     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;

	write_buf = trx_doublewrite->write_buf;
	i = 0;

	fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
	       trx_doublewrite->block1, 0, len,
	       (void*) write_buf, NULL);

	/* Re-check the LSN fields of what was just written from the
	doublewrite memory buffer. */
	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
	     len2 += UNIV_PAGE_SIZE, i++) {
		const buf_block_t* block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		if (UNIV_LIKELY(!block->page.zip.data)
		    && UNIV_LIKELY(buf_block_get_state(block)
				   == BUF_BLOCK_FILE_PAGE)
		    && UNIV_UNLIKELY
		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
			    write_buf + len2
			    + (UNIV_PAGE_SIZE
			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				" InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the doublewrite block1.\n");
		}
	}

	if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		goto flush;
	}

	/* Second doublewrite block for the remaining pages. */
	len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
		* UNIV_PAGE_SIZE;

	write_buf = trx_doublewrite->write_buf
		+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
	ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);

	fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
	       trx_doublewrite->block2, 0, len,
	       (void*) write_buf, NULL);

	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
	     len2 += UNIV_PAGE_SIZE, i++) {
		const buf_block_t* block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		if (UNIV_LIKELY(!block->page.zip.data)
		    && UNIV_LIKELY(buf_block_get_state(block)
				   == BUF_BLOCK_FILE_PAGE)
		    && UNIV_UNLIKELY
		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
			    write_buf + len2
			    + (UNIV_PAGE_SIZE
			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				" InnoDB: ERROR: The page to be"
				" written seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in"
				" the doublewrite block2.\n");
		}
	}

flush:
	/* Now flush the doublewrite buffer data to disk */

	fil_flush(TRX_SYS_SPACE);

	/* We know that the writes have been flushed to disk now
	and in recovery we will find them in the doublewrite buffer
	blocks. Next do the writes to the intended positions. */

	for (i = 0; i < trx_doublewrite->first_free; i++) {
		const buf_block_t* block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		ut_a(buf_page_in_file(&block->page));
		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
			/* Compressed page: write the zip frame at its
			zip size. */
			fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
			       FALSE, buf_page_get_space(&block->page),
			       buf_page_get_zip_size(&block->page),
			       buf_page_get_page_no(&block->page), 0,
			       buf_page_get_zip_size(&block->page),
			       (void*)block->page.zip.data,
			       (void*)block);

			/* Increment the counter of I/O operations used
			for selecting LRU policy. */
			buf_LRU_stat_inc_io();

			continue;
		}

		ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

		if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
					 block->frame
					 + (UNIV_PAGE_SIZE
					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
					 4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				" InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: after posting and flushing"
				" the doublewrite buffer.\n"
				"InnoDB: Page buf fix count %lu,"
				" io fix %lu, state %lu\n",
				(ulong)block->page.buf_fix_count,
				(ulong)buf_block_get_io_fix(block),
				(ulong)buf_block_get_state(block));
		}

		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, buf_block_get_space(block), 0,
		       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
		       (void*)block->frame, (void*)block);

		/* Increment the counter of I/O operations used
		for selecting LRU policy. */
		buf_LRU_stat_inc_io();
	}

	/* Wake possible simulated aio thread to actually post the
	writes to the operating system */

	os_aio_simulated_wake_handler_threads();

	/* Wait that all async writes to tablespaces have been posted to
	the OS */

	os_aio_wait_until_no_pending_writes();

	/* Now we flush the data to disk (for example, with fsync) */

	fil_flush_file_spaces(FIL_TABLESPACE);

	/* We can now reuse the doublewrite memory buffer: */

	trx_doublewrite->first_free = 0;

	mutex_exit(&(trx_doublewrite->mutex));
}
|
|
534 |
||
535 |
/************************************************************************
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
	buf_page_t*	bpage)	/* in: buffer block to write */
{
	ulint	zip_size;
try_again:
	mutex_enter(&(trx_doublewrite->mutex));

	ut_a(buf_page_in_file(bpage));

	if (trx_doublewrite->first_free
	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		/* Buffer full: release the mutex, flush the batch, and
		retry; another thread may refill the buffer meanwhile. */
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		goto try_again;
	}

	zip_size = buf_page_get_zip_size(bpage);

	if (UNIV_UNLIKELY(zip_size)) {
		/* Copy the compressed page and clear the rest. */
		memcpy(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
		       bpage->zip.data, zip_size);
		memset(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		memcpy(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
	}

	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;

	trx_doublewrite->first_free++;

	if (trx_doublewrite->first_free
	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		/* The buffer became full with this page: flush the whole
		batch now. */
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		return;
	}

	mutex_exit(&(trx_doublewrite->mutex));
}
|
|
593 |
||
594 |
/************************************************************************
Initializes a page for writing to the tablespace: stamps the newest
modification lsn into the header and trailer and computes the page
checksums. For compressed pages only the compressed frame is stamped. */
UNIV_INTERN
void
buf_flush_init_for_writing(
/*=======================*/
	byte*		page,		/* in/out: page */
	void*		page_zip_,	/* in/out: compressed page, or NULL */
	ib_uint64_t	newest_lsn)	/* in: newest modification lsn
					to the page */
{
	ut_ad(page);

	if (page_zip_) {
		page_zip_des_t*	page_zip = page_zip_;
		ulint		zip_size = page_zip_get_size(page_zip);
		ut_ad(zip_size);
		ut_ad(ut_is_2pow(zip_size));
		ut_ad(zip_size <= UNIV_PAGE_SIZE);

		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
		case FIL_PAGE_TYPE_ALLOCATED:
		case FIL_PAGE_INODE:
		case FIL_PAGE_IBUF_BITMAP:
		case FIL_PAGE_TYPE_FSP_HDR:
		case FIL_PAGE_TYPE_XDES:
			/* These are essentially uncompressed pages. */
			memcpy(page_zip->data, page, zip_size);
			/* fall through */
		case FIL_PAGE_TYPE_ZBLOB:
		case FIL_PAGE_TYPE_ZBLOB2:
		case FIL_PAGE_INDEX:
			/* Stamp the lsn, clear the flush-lsn field, and
			store the checksum of the compressed frame. */
			mach_write_ull(page_zip->data
				       + FIL_PAGE_LSN, newest_lsn);
			memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
			mach_write_to_4(page_zip->data
					+ FIL_PAGE_SPACE_OR_CHKSUM,
					srv_use_checksums
					? page_zip_calc_checksum(
						page_zip->data, zip_size)
					: BUF_NO_CHECKSUM_MAGIC);
			return;
		}

		/* Unknown page type in a compressed tablespace: refuse
		to write it. */
		ut_print_timestamp(stderr);
		fputs(" InnoDB: ERROR: The compressed page to be written"
		      " seems corrupt:", stderr);
		ut_print_buf(stderr, page, zip_size);
		fputs("\nInnoDB: Possibly older version of the page:", stderr);
		ut_print_buf(stderr, page_zip->data, zip_size);
		putc('\n', stderr);
		ut_error;
	}

	/* Write the newest modification lsn to the page header and trailer */
	mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);

	mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
		       newest_lsn);

	/* Store the new formula checksum */

	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
			srv_use_checksums
			? buf_calc_page_new_checksum(page)
			: BUF_NO_CHECKSUM_MAGIC);

	/* We overwrite the first 4 bytes of the end lsn field to store
	the old formula checksum. Since it depends also on the field
	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
	new formula checksum. */

	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
			srv_use_checksums
			? buf_calc_page_old_checksum(page)
			: BUF_NO_CHECKSUM_MAGIC);
}
|
|
671 |
||
672 |
/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
	buf_page_t*	bpage)	/* in: buffer block to write */
{
	ulint	zip_size	= buf_page_get_zip_size(bpage);
	page_t*	frame		= NULL;	/* frame actually written to disk */
#ifdef UNIV_LOG_DEBUG
	static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */

	ut_ad(buf_page_in_file(bpage));

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
#endif
	ut_ad(bpage->newest_modification != 0);

#ifdef UNIV_LOG_DEBUG
	if (!univ_log_debug_warned) {
		univ_log_debug_warned = TRUE;
		fputs("Warning: cannot force log to disk if"
		      " UNIV_LOG_DEBUG is defined!\n"
		      "Crash recovery will not work!\n",
		      stderr);
	}
#else
	/* Force the log to the disk before writing the modified block */
	log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		ut_error;
		break;
	case BUF_BLOCK_ZIP_DIRTY:
		/* Compressed-only page: the zip frame is what gets
		written; verify its stored checksum before stamping. */
		frame = bpage->zip.data;
		if (UNIV_LIKELY(srv_use_checksums)) {
			ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
			     == page_zip_calc_checksum(frame, zip_size));
		}
		mach_write_ull(frame + FIL_PAGE_LSN,
			       bpage->newest_modification);
		memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
		break;
	case BUF_BLOCK_FILE_PAGE:
		/* Prefer the compressed frame for the write when one
		exists; buf_flush_init_for_writing stamps lsn and
		checksums on the frame(s). */
		frame = bpage->zip.data;
		if (!frame) {
			frame = ((buf_block_t*) bpage)->frame;
		}

		buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
					   bpage->zip.data
					   ? &bpage->zip : NULL,
					   bpage->newest_modification);
		break;
	}

	if (!srv_use_doublewrite_buf || !trx_doublewrite) {
		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, buf_page_get_space(bpage), zip_size,
		       buf_page_get_page_no(bpage), 0,
		       zip_size ? zip_size : UNIV_PAGE_SIZE,
		       frame, bpage);
	} else {
		buf_flush_post_to_doublewrite_buf(bpage);
	}
}
|
|
749 |
||
750 |
/************************************************************************
Writes a page asynchronously from the buffer buf_pool to a file, if it can be
found in the buf_pool and it is in a flushable state. NOTE: in simulated aio
we must call os_aio_simulated_wake_handler_threads after we have posted a batch
of writes! */
static
ulint
buf_flush_try_page(
/*===============*/
					/* out: 1 if a page was
					flushed, 0 otherwise */
	ulint		space,	/* in: space id */
	ulint		offset,	/* in: page offset */
	enum buf_flush	flush_type)	/* in: BUF_FLUSH_LRU, BUF_FLUSH_LIST,
					or BUF_FLUSH_SINGLE_PAGE */
{
	buf_page_t*	bpage;
	mutex_t*	block_mutex;
	ibool		locked;	/* TRUE if the s-latch was taken before
				releasing the buf_pool mutex */

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
	      || flush_type == BUF_FLUSH_SINGLE_PAGE);

	buf_pool_mutex_enter();

	bpage = buf_page_hash_get(space, offset);

	if (!bpage) {
		buf_pool_mutex_exit();
		return(0);
	}

	ut_a(buf_page_in_file(bpage));
	block_mutex = buf_page_get_mutex(bpage);

	mutex_enter(block_mutex);

	if (!buf_flush_ready_for_flush(bpage, flush_type)) {
		mutex_exit(block_mutex);
		buf_pool_mutex_exit();
		return(0);
	}

	switch (flush_type) {
	case BUF_FLUSH_LIST:
		buf_page_set_io_fix(bpage, BUF_IO_WRITE);

		buf_page_set_flush_type(bpage, flush_type);

		if (buf_pool->n_flush[flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[flush_type]);
		}

		buf_pool->n_flush[flush_type]++;

		/* If the simulated aio thread is not running, we must
		not wait for any latch, as we may end up in a deadlock:
		if buf_fix_count == 0, then we know we need not wait */

		locked = bpage->buf_fix_count == 0;
		if (locked
		    && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
					   BUF_IO_WRITE);
		}

		mutex_exit(block_mutex);
		buf_pool_mutex_exit();

		if (!locked) {
			/* The page is buffer-fixed: flush the pending
			batch first so the simulated aio can complete,
			then wait for the s-latch. */
			buf_flush_buffered_writes();

			if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
				rw_lock_s_lock_gen(&((buf_block_t*) bpage)
						   ->lock, BUF_IO_WRITE);
			}
		}

		break;

	case BUF_FLUSH_LRU:
		/* VERY IMPORTANT:
		Because any thread may call the LRU flush, even when owning
		locks on pages, to avoid deadlocks, we must make sure that the
		s-lock is acquired on the page without waiting: this is
		accomplished because in the if-condition above we require
		the page not to be bufferfixed (in function
		..._ready_for_flush). */

		buf_page_set_io_fix(bpage, BUF_IO_WRITE);

		buf_page_set_flush_type(bpage, flush_type);

		if (buf_pool->n_flush[flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[flush_type]);
		}

		buf_pool->n_flush[flush_type]++;

		if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
					   BUF_IO_WRITE);
		}

		/* Note that the s-latch is acquired before releasing the
		buf_pool mutex: this ensures that the latch is acquired
		immediately. */

		mutex_exit(block_mutex);
		buf_pool_mutex_exit();
		break;

	case BUF_FLUSH_SINGLE_PAGE:
		buf_page_set_io_fix(bpage, BUF_IO_WRITE);

		buf_page_set_flush_type(bpage, flush_type);

		if (buf_pool->n_flush[flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[flush_type]);
		}

		buf_pool->n_flush[flush_type]++;

		mutex_exit(block_mutex);
		buf_pool_mutex_exit();

		/* Here the s-latch may be waited for, after releasing the
		buf_pool mutex. */
		if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
					   BUF_IO_WRITE);
		}
		break;

	default:
		ut_error;
	}

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Flushing %u space %u page %u\n",
			flush_type, bpage->space, bpage->offset);
	}
#endif /* UNIV_DEBUG */
	buf_flush_write_block_low(bpage);

	return(1);
}
|
|
900 |
||
901 |
/***************************************************************
|
|
902 |
Flushes to disk all flushable pages within the flush area. */
|
|
903 |
static
|
|
904 |
ulint
|
|
905 |
buf_flush_try_neighbors( |
|
906 |
/*====================*/
|
|
907 |
/* out: number of pages flushed */
|
|
908 |
ulint space, /* in: space id */ |
|
909 |
ulint offset, /* in: page offset */ |
|
910 |
enum buf_flush flush_type) /* in: BUF_FLUSH_LRU or |
|
911 |
BUF_FLUSH_LIST */
|
|
912 |
{
|
|
913 |
buf_page_t* bpage; |
|
914 |
ulint low, high; |
|
915 |
ulint count = 0; |
|
916 |
ulint i; |
|
917 |
||
918 |
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); |
|
919 |
||
920 |
if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { |
|
921 |
/* If there is little space, it is better not to flush any
|
|
922 |
block except from the end of the LRU list */
|
|
923 |
||
924 |
low = offset; |
|
925 |
high = offset + 1; |
|
926 |
} else { |
|
927 |
/* When flushed, dirty blocks are searched in neighborhoods of
|
|
928 |
this size, and flushed along with the original page. */
|
|
929 |
||
930 |
ulint buf_flush_area = ut_min(BUF_READ_AHEAD_AREA, |
|
931 |
buf_pool->curr_size / 16); |
|
932 |
||
933 |
low = (offset / buf_flush_area) * buf_flush_area; |
|
934 |
high = (offset / buf_flush_area + 1) * buf_flush_area; |
|
935 |
}
|
|
936 |
||
937 |
/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
|
|
938 |
||
939 |
if (high > fil_space_get_size(space)) { |
|
940 |
high = fil_space_get_size(space); |
|
941 |
}
|
|
942 |
||
943 |
buf_pool_mutex_enter(); |
|
944 |
||
945 |
for (i = low; i < high; i++) { |
|
946 |
||
947 |
bpage = buf_page_hash_get(space, i); |
|
948 |
ut_a(!bpage || buf_page_in_file(bpage)); |
|
949 |
||
950 |
if (!bpage) { |
|
951 |
||
952 |
continue; |
|
953 |
||
954 |
} else if (flush_type == BUF_FLUSH_LRU && i != offset |
|
955 |
&& !buf_page_is_old(bpage)) { |
|
956 |
||
957 |
/* We avoid flushing 'non-old' blocks in an LRU flush,
|
|
958 |
because the flushed blocks are soon freed */
|
|
959 |
||
960 |
continue; |
|
961 |
} else { |
|
962 |
||
963 |
mutex_t* block_mutex = buf_page_get_mutex(bpage); |
|
964 |
||
965 |
mutex_enter(block_mutex); |
|
966 |
||
967 |
if (buf_flush_ready_for_flush(bpage, flush_type) |
|
968 |
&& (i == offset || !bpage->buf_fix_count)) { |
|
969 |
/* We only try to flush those
|
|
970 |
neighbors != offset where the buf fix count is
|
|
971 |
zero, as we then know that we probably can
|
|
972 |
latch the page without a semaphore wait.
|
|
973 |
Semaphore waits are expensive because we must
|
|
974 |
flush the doublewrite buffer before we start
|
|
975 |
waiting. */
|
|
976 |
||
977 |
buf_pool_mutex_exit(); |
|
978 |
||
979 |
mutex_exit(block_mutex); |
|
980 |
||
981 |
/* Note: as we release the buf_pool mutex
|
|
982 |
above, in buf_flush_try_page we cannot be sure
|
|
983 |
the page is still in a flushable state:
|
|
984 |
therefore we check it again inside that
|
|
985 |
function. */
|
|
986 |
||
987 |
count += buf_flush_try_page(space, i, |
|
988 |
flush_type); |
|
989 |
||
990 |
buf_pool_mutex_enter(); |
|
991 |
} else { |
|
992 |
mutex_exit(block_mutex); |
|
993 |
}
|
|
994 |
}
|
|
995 |
}
|
|
996 |
||
997 |
buf_pool_mutex_exit(); |
|
998 |
||
999 |
return(count); |
|
1000 |
}
|
|
1001 |
||
1002 |
/***********************************************************************
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages! */
UNIV_INTERN
ulint
buf_flush_batch(
/*============*/
				/* out: number of blocks for which the
				write request was queued;
				ULINT_UNDEFINED if there was a flush
				of the same type already running */
	enum buf_flush	flush_type,	/* in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
					then the caller must not own any
					latches on pages */
	ulint		min_n,		/* in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
	ib_uint64_t	lsn_limit)	/* in: in the case BUF_FLUSH_LIST all
					blocks whose oldest_modification is
					smaller than this should be flushed
					(if their number does not exceed
					min_n), otherwise ignored */
{
	buf_page_t*	bpage;
	ulint		page_count	= 0;	/* total pages queued so far */
	ulint		old_page_count;
	ulint		space;
	ulint		offset;

	ut_ad((flush_type == BUF_FLUSH_LRU)
	      || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
	/* Enforce NOTE 2 above: a flush-list flush caller must hold no
	page latches. */
	ut_ad((flush_type != BUF_FLUSH_LIST)
	      || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
	buf_pool_mutex_enter();

	if ((buf_pool->n_flush[flush_type] > 0)
	    || (buf_pool->init_flush[flush_type] == TRUE)) {

		/* There is already a flush batch of the same type running */

		buf_pool_mutex_exit();

		return(ULINT_UNDEFINED);
	}

	/* Mark a batch of this type as being initialized, so that
	concurrent callers bail out above even before the first write
	has been queued (n_flush may still be 0 at that point). */
	buf_pool->init_flush[flush_type] = TRUE;

	for (;;) {
flush_next:
		/* If we have flushed enough, leave the loop */
		if (page_count >= min_n) {

			break;
		}

		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		if (flush_type == BUF_FLUSH_LRU) {
			bpage = UT_LIST_GET_LAST(buf_pool->LRU);
		} else {
			ut_ad(flush_type == BUF_FLUSH_LIST);

			/* The flush list is ordered by descending
			oldest_modification (see the assertion in
			buf_flush_insert_into_flush_list), so the oldest
			modification is at the tail.  If even the tail is
			at or beyond lsn_limit, nothing older remains. */
			bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
			if (!bpage
			    || bpage->oldest_modification >= lsn_limit) {
				/* We have flushed enough */

				break;
			}
			ut_ad(bpage->in_flush_list);
		}

		/* Note that after finding a single flushable page, we try to
		flush also all its neighbors, and after that start from the
		END of the LRU list or flush list again: the list may change
		during the flushing and we cannot safely preserve within this
		function a pointer to a block in the list! */

		do {
			mutex_t* block_mutex = buf_page_get_mutex(bpage);

			ut_a(buf_page_in_file(bpage));

			mutex_enter(block_mutex);

			if (buf_flush_ready_for_flush(bpage, flush_type)) {

				space = buf_page_get_space(bpage);
				offset = buf_page_get_page_no(bpage);

				/* Release both mutexes before doing I/O;
				buf_flush_try_neighbors re-validates the
				page state under the mutexes it takes. */
				buf_pool_mutex_exit();
				mutex_exit(block_mutex);

				old_page_count = page_count;

				/* Try to flush also all the neighbors */
				page_count += buf_flush_try_neighbors(
					space, offset, flush_type);
				/* fprintf(stderr,
				"Flush type %lu, page no %lu, neighb %lu\n",
				flush_type, offset,
				page_count - old_page_count); */

				buf_pool_mutex_enter();
				/* Restart from the list tail: our list
				position may be stale after the flush. */
				goto flush_next;

			} else if (flush_type == BUF_FLUSH_LRU) {

				mutex_exit(block_mutex);

				bpage = UT_LIST_GET_PREV(LRU, bpage);
			} else {
				ut_ad(flush_type == BUF_FLUSH_LIST);

				mutex_exit(block_mutex);

				bpage = UT_LIST_GET_PREV(list, bpage);
				ut_ad(!bpage || bpage->in_flush_list);
			}
		} while (bpage != NULL);

		/* If we could not find anything to flush, leave the loop */

		break;
	}

	buf_pool->init_flush[flush_type] = FALSE;

	/* NOTE: the second half of this condition is always TRUE here,
	since init_flush was just cleared above while still holding the
	buf_pool mutex; only the n_flush check is operative. */
	if ((buf_pool->n_flush[flush_type] == 0)
	    && (buf_pool->init_flush[flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	buf_pool_mutex_exit();

	/* Flush the doublewrite buffer / buffered writes to disk so that
	the queued page writes actually reach the data files. */
	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		ut_a(flush_type == BUF_FLUSH_LRU
		     || flush_type == BUF_FLUSH_LIST);
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += page_count;

	return(page_count);
}
/**********************************************************************
Waits until a flush batch of the given type ends */
UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
	enum buf_flush	type)	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));

	/* The no_flush[type] event is reset when the first write of a
	batch of this type is queued (n_flush goes 0 -> 1; see
	buf_flush_try_page) and set again by buf_flush_batch when no
	writes of this type remain pending, so blocking on it waits out
	the batch currently in progress. */
	os_event_wait(buf_pool->no_flush[type]);
}
/**********************************************************************
|
|
1180 |
Gives a recommendation of how many blocks should be flushed to establish
|
|
1181 |
a big enough margin of replaceable blocks near the end of the LRU list
|
|
1182 |
and in the free list. */
|
|
1183 |
static
|
|
1184 |
ulint
|
|
1185 |
buf_flush_LRU_recommendation(void) |
|
1186 |
/*==============================*/
|
|
1187 |
/* out: number of blocks which should be flushed
|
|
1188 |
from the end of the LRU list */
|
|
1189 |
{
|
|
1190 |
buf_page_t* bpage; |
|
1191 |
ulint n_replaceable; |
|
1192 |
ulint distance = 0; |
|
1193 |
||
1194 |
buf_pool_mutex_enter(); |
|
1195 |
||
1196 |
n_replaceable = UT_LIST_GET_LEN(buf_pool->free); |
|
1197 |
||
1198 |
bpage = UT_LIST_GET_LAST(buf_pool->LRU); |
|
1199 |
||
1200 |
while ((bpage != NULL) |
|
1201 |
&& (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN |
|
1202 |
+ BUF_FLUSH_EXTRA_MARGIN) |
|
1203 |
&& (distance < BUF_LRU_FREE_SEARCH_LEN)) { |
|
1204 |
||
1205 |
mutex_t* block_mutex = buf_page_get_mutex(bpage); |
|
1206 |
||
1207 |
mutex_enter(block_mutex); |
|
1208 |
||
1209 |
if (buf_flush_ready_for_replace(bpage)) { |
|
1210 |
n_replaceable++; |
|
1211 |
}
|
|
1212 |
||
1213 |
mutex_exit(block_mutex); |
|
1214 |
||
1215 |
distance++; |
|
1216 |
||
1217 |
bpage = UT_LIST_GET_PREV(LRU, bpage); |
|
1218 |
}
|
|
1219 |
||
1220 |
buf_pool_mutex_exit(); |
|
1221 |
||
1222 |
if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) { |
|
1223 |
||
1224 |
return(0); |
|
1225 |
}
|
|
1226 |
||
1227 |
return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN |
|
1228 |
- n_replaceable); |
|
1229 |
}
|
|
1230 |
||
1231 |
/*************************************************************************
|
|
1232 |
Flushes pages from the end of the LRU list if there is too small a margin
|
|
1233 |
of replaceable pages there or in the free list. VERY IMPORTANT: this function
|
|
1234 |
is called also by threads which have locks on pages. To avoid deadlocks, we
|
|
1235 |
flush only pages such that the s-lock required for flushing can be acquired
|
|
1236 |
immediately, without waiting. */
|
|
1237 |
UNIV_INTERN
|
|
1238 |
void
|
|
1239 |
buf_flush_free_margin(void) |
|
1240 |
/*=======================*/
|
|
1241 |
{
|
|
1242 |
ulint n_to_flush; |
|
1243 |
ulint n_flushed; |
|
1244 |
||
1245 |
n_to_flush = buf_flush_LRU_recommendation(); |
|
1246 |
||
1247 |
if (n_to_flush > 0) { |
|
1248 |
n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0); |
|
1249 |
if (n_flushed == ULINT_UNDEFINED) { |
|
1250 |
/* There was an LRU type flush batch already running;
|
|
1251 |
let us wait for it to end */
|
|
1252 |
||
1253 |
buf_flush_wait_batch_end(BUF_FLUSH_LRU); |
|
1254 |
}
|
|
1255 |
}
|
|
1256 |
}
|
|
1257 |
||
1258 |
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
|
|
1259 |
/**********************************************************************
|
|
1260 |
Validates the flush list. */
|
|
1261 |
static
|
|
1262 |
ibool
|
|
1263 |
buf_flush_validate_low(void) |
|
1264 |
/*========================*/
|
|
1265 |
/* out: TRUE if ok */
|
|
1266 |
{
|
|
1267 |
buf_page_t* bpage; |
|
1268 |
||
1269 |
UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list); |
|
1270 |
||
1271 |
bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); |
|
1272 |
||
1273 |
while (bpage != NULL) { |
|
1274 |
const ib_uint64_t om = bpage->oldest_modification; |
|
1275 |
ut_ad(bpage->in_flush_list); |
|
1276 |
ut_a(buf_page_in_file(bpage)); |
|
1277 |
ut_a(om > 0); |
|
1278 |
||
1279 |
bpage = UT_LIST_GET_NEXT(list, bpage); |
|
1280 |
||
1281 |
ut_a(!bpage || om >= bpage->oldest_modification); |
|
1282 |
}
|
|
1283 |
||
1284 |
return(TRUE); |
|
1285 |
}
|
|
1286 |
||
1287 |
/**********************************************************************
|
|
1288 |
Validates the flush list. */
|
|
1289 |
UNIV_INTERN
|
|
1290 |
ibool
|
|
1291 |
buf_flush_validate(void) |
|
1292 |
/*====================*/
|
|
1293 |
/* out: TRUE if ok */
|
|
1294 |
{
|
|
1295 |
ibool ret; |
|
1296 |
||
1297 |
buf_pool_mutex_enter(); |
|
1298 |
||
1299 |
ret = buf_flush_validate_low(); |
|
1300 |
||
1301 |
buf_pool_mutex_exit(); |
|
1302 |
||
1303 |
return(ret); |
|
1304 |
}
|
|
1305 |
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |