/******************************************************
The database buffer buf_pool flush algorithm

(c) 1995-2001 Innobase Oy

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#include "trx0sys.h"
#endif

#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"
#include "srv0srv.h"

/* When flushed, dirty blocks are searched in neighborhoods of this size, and
flushed along with the original page. */

#define BUF_FLUSH_AREA		ut_min(BUF_READ_AHEAD_AREA,\
		buf_pool->curr_size / 16)

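/* As an illustration (the actual numbers depend on the configuration):
if BUF_READ_AHEAD_AREA evaluates to 64 and the pool holds 1024 blocks,
then BUF_FLUSH_AREA is ut_min(64, 1024 / 16) = 64, i.e. pages are
flushed in aligned neighborhoods of up to 64 pages. */
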
/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void);
/*========================*/
		/* out: TRUE if ok */

/************************************************************************
Inserts a modified block into the flush list. */

void
buf_flush_insert_into_flush_list(
/*=============================*/
	buf_block_t*	block)	/* in: block which is modified */
{
	ut_ad(mutex_own(&(buf_pool->mutex)));
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
	      || (ut_dulint_cmp((UT_LIST_GET_FIRST(buf_pool->flush_list))
				->oldest_modification,
				block->oldest_modification) <= 0));

	UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);

	ut_ad(buf_flush_validate_low());
}

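/* Note: the flush list is kept ordered by oldest_modification,
greatest lsn first. Adding at the front in the function above is
correct only because, outside of recovery, blocks are modified in
nondecreasing lsn order; the ut_ad() there asserts exactly this. */
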
/************************************************************************
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */

void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
	buf_block_t*	block)	/* in: block which is modified */
{
	buf_block_t*	prev_b;
	buf_block_t*	b;

	ut_ad(mutex_own(&(buf_pool->mutex)));

	prev_b = NULL;
	b = UT_LIST_GET_FIRST(buf_pool->flush_list);

	while (b && (ut_dulint_cmp(b->oldest_modification,
				   block->oldest_modification) > 0)) {
		prev_b = b;
		b = UT_LIST_GET_NEXT(flush_list, b);
	}

	if (prev_b == NULL) {
		UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);
	} else {
		UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b,
				     block);
	}

	ut_ad(buf_flush_validate_low());
}

/************************************************************************
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED is allowed. */

ibool
buf_flush_ready_for_replace(
/*========================*/
				/* out: TRUE if can replace immediately */
	buf_block_t*	block)	/* in: buffer control block, must be in state
				BUF_BLOCK_FILE_PAGE and in the LRU list */
{
	ut_ad(mutex_own(&(buf_pool->mutex)));
	ut_ad(mutex_own(&block->mutex));

	if (block->state != BUF_BLOCK_FILE_PAGE) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: buffer block state %lu"
			" in the LRU list!\n",
			(ulong)block->state);
		ut_print_buf(stderr, block, sizeof(buf_block_t));

		return(FALSE);
	}

	if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
	    || (block->buf_fix_count != 0)
	    || (block->io_fix != 0)) {

		return(FALSE);
	}

	return(TRUE);
}

/************************************************************************
Returns TRUE if the block is modified and ready for flushing. */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
				/* out: TRUE if can flush immediately */
	buf_block_t*	block,	/* in: buffer control block, must be in state
				BUF_BLOCK_FILE_PAGE */
	ulint		flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	ut_ad(mutex_own(&(buf_pool->mutex)));
	ut_ad(mutex_own(&(block->mutex)));
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
	    && (block->io_fix == 0)) {
		if (flush_type != BUF_FLUSH_LRU) {

			return(TRUE);

		} else if (block->buf_fix_count == 0) {

			/* If we are flushing the LRU list, to avoid deadlocks
			we require the block not to be bufferfixed, and hence
			not latched. */

			return(TRUE);
		}
	}

	return(FALSE);
}

/************************************************************************
Updates the flush system data structures when a write is completed. */

void
buf_flush_write_complete(
/*=====================*/
	buf_block_t*	block)	/* in: pointer to the block in question */
{
	ut_ad(block);
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	block->oldest_modification = ut_dulint_zero;

	UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block);

	ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list));

	(buf_pool->n_flush[block->flush_type])--;

	if (block->flush_type == BUF_FLUSH_LRU) {
		/* Put the block to the end of the LRU list to wait to be
		moved to the free list */

		buf_LRU_make_block_old(block);

		buf_pool->LRU_flush_ended++;
	}

	/* fprintf(stderr, "n pending flush %lu\n",
	buf_pool->n_flush[block->flush_type]); */

	if ((buf_pool->n_flush[block->flush_type] == 0)
	    && (buf_pool->init_flush[block->flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[block->flush_type]);
	}
}

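/* Synchronization note: the no_flush[] event for a flush type is
reset when the first write of a batch is queued (see
buf_flush_try_page and buf_flush_batch) and set above once the last
pending write of the batch completes; buf_flush_wait_batch_end simply
waits on that event. */
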
/************************************************************************
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
	buf_block_t*	block;
	byte*		write_buf;
	ulint		len;
	ulint		len2;
	ulint		i;

	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
		os_aio_simulated_wake_handler_threads();

		return;
	}

	mutex_enter(&(trx_doublewrite->mutex));

	/* Write first to doublewrite buffer blocks. We use synchronous
	aio and thus know that file write has been completed when the
	control returns. */

	if (trx_doublewrite->first_free == 0) {

		mutex_exit(&(trx_doublewrite->mutex));

		return;
	}

	for (i = 0; i < trx_doublewrite->first_free; i++) {

		block = trx_doublewrite->buf_block_arr[i];
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
		    != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: before posting to the"
				" doublewrite buffer.\n");
		}

		if (block->check_index_page_at_flush
		    && !page_simple_validate(block->frame)) {

			buf_page_print(block->frame);

			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Apparent corruption of an"
				" index page n:o %lu in space %lu\n"
				"InnoDB: to be written to data file."
				" We intentionally crash server\n"
				"InnoDB: to prevent corrupt data"
				" from ending up in data\n"
				"InnoDB: files.\n",
				(ulong) block->offset, (ulong) block->space);

			ut_error;
		}
	}

	/* increment the doublewrite flushed pages counter */
	srv_dblwr_pages_written += trx_doublewrite->first_free;
	srv_dblwr_writes++;

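	/* The doublewrite area on disk consists of two separate blocks
	of TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages, starting at page
	numbers block1 and block2 in the system tablespace; this is why
	the memory buffer is written out in up to two fil_io calls
	below. */
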
	if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
	} else {
		len = trx_doublewrite->first_free * UNIV_PAGE_SIZE;
	}

	fil_io(OS_FILE_WRITE,
	       TRUE, TRX_SYS_SPACE,
	       trx_doublewrite->block1, 0, len,
	       (void*)trx_doublewrite->write_buf, NULL);

	write_buf = trx_doublewrite->write_buf;

	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; len2 += UNIV_PAGE_SIZE) {
		if (mach_read_from_4(write_buf + len2 + FIL_PAGE_LSN + 4)
		    != mach_read_from_4(write_buf + len2 + UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the doublewrite block1.\n");
		}
	}

	if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		len = (trx_doublewrite->first_free
		       - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE;

		fil_io(OS_FILE_WRITE,
		       TRUE, TRX_SYS_SPACE,
		       trx_doublewrite->block2, 0, len,
		       (void*)(trx_doublewrite->write_buf
			       + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
			       * UNIV_PAGE_SIZE),
		       NULL);

		write_buf = trx_doublewrite->write_buf
			+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
		for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
		     len2 += UNIV_PAGE_SIZE) {
			if (mach_read_from_4(write_buf + len2
					     + FIL_PAGE_LSN + 4)
			    != mach_read_from_4(write_buf + len2
						+ UNIV_PAGE_SIZE
						- FIL_PAGE_END_LSN_OLD_CHKSUM
						+ 4)) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: ERROR: The page to be"
					" written seems corrupt!\n"
					"InnoDB: The lsn fields do not match!"
					" Noticed in"
					" the doublewrite block2.\n");
			}
		}
	}

	/* Now flush the doublewrite buffer data to disk */

	fil_flush(TRX_SYS_SPACE);

	/* We know that the writes have been flushed to disk now
	and in recovery we will find them in the doublewrite buffer
	blocks. Next do the writes to the intended positions. */

	for (i = 0; i < trx_doublewrite->first_free; i++) {
		block = trx_doublewrite->buf_block_arr[i];

		if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
		    != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: after posting and flushing"
				" the doublewrite buffer.\n"
				"InnoDB: Page buf fix count %lu,"
				" io fix %lu, state %lu\n",
				(ulong)block->buf_fix_count,
				(ulong)block->io_fix,
				(ulong)block->state);
		}
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
		       (void*)block->frame, (void*)block);
	}

	/* Wake a possible simulated aio thread to actually post the
	writes to the operating system */

	os_aio_simulated_wake_handler_threads();

	/* Wait until all async writes to tablespaces have been posted to
	the OS */

	os_aio_wait_until_no_pending_writes();

	/* Now we flush the data to disk (for example, with fsync) */

	fil_flush_file_spaces(FIL_TABLESPACE);

	/* We can now reuse the doublewrite memory buffer: */

	trx_doublewrite->first_free = 0;

	mutex_exit(&(trx_doublewrite->mutex));
}

/************************************************************************
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
	buf_block_t*	block)	/* in: buffer block to write */
{
try_again:
	mutex_enter(&(trx_doublewrite->mutex));

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	if (trx_doublewrite->first_free
	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		goto try_again;
	}

	ut_memcpy(trx_doublewrite->write_buf
		  + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
		  block->frame, UNIV_PAGE_SIZE);

	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block;

	trx_doublewrite->first_free++;

	if (trx_doublewrite->first_free
	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		return;
	}

	mutex_exit(&(trx_doublewrite->mutex));
}

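/* Capacity note: the memory buffer above holds at most
2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages; with the conventional 16 kB
UNIV_PAGE_SIZE and a block size of 64 pages that would be 128 pages,
or 2 MB. The code depends only on the symbolic constants, though, not
on these illustrative values. */
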
/************************************************************************
Initializes a page for writing to the tablespace. */

void
buf_flush_init_for_writing(
/*=======================*/
	byte*	page,		/* in: page */
	dulint	newest_lsn,	/* in: newest modification lsn to the page */
	ulint	space,		/* in: space id */
	ulint	page_no)	/* in: page number */
{
	/* Write the newest modification lsn to the page header and trailer */
	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);

	mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
			newest_lsn);

	/* Write the page number and the space id */

	mach_write_to_4(page + FIL_PAGE_OFFSET, page_no);
	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space);

	/* Store the new formula checksum */

	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
			srv_use_checksums
			? buf_calc_page_new_checksum(page)
			: BUF_NO_CHECKSUM_MAGIC);

	/* We overwrite the first 4 bytes of the end lsn field to store
	the old formula checksum. Since it depends also on the field
	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
	new formula checksum. */

	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
			srv_use_checksums
			? buf_calc_page_old_checksum(page)
			: BUF_NO_CHECKSUM_MAGIC);
}

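/* After buf_flush_init_for_writing the page thus carries,
schematically (standard InnoDB offsets assumed):

	header:	 FIL_PAGE_SPACE_OR_CHKSUM	new-formula checksum
		 FIL_PAGE_OFFSET		page number
		 FIL_PAGE_LSN			newest modification lsn (8 bytes)
		 FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID  space id
	...
	trailer: FIL_PAGE_END_LSN_OLD_CHKSUM (8 bytes from the page end):
		 4 bytes of old-formula checksum, then the low 4 bytes
		 of the lsn, which the doublewrite code compares against
		 FIL_PAGE_LSN + 4 to detect torn pages. */
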
/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
	buf_block_t*	block)	/* in: buffer block to write */
{
#ifdef UNIV_LOG_DEBUG
	static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

#ifdef UNIV_IBUF_DEBUG
	ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
	ut_ad(!ut_dulint_is_zero(block->newest_modification));

#ifdef UNIV_LOG_DEBUG
	if (!univ_log_debug_warned) {
		univ_log_debug_warned = TRUE;
		fputs("Warning: cannot force log to disk if"
		      " UNIV_LOG_DEBUG is defined!\n"
		      "Crash recovery will not work!\n",
		      stderr);
	}
#else
	/* Force the log to the disk before writing the modified block */
	log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
	buf_flush_init_for_writing(block->frame, block->newest_modification,
				   block->space, block->offset);
	if (!srv_use_doublewrite_buf || !trx_doublewrite) {
		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
		       (void*)block->frame, (void*)block);
	} else {
		buf_flush_post_to_doublewrite_buf(block);
	}
}

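/* The log_write_up_to call above enforces the write-ahead-logging
rule: the redo log must be durable at least up to the page's
newest_modification lsn before the page itself may reach the data
file; otherwise a crash could leave a data page whose changes are
missing from the log. */
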
/************************************************************************
Writes a page asynchronously from the buffer buf_pool to a file, if it can be
found in the buf_pool and it is in a flushable state. NOTE: in simulated aio
we must call os_aio_simulated_wake_handler_threads after we have posted a batch
of writes! */
static
ulint
buf_flush_try_page(
/*===============*/
				/* out: 1 if a page was flushed, 0 otherwise */
	ulint	space,		/* in: space id */
	ulint	offset,		/* in: page offset */
	ulint	flush_type)	/* in: BUF_FLUSH_LRU, BUF_FLUSH_LIST, or
				BUF_FLUSH_SINGLE_PAGE */
{
	buf_block_t*	block;
	ibool		locked;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
	      || flush_type == BUF_FLUSH_SINGLE_PAGE);

	mutex_enter(&(buf_pool->mutex));

	block = buf_page_hash_get(space, offset);

	ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);

	if (!block) {
		mutex_exit(&(buf_pool->mutex));
		return(0);
	}

	mutex_enter(&block->mutex);

	if (flush_type == BUF_FLUSH_LIST
	    && buf_flush_ready_for_flush(block, flush_type)) {

		block->io_fix = BUF_IO_WRITE;

		/* If AWE is enabled and the page is not mapped to a frame,
		then map it */

		if (block->frame == NULL) {
			ut_a(srv_use_awe);

			/* We set second parameter TRUE because the block is
			in the LRU list and we must put it to
			awe_LRU_free_mapped list once mapped to a frame */

			buf_awe_map_page_to_frame(block, TRUE);
		}

		block->flush_type = flush_type;

		if (buf_pool->n_flush[flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[flush_type]);
		}

		(buf_pool->n_flush[flush_type])++;

		locked = FALSE;

		/* If the simulated aio thread is not running, we must
		not wait for any latch, as we may end up in a deadlock:
		if buf_fix_count == 0, then we know we need not wait */

		if (block->buf_fix_count == 0) {
			rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

			locked = TRUE;
		}

		mutex_exit(&block->mutex);
		mutex_exit(&(buf_pool->mutex));

		if (!locked) {
			buf_flush_buffered_writes();

			rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
		}

#ifdef UNIV_DEBUG
		if (buf_debug_prints) {
			fprintf(stderr,
				"Flushing page space %lu, page no %lu \n",
				(ulong) block->space, (ulong) block->offset);
		}
#endif /* UNIV_DEBUG */

		buf_flush_write_block_low(block);

		return(1);

	} else if (flush_type == BUF_FLUSH_LRU
		   && buf_flush_ready_for_flush(block, flush_type)) {

		/* VERY IMPORTANT:
		Because any thread may call the LRU flush, even when owning
		locks on pages, to avoid deadlocks, we must make sure that the
		s-lock is acquired on the page without waiting: this is
		accomplished because in the if-condition above we require
		the page not to be bufferfixed (in function
		..._ready_for_flush). */

		block->io_fix = BUF_IO_WRITE;

		/* If AWE is enabled and the page is not mapped to a frame,
		then map it */

		if (block->frame == NULL) {
			ut_a(srv_use_awe);

			/* We set second parameter TRUE because the block is
			in the LRU list and we must put it to
			awe_LRU_free_mapped list once mapped to a frame */

			buf_awe_map_page_to_frame(block, TRUE);
		}

		block->flush_type = flush_type;

		if (buf_pool->n_flush[flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[flush_type]);
		}

		(buf_pool->n_flush[flush_type])++;

		rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

		/* Note that the s-latch is acquired before releasing the
		buf_pool mutex: this ensures that the latch is acquired
		immediately. */

		mutex_exit(&block->mutex);
		mutex_exit(&(buf_pool->mutex));

		buf_flush_write_block_low(block);

		return(1);

	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE
		   && buf_flush_ready_for_flush(block, flush_type)) {

		block->io_fix = BUF_IO_WRITE;

		/* If AWE is enabled and the page is not mapped to a frame,
		then map it */

		if (block->frame == NULL) {
			ut_a(srv_use_awe);

			/* We set second parameter TRUE because the block is
			in the LRU list and we must put it to
			awe_LRU_free_mapped list once mapped to a frame */

			buf_awe_map_page_to_frame(block, TRUE);
		}

		block->flush_type = flush_type;

		if (buf_pool->n_flush[block->flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[block->flush_type]);
		}

		(buf_pool->n_flush[flush_type])++;

		mutex_exit(&block->mutex);
		mutex_exit(&(buf_pool->mutex));

		rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

#ifdef UNIV_DEBUG
		if (buf_debug_prints) {
			fprintf(stderr,
				"Flushing single page space %lu,"
				" page no %lu \n",
				(ulong) block->space,
				(ulong) block->offset);
		}
#endif /* UNIV_DEBUG */

		buf_flush_write_block_low(block);

		return(1);
	}

	mutex_exit(&block->mutex);
	mutex_exit(&(buf_pool->mutex));

	return(0);
}

/***************************************************************
Flushes to disk all flushable pages within the flush area. */
static
ulint
buf_flush_try_neighbors(
/*====================*/
				/* out: number of pages flushed */
	ulint	space,		/* in: space id */
	ulint	offset,		/* in: page offset */
	ulint	flush_type)	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	buf_block_t*	block;
	ulint		low, high;
	ulint		count		= 0;
	ulint		i;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	low = (offset / BUF_FLUSH_AREA) * BUF_FLUSH_AREA;
	high = (offset / BUF_FLUSH_AREA + 1) * BUF_FLUSH_AREA;

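	/* For example, if BUF_FLUSH_AREA evaluates to 64 and offset is
	100, then low = (100 / 64) * 64 = 64 and high = 128: the
	neighborhood is the aligned 64-page range containing the page. */
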
	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
		/* If there is little space, it is better not to flush any
		block except from the end of the LRU list */

		low = offset;
		high = offset + 1;
	}

	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

	if (high > fil_space_get_size(space)) {
		high = fil_space_get_size(space);
	}

	mutex_enter(&(buf_pool->mutex));

	for (i = low; i < high; i++) {

		block = buf_page_hash_get(space, i);
		ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);

		if (!block) {

			continue;

		} else if (flush_type == BUF_FLUSH_LRU && i != offset
			   && !block->old) {

			/* We avoid flushing 'non-old' blocks in an LRU flush,
			because the flushed blocks are soon freed */

			continue;
		} else {

			mutex_enter(&block->mutex);

			if (buf_flush_ready_for_flush(block, flush_type)
			    && (i == offset || block->buf_fix_count == 0)) {
				/* We only try to flush those
				neighbors != offset where the buf fix count is
				zero, as we then know that we probably can
				latch the page without a semaphore wait.
				Semaphore waits are expensive because we must
				flush the doublewrite buffer before we start
				waiting. */

				mutex_exit(&block->mutex);

				mutex_exit(&(buf_pool->mutex));

				/* Note: as we release the buf_pool mutex
				above, in buf_flush_try_page we cannot be sure
				the page is still in a flushable state:
				therefore we check it again inside that
				function. */

				count += buf_flush_try_page(space, i,
							    flush_type);

				mutex_enter(&(buf_pool->mutex));
			} else {
				mutex_exit(&block->mutex);
			}
		}
	}

	mutex_exit(&(buf_pool->mutex));

	return(count);
}

/***********************************************************************
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages! */

ulint
buf_flush_batch(
/*============*/
				/* out: number of blocks for which the write
				request was queued; ULINT_UNDEFINED if there
				was a flush of the same type already running */
	ulint	flush_type,	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
				BUF_FLUSH_LIST, then the caller must not own
				any latches on pages */
	ulint	min_n,		/* in: wished minimum number of blocks flushed
				(it is not guaranteed that the actual number
				is that big, though) */
	dulint	lsn_limit)	/* in: in the case of BUF_FLUSH_LIST, all
				blocks whose oldest_modification is smaller
				than this should be flushed (if their number
				does not exceed min_n), otherwise ignored */
{
	buf_block_t*	block;
	ulint		page_count	= 0;
	ulint		old_page_count;
	ulint		space;
	ulint		offset;
	ibool		found;

	ut_ad((flush_type == BUF_FLUSH_LRU)
	      || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
	ut_ad((flush_type != BUF_FLUSH_LIST)
	      || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
	mutex_enter(&(buf_pool->mutex));

	if ((buf_pool->n_flush[flush_type] > 0)
	    || (buf_pool->init_flush[flush_type] == TRUE)) {

		/* There is already a flush batch of the same type running */

		mutex_exit(&(buf_pool->mutex));

		return(ULINT_UNDEFINED);
	}

	(buf_pool->init_flush)[flush_type] = TRUE;

	for (;;) {
		/* If we have flushed enough, leave the loop */
		if (page_count >= min_n) {

			break;
		}

		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		if (flush_type == BUF_FLUSH_LRU) {
			block = UT_LIST_GET_LAST(buf_pool->LRU);
		} else {
			ut_ad(flush_type == BUF_FLUSH_LIST);

			block = UT_LIST_GET_LAST(buf_pool->flush_list);
			if (!block
			    || (ut_dulint_cmp(block->oldest_modification,
					      lsn_limit) >= 0)) {
				/* We have flushed enough */

				break;
			}
		}

		found = FALSE;

		/* Note that after finding a single flushable page, we try to
		flush also all its neighbors, and after that start from the
		END of the LRU list or flush list again: the list may change
		during the flushing and we cannot safely preserve within this
		function a pointer to a block in the list! */

		while ((block != NULL) && !found) {
			ut_a(block->state == BUF_BLOCK_FILE_PAGE);

			mutex_enter(&block->mutex);

			if (buf_flush_ready_for_flush(block, flush_type)) {

				found = TRUE;
				space = block->space;
				offset = block->offset;

				mutex_exit(&block->mutex);
				mutex_exit(&(buf_pool->mutex));

				old_page_count = page_count;

				/* Try to flush also all the neighbors */
				page_count += buf_flush_try_neighbors(
					space, offset, flush_type);
				/* fprintf(stderr,
				"Flush type %lu, page no %lu, neighb %lu\n",
				flush_type, offset,
				page_count - old_page_count); */

				mutex_enter(&(buf_pool->mutex));

			} else if (flush_type == BUF_FLUSH_LRU) {

				mutex_exit(&block->mutex);

				block = UT_LIST_GET_PREV(LRU, block);
			} else {
				ut_ad(flush_type == BUF_FLUSH_LIST);

				mutex_exit(&block->mutex);

				block = UT_LIST_GET_PREV(flush_list, block);
			}
		}

		/* If we could not find anything to flush, leave the loop */

		if (!found) {
			break;
		}
	}

	(buf_pool->init_flush)[flush_type] = FALSE;

	if ((buf_pool->n_flush[flush_type] == 0)
	    && (buf_pool->init_flush[flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	mutex_exit(&(buf_pool->mutex));

	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		ut_a(flush_type == BUF_FLUSH_LRU
		     || flush_type == BUF_FLUSH_LIST);
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += page_count;

	return(page_count);
}

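/* A usage sketch (not part of this file; names are illustrative): a
checkpoint-driven caller would typically request a flush list batch
with an lsn limit, e.g.

	n = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, new_oldest_lsn);
	if (n == ULINT_UNDEFINED) {
		buf_flush_wait_batch_end(BUF_FLUSH_LIST);
	}

whereas buf_flush_free_margin below issues LRU batches with
lsn_limit = ut_dulint_zero, which the BUF_FLUSH_LRU path ignores. */
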
/**********************************************************************
Waits until a flush batch of the given type ends. */

void
buf_flush_wait_batch_end(
/*=====================*/
	ulint	type)	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));

	os_event_wait(buf_pool->no_flush[type]);
}

/**********************************************************************
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list. */
static
ulint
buf_flush_LRU_recommendation(void)
/*==============================*/
			/* out: number of blocks which should be flushed
			from the end of the LRU list */
{
	buf_block_t*	block;
	ulint		n_replaceable;
	ulint		distance	= 0;

	mutex_enter(&(buf_pool->mutex));

	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

	block = UT_LIST_GET_LAST(buf_pool->LRU);

	while ((block != NULL)
	       && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
		   + BUF_FLUSH_EXTRA_MARGIN)
	       && (distance < BUF_LRU_FREE_SEARCH_LEN)) {

		mutex_enter(&block->mutex);

		if (buf_flush_ready_for_replace(block)) {
			n_replaceable++;
		}

		mutex_exit(&block->mutex);

		distance++;

		block = UT_LIST_GET_PREV(LRU, block);
	}

	mutex_exit(&(buf_pool->mutex));

	if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {

		return(0);
	}

	return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
	       - n_replaceable);
}

/*************************************************************************
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */

void
buf_flush_free_margin(void)
/*=======================*/
{
	ulint	n_to_flush;
	ulint	n_flushed;

	n_to_flush = buf_flush_LRU_recommendation();

	if (n_to_flush > 0) {
		n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush,
					    ut_dulint_zero);
		if (n_flushed == ULINT_UNDEFINED) {
			/* There was an LRU type flush batch already running;
			let us wait for it to end */

			buf_flush_wait_batch_end(BUF_FLUSH_LRU);
		}
	}
}

/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void)
/*========================*/
		/* out: TRUE if ok */
{
	buf_block_t*	block;
	dulint		om;

	UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list);

	block = UT_LIST_GET_FIRST(buf_pool->flush_list);

	while (block != NULL) {
		om = block->oldest_modification;
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);
		ut_a(ut_dulint_cmp(om, ut_dulint_zero) > 0);

		block = UT_LIST_GET_NEXT(flush_list, block);

		if (block) {
			ut_a(ut_dulint_cmp(om, block->oldest_modification)
			     >= 0);
		}
	}

	return(TRUE);
}

/**********************************************************************
Validates the flush list. */

ibool
buf_flush_validate(void)
/*====================*/
		/* out: TRUE if ok */
{
	ibool	ret;

	mutex_enter(&(buf_pool->mutex));

	ret = buf_flush_validate_low();

	mutex_exit(&(buf_pool->mutex));

	return(ret);
}