~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/******************************************************
2
Database log
3
4
(c) 1995-1997 Innobase Oy
5
6
Created 12/9/1995 Heikki Tuuri
7
*******************************************************/
8
9
#include "log0log.h"
10
11
#ifdef UNIV_NONINL
12
#include "log0log.ic"
13
#endif
14
15
#include "mem0mem.h"
16
#include "buf0buf.h"
17
#include "buf0flu.h"
18
#include "srv0srv.h"
19
#include "log0recv.h"
20
#include "fil0fil.h"
21
#include "dict0boot.h"
22
#include "srv0srv.h"
23
#include "srv0start.h"
24
#include "trx0sys.h"
25
#include "trx0trx.h"
26
27
/*
28
General philosophy of InnoDB redo-logs:
29
30
1) Every change to the contents of a data page must be done
31
through mtr, which in mtr_commit() writes log records
32
to the InnoDB redo log.
33
34
2) Normally these changes are performed using a mlog_write_ulint()
35
or similar function.
36
37
3) In some page level operations only a code number of a
38
c-function and its parameters are written to the log to
39
reduce the size of the log.
40
41
  3a) You should not add parameters to these kinds of functions
42
  (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse())
43
44
  3b) You should not add functionality which either changes the
45
  behavior compared with the old version or is dependent on data
46
  outside of the page. These kinds of functions should implement a
47
  self-contained page transformation, and they should stay unchanged
48
  unless you have very essential reasons to change the log
49
  semantics or format.
50
51
*/
52
53
/* Current free limit of space 0; protected by the log sys mutex; 0 means
54
uninitialized */
55
ulint	log_fsp_current_free_limit		= 0;
56
57
/* Global log system variable */
58
log_t*	log_sys	= NULL;
59
60
#ifdef UNIV_DEBUG
61
ibool	log_do_write = TRUE;
62
63
ibool	log_debug_writes = FALSE;
64
#endif /* UNIV_DEBUG */
65
66
/* These control how often we print warnings if the last checkpoint is too
67
old */
68
ibool	log_has_printed_chkp_warning = FALSE;
69
time_t	log_last_warning_time;
70
71
#ifdef UNIV_LOG_ARCHIVE
72
/* Pointer to this variable is used as the i/o-message when we do i/o to an
73
archive */
74
byte	log_archive_io;
75
#endif /* UNIV_LOG_ARCHIVE */
76
77
/* A margin for free space in the log buffer before a log entry is catenated */
78
#define LOG_BUF_WRITE_MARGIN	(4 * OS_FILE_LOG_BLOCK_SIZE)
79
80
/* Margins for free space in the log buffer after a log entry is catenated */
81
#define LOG_BUF_FLUSH_RATIO	2
82
#define LOG_BUF_FLUSH_MARGIN	(LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE)
83
84
/* Margin for the free space in the smallest log group, before a new query
85
step which modifies the database, is started */
86
87
#define LOG_CHECKPOINT_FREE_PER_THREAD	(4 * UNIV_PAGE_SIZE)
88
#define LOG_CHECKPOINT_EXTRA_FREE	(8 * UNIV_PAGE_SIZE)
89
90
/* This parameter controls asynchronous making of a new checkpoint; the value
91
should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */
92
93
#define LOG_POOL_CHECKPOINT_RATIO_ASYNC	32
94
95
/* This parameter controls synchronous preflushing of modified buffer pages */
96
#define LOG_POOL_PREFLUSH_RATIO_SYNC	16
97
98
/* The same ratio for asynchronous preflushing; this value should be less than
99
the previous */
100
#define LOG_POOL_PREFLUSH_RATIO_ASYNC	8
101
102
/* Extra margin, in addition to one log file, used in archiving */
103
#define LOG_ARCHIVE_EXTRA_MARGIN	(4 * UNIV_PAGE_SIZE)
104
105
/* This parameter controls asynchronous writing to the archive */
106
#define LOG_ARCHIVE_RATIO_ASYNC		16
107
108
/* Codes used in unlocking flush latches */
109
#define LOG_UNLOCK_NONE_FLUSHED_LOCK	1
110
#define LOG_UNLOCK_FLUSH_LOCK		2
111
112
/* States of an archiving operation */
113
#define	LOG_ARCHIVE_READ	1
114
#define	LOG_ARCHIVE_WRITE	2
115
116
/**********************************************************
117
Completes a checkpoint write i/o to a log file. */
118
static
119
void
120
log_io_complete_checkpoint(void);
121
/*============================*/
122
#ifdef UNIV_LOG_ARCHIVE
123
/**********************************************************
124
Completes an archiving i/o. */
125
static
126
void
127
log_io_complete_archive(void);
128
/*=========================*/
129
#endif /* UNIV_LOG_ARCHIVE */
130
131
/********************************************************************
132
Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint,
133
so that we know that the limit has been written to a log checkpoint field
134
on disk. */
135
136
void
137
log_fsp_current_free_limit_set_and_checkpoint(
138
/*==========================================*/
139
	ulint	limit)	/* in: limit to set */
140
{
141
	ibool	success;
142
143
	mutex_enter(&(log_sys->mutex));
144
145
	log_fsp_current_free_limit = limit;
146
147
	mutex_exit(&(log_sys->mutex));
148
149
	/* Try to make a synchronous checkpoint */
150
151
	success = FALSE;
152
153
	while (!success) {
154
		success = log_checkpoint(TRUE, TRUE);
155
	}
156
}
157
158
/********************************************************************
159
Returns the oldest modified block lsn in the pool, or log_sys->lsn if none
160
exists. */
161
static
162
dulint
163
log_buf_pool_get_oldest_modification(void)
164
/*======================================*/
165
{
166
	dulint	lsn;
167
168
	ut_ad(mutex_own(&(log_sys->mutex)));
169
170
	lsn = buf_pool_get_oldest_modification();
171
172
	if (ut_dulint_is_zero(lsn)) {
173
174
		lsn = log_sys->lsn;
175
	}
176
177
	return(lsn);
178
}
179
180
/****************************************************************
181
Opens the log for log_write_low. The log must be closed with log_close and
182
released with log_release. */
183
184
dulint
185
log_reserve_and_open(
186
/*=================*/
187
			/* out: start lsn of the log record */
188
	ulint	len)	/* in: length of data to be catenated */
189
{
190
	log_t*	log			= log_sys;
191
	ulint	len_upper_limit;
192
#ifdef UNIV_LOG_ARCHIVE
193
	ulint	archived_lsn_age;
194
	ulint	dummy;
195
#endif /* UNIV_LOG_ARCHIVE */
196
#ifdef UNIV_DEBUG
197
	ulint	count			= 0;
198
#endif /* UNIV_DEBUG */
199
200
	ut_a(len < log->buf_size / 2);
201
loop:
202
	mutex_enter(&(log->mutex));
203
204
	/* Calculate an upper limit for the space the string may take in the
205
	log buffer */
206
207
	len_upper_limit = LOG_BUF_WRITE_MARGIN + (5 * len) / 4;
208
209
	if (log->buf_free + len_upper_limit > log->buf_size) {
210
211
		mutex_exit(&(log->mutex));
212
213
		/* Not enough free space, do a syncronous flush of the log
214
		buffer */
215
216
		log_buffer_flush_to_disk();
217
218
		srv_log_waits++;
219
220
		ut_ad(++count < 50);
221
222
		goto loop;
223
	}
224
225
#ifdef UNIV_LOG_ARCHIVE
226
	if (log->archiving_state != LOG_ARCH_OFF) {
227
228
		archived_lsn_age = ut_dulint_minus(log->lsn,
229
						   log->archived_lsn);
230
		if (archived_lsn_age + len_upper_limit
231
		    > log->max_archived_lsn_age) {
232
			/* Not enough free archived space in log groups: do a
233
			synchronous archive write batch: */
234
235
			mutex_exit(&(log->mutex));
236
237
			ut_ad(len_upper_limit <= log->max_archived_lsn_age);
238
239
			log_archive_do(TRUE, &dummy);
240
241
			ut_ad(++count < 50);
242
243
			goto loop;
244
		}
245
	}
246
#endif /* UNIV_LOG_ARCHIVE */
247
248
#ifdef UNIV_LOG_DEBUG
249
	log->old_buf_free = log->buf_free;
250
	log->old_lsn = log->lsn;
251
#endif
252
	return(log->lsn);
253
}
254
255
/****************************************************************
256
Writes to the log the string given. It is assumed that the caller holds the
257
log mutex. */
258
259
void
260
log_write_low(
261
/*==========*/
262
	byte*	str,		/* in: string */
263
	ulint	str_len)	/* in: string length */
264
{
265
	log_t*	log	= log_sys;
266
	ulint	len;
267
	ulint	data_len;
268
	byte*	log_block;
269
270
	ut_ad(mutex_own(&(log->mutex)));
271
part_loop:
272
	/* Calculate a part length */
273
274
	data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;
275
276
	if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
277
278
		/* The string fits within the current log block */
279
280
		len = str_len;
281
	} else {
282
		data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
283
284
		len = OS_FILE_LOG_BLOCK_SIZE
285
			- (log->buf_free % OS_FILE_LOG_BLOCK_SIZE)
286
			- LOG_BLOCK_TRL_SIZE;
287
	}
288
289
	ut_memcpy(log->buf + log->buf_free, str, len);
290
291
	str_len -= len;
292
	str = str + len;
293
294
	log_block = ut_align_down(log->buf + log->buf_free,
295
				  OS_FILE_LOG_BLOCK_SIZE);
296
	log_block_set_data_len(log_block, data_len);
297
298
	if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
299
		/* This block became full */
300
		log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
301
		log_block_set_checkpoint_no(log_block,
302
					    log_sys->next_checkpoint_no);
303
		len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;
304
305
		log->lsn = ut_dulint_add(log->lsn, len);
306
307
		/* Initialize the next block header */
308
		log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn);
309
	} else {
310
		log->lsn = ut_dulint_add(log->lsn, len);
311
	}
312
313
	log->buf_free += len;
314
315
	ut_ad(log->buf_free <= log->buf_size);
316
317
	if (str_len > 0) {
318
		goto part_loop;
319
	}
320
321
	srv_log_write_requests++;
322
}
323
324
/****************************************************************
325
Closes the log. */
326
327
dulint
328
log_close(void)
329
/*===========*/
330
			/* out: lsn */
331
{
332
	byte*	log_block;
333
	ulint	first_rec_group;
334
	dulint	oldest_lsn;
335
	dulint	lsn;
336
	log_t*	log	= log_sys;
337
	ulint	checkpoint_age;
338
339
	ut_ad(mutex_own(&(log->mutex)));
340
341
	lsn = log->lsn;
342
343
	log_block = ut_align_down(log->buf + log->buf_free,
344
				  OS_FILE_LOG_BLOCK_SIZE);
345
	first_rec_group = log_block_get_first_rec_group(log_block);
346
347
	if (first_rec_group == 0) {
348
		/* We initialized a new log block which was not written
349
		full by the current mtr: the next mtr log record group
350
		will start within this block at the offset data_len */
351
352
		log_block_set_first_rec_group(
353
			log_block, log_block_get_data_len(log_block));
354
	}
355
356
	if (log->buf_free > log->max_buf_free) {
357
358
		log->check_flush_or_checkpoint = TRUE;
359
	}
360
361
	checkpoint_age = ut_dulint_minus(lsn, log->last_checkpoint_lsn);
362
363
	if (checkpoint_age >= log->log_group_capacity) {
364
		/* TODO: split btr_store_big_rec_extern_fields() into small
365
		steps so that we can release all latches in the middle, and
366
		call log_free_check() to ensure we never write over log written
367
		after the latest checkpoint. In principle, we should split all
368
		big_rec operations, but other operations are smaller. */
369
370
		if (!log_has_printed_chkp_warning
371
		    || difftime(time(NULL), log_last_warning_time) > 15) {
372
373
			log_has_printed_chkp_warning = TRUE;
374
			log_last_warning_time = time(NULL);
375
376
			ut_print_timestamp(stderr);
377
			fprintf(stderr,
378
				"  InnoDB: ERROR: the age of the last"
379
				" checkpoint is %lu,\n"
380
				"InnoDB: which exceeds the log group"
381
				" capacity %lu.\n"
382
				"InnoDB: If you are using big"
383
				" BLOB or TEXT rows, you must set the\n"
384
				"InnoDB: combined size of log files"
385
				" at least 10 times bigger than the\n"
386
				"InnoDB: largest such row.\n",
387
				(ulong) checkpoint_age,
388
				(ulong) log->log_group_capacity);
389
		}
390
	}
391
392
	if (checkpoint_age <= log->max_modified_age_async) {
393
394
		goto function_exit;
395
	}
396
397
	oldest_lsn = buf_pool_get_oldest_modification();
398
399
	if (ut_dulint_is_zero(oldest_lsn)
400
	    || (ut_dulint_minus(lsn, oldest_lsn)
401
		> log->max_modified_age_async)
402
	    || checkpoint_age > log->max_checkpoint_age_async) {
403
404
		log->check_flush_or_checkpoint = TRUE;
405
	}
406
function_exit:
407
408
#ifdef UNIV_LOG_DEBUG
409
	log_check_log_recs(log->buf + log->old_buf_free,
410
			   log->buf_free - log->old_buf_free, log->old_lsn);
411
#endif
412
413
	return(lsn);
414
}
415
#ifdef UNIV_LOG_ARCHIVE
/**********************************************************
Pads the current log block full with dummy log records. Used in producing
consistent archived log files. The padding is written one MLOG_DUMMY_RECORD
byte at a time through log_write_low(). */
static
void
log_pad_current_log_block(void)
/*===========================*/
{
	byte	dummy_rec	= MLOG_DUMMY_RECORD;
	ulint	remaining;
	dulint	lsn;

	/* We retrieve lsn only because otherwise gcc crashed on HP-UX */
	lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);

	/* The number of data bytes still free in the current block */
	remaining = OS_FILE_LOG_BLOCK_SIZE
		- (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE)
		- LOG_BLOCK_TRL_SIZE;

	for (; remaining > 0; remaining--) {
		log_write_low(&dummy_rec, 1);
	}

	lsn = log_sys->lsn;

	log_close();
	log_release();

	/* The lsn must now point just past the header of the next block */
	ut_a((ut_dulint_get_low(lsn) % OS_FILE_LOG_BLOCK_SIZE)
	     == LOG_BLOCK_HDR_SIZE);
}
#endif /* UNIV_LOG_ARCHIVE */
450
451
/**********************************************************
452
Calculates the data capacity of a log group, when the log file headers are not
453
included. */
454
455
ulint
456
log_group_get_capacity(
457
/*===================*/
458
				/* out: capacity in bytes */
459
	log_group_t*	group)	/* in: log group */
460
{
461
	ut_ad(mutex_own(&(log_sys->mutex)));
462
463
	return((group->file_size - LOG_FILE_HDR_SIZE) * group->n_files);
464
}
465
466
/**********************************************************
467
Calculates the offset within a log group, when the log file headers are not
468
included. */
469
UNIV_INLINE
470
ulint
471
log_group_calc_size_offset(
472
/*=======================*/
473
				/* out: size offset (<= offset) */
474
	ulint		offset,	/* in: real offset within the log group */
475
	log_group_t*	group)	/* in: log group */
476
{
477
	ut_ad(mutex_own(&(log_sys->mutex)));
478
479
	return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size));
480
}
481
482
/**********************************************************
483
Calculates the offset within a log group, when the log file headers are
484
included. */
485
UNIV_INLINE
486
ulint
487
log_group_calc_real_offset(
488
/*=======================*/
489
				/* out: real offset (>= offset) */
490
	ulint		offset,	/* in: size offset within the log group */
491
	log_group_t*	group)	/* in: log group */
492
{
493
	ut_ad(mutex_own(&(log_sys->mutex)));
494
495
	return(offset + LOG_FILE_HDR_SIZE
496
	       * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE)));
497
}
498
499
/**********************************************************
500
Calculates the offset of an lsn within a log group. */
501
static
502
ulint
503
log_group_calc_lsn_offset(
504
/*======================*/
505
				/* out: offset within the log group */
506
	dulint		lsn,	/* in: lsn, must be within 4 GB of
507
				group->lsn */
508
	log_group_t*	group)	/* in: log group */
509
{
510
	dulint		gr_lsn;
511
	ib_longlong	gr_lsn_size_offset;
512
	ib_longlong	difference;
513
	ib_longlong	group_size;
514
	ib_longlong	offset;
515
516
	ut_ad(mutex_own(&(log_sys->mutex)));
517
518
	/* If total log file size is > 2 GB we can easily get overflows
519
	with 32-bit integers. Use 64-bit integers instead. */
520
521
	gr_lsn = group->lsn;
522
523
	gr_lsn_size_offset = (ib_longlong)
524
		log_group_calc_size_offset(group->lsn_offset, group);
525
526
	group_size = (ib_longlong) log_group_get_capacity(group);
527
528
	if (ut_dulint_cmp(lsn, gr_lsn) >= 0) {
529
530
		difference = (ib_longlong) ut_dulint_minus(lsn, gr_lsn);
531
	} else {
532
		difference = (ib_longlong) ut_dulint_minus(gr_lsn, lsn);
533
534
		difference = difference % group_size;
535
536
		difference = group_size - difference;
537
	}
538
539
	offset = (gr_lsn_size_offset + difference) % group_size;
540
541
	ut_a(offset < (((ib_longlong) 1) << 32)); /* offset must be < 4 GB */
542
543
	/* fprintf(stderr,
544
	"Offset is %lu gr_lsn_offset is %lu difference is %lu\n",
545
	(ulint)offset,(ulint)gr_lsn_size_offset, (ulint)difference);
546
	*/
547
548
	return(log_group_calc_real_offset((ulint)offset, group));
549
}
550
551
/***********************************************************************
552
Calculates where in log files we find a specified lsn. */
553
554
ulint
555
log_calc_where_lsn_is(
556
/*==================*/
557
						/* out: log file number */
558
	ib_longlong*	log_file_offset,	/* out: offset in that file
559
						(including the header) */
560
	dulint		first_header_lsn,	/* in: first log file start
561
						lsn */
562
	dulint		lsn,			/* in: lsn whose position to
563
						determine */
564
	ulint		n_log_files,		/* in: total number of log
565
						files */
566
	ib_longlong	log_file_size)		/* in: log file size
567
						(including the header) */
568
{
569
	ib_longlong	ib_lsn;
570
	ib_longlong	ib_first_header_lsn;
571
	ib_longlong	capacity	= log_file_size - LOG_FILE_HDR_SIZE;
572
	ulint		file_no;
573
	ib_longlong	add_this_many;
574
575
	ib_lsn = ut_conv_dulint_to_longlong(lsn);
576
	ib_first_header_lsn = ut_conv_dulint_to_longlong(first_header_lsn);
577
578
	if (ib_lsn < ib_first_header_lsn) {
579
		add_this_many = 1 + (ib_first_header_lsn - ib_lsn)
580
			/ (capacity * (ib_longlong)n_log_files);
581
		ib_lsn += add_this_many
582
			* capacity * (ib_longlong)n_log_files;
583
	}
584
585
	ut_a(ib_lsn >= ib_first_header_lsn);
586
587
	file_no = ((ulint)((ib_lsn - ib_first_header_lsn) / capacity))
588
		% n_log_files;
589
	*log_file_offset = (ib_lsn - ib_first_header_lsn) % capacity;
590
591
	*log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE;
592
593
	return(file_no);
594
}
595
596
/************************************************************
597
Sets the field values in group to correspond to a given lsn. For this function
598
to work, the values must already be correctly initialized to correspond to
599
some lsn, for instance, a checkpoint lsn. */
600
601
void
602
log_group_set_fields(
603
/*=================*/
604
	log_group_t*	group,	/* in: group */
605
	dulint		lsn)	/* in: lsn for which the values should be
606
				set */
607
{
608
	group->lsn_offset = log_group_calc_lsn_offset(lsn, group);
609
	group->lsn = lsn;
610
}
611
612
/*********************************************************************
613
Calculates the recommended highest values for lsn - last_checkpoint_lsn,
614
lsn - buf_get_oldest_modification(), and lsn - max_archive_lsn_age. */
615
static
616
ibool
617
log_calc_max_ages(void)
618
/*===================*/
619
			/* out: error value FALSE if the smallest log group is
620
			too small to accommodate the number of OS threads in
621
			the database server */
622
{
623
	log_group_t*	group;
624
	ulint		margin;
625
	ulint		free;
626
	ibool		success		= TRUE;
627
	ulint		smallest_capacity;
628
	ulint		archive_margin;
629
	ulint		smallest_archive_margin;
630
631
	ut_ad(!mutex_own(&(log_sys->mutex)));
632
633
	mutex_enter(&(log_sys->mutex));
634
635
	group = UT_LIST_GET_FIRST(log_sys->log_groups);
636
637
	ut_ad(group);
638
639
	smallest_capacity = ULINT_MAX;
640
	smallest_archive_margin = ULINT_MAX;
641
642
	while (group) {
643
		if (log_group_get_capacity(group) < smallest_capacity) {
644
645
			smallest_capacity = log_group_get_capacity(group);
646
		}
647
648
		archive_margin = log_group_get_capacity(group)
649
			- (group->file_size - LOG_FILE_HDR_SIZE)
650
			- LOG_ARCHIVE_EXTRA_MARGIN;
651
652
		if (archive_margin < smallest_archive_margin) {
653
654
			smallest_archive_margin = archive_margin;
655
		}
656
657
		group = UT_LIST_GET_NEXT(log_groups, group);
658
	}
659
660
	/* Add extra safety */
661
	smallest_capacity = smallest_capacity - smallest_capacity / 10;
662
663
	/* For each OS thread we must reserve so much free space in the
664
	smallest log group that it can accommodate the log entries produced
665
	by single query steps: running out of free log space is a serious
666
	system error which requires rebooting the database. */
667
668
	free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
669
		+ LOG_CHECKPOINT_EXTRA_FREE;
670
	if (free >= smallest_capacity / 2) {
671
		success = FALSE;
672
673
		goto failure;
674
	} else {
675
		margin = smallest_capacity - free;
676
	}
677
678
	margin = ut_min(margin, log_sys->adm_checkpoint_interval);
679
680
	margin = margin - margin / 10;	/* Add still some extra safety */
681
682
	log_sys->log_group_capacity = smallest_capacity;
683
684
	log_sys->max_modified_age_async = margin
685
		- margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
686
	log_sys->max_modified_age_sync = margin
687
		- margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
688
689
	log_sys->max_checkpoint_age_async = margin - margin
690
		/ LOG_POOL_CHECKPOINT_RATIO_ASYNC;
691
	log_sys->max_checkpoint_age = margin;
692
693
#ifdef UNIV_LOG_ARCHIVE
694
	log_sys->max_archived_lsn_age = smallest_archive_margin;
695
696
	log_sys->max_archived_lsn_age_async = smallest_archive_margin
697
		- smallest_archive_margin / LOG_ARCHIVE_RATIO_ASYNC;
698
#endif /* UNIV_LOG_ARCHIVE */
699
failure:
700
	mutex_exit(&(log_sys->mutex));
701
702
	if (!success) {
703
		fprintf(stderr,
704
			"InnoDB: Error: ib_logfiles are too small"
705
			" for innodb_thread_concurrency %lu.\n"
706
			"InnoDB: The combined size of ib_logfiles"
707
			" should be bigger than\n"
708
			"InnoDB: 200 kB * innodb_thread_concurrency.\n"
709
			"InnoDB: To get mysqld to start up, set"
710
			" innodb_thread_concurrency in my.cnf\n"
711
			"InnoDB: to a lower value, for example, to 8."
712
			" After an ERROR-FREE shutdown\n"
713
			"InnoDB: of mysqld you can adjust the size of"
714
			" ib_logfiles, as explained in\n"
715
			"InnoDB: http://dev.mysql.com/doc/refman/5.1/en/"
716
			"adding-and-removing.html\n"
717
			"InnoDB: Cannot continue operation."
718
			" Calling exit(1).\n",
719
			(ulong)srv_thread_concurrency);
720
721
		exit(1);
722
	}
723
724
	return(success);
725
}
726
727
/**********************************************************
728
Initializes the log. */
729
730
void
731
log_init(void)
732
/*==========*/
733
{
734
	byte*	buf;
735
736
	log_sys = mem_alloc(sizeof(log_t));
737
738
	mutex_create(&log_sys->mutex, SYNC_LOG);
739
740
	mutex_enter(&(log_sys->mutex));
741
742
	/* Start the lsn from one log block from zero: this way every
743
	log record has a start lsn != zero, a fact which we will use */
744
745
	log_sys->lsn = LOG_START_LSN;
746
747
	ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
748
	ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);
749
750
	buf = ut_malloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE);
751
	log_sys->buf = ut_align(buf, OS_FILE_LOG_BLOCK_SIZE);
752
753
	log_sys->buf_size = LOG_BUFFER_SIZE;
754
755
	memset(log_sys->buf, '\0', LOG_BUFFER_SIZE);
756
757
	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
758
		- LOG_BUF_FLUSH_MARGIN;
759
	log_sys->check_flush_or_checkpoint = TRUE;
760
	UT_LIST_INIT(log_sys->log_groups);
761
762
	log_sys->n_log_ios = 0;
763
764
	log_sys->n_log_ios_old = log_sys->n_log_ios;
765
	log_sys->last_printout_time = time(NULL);
766
	/*----------------------------*/
767
768
	log_sys->buf_next_to_write = 0;
769
770
	log_sys->write_lsn = ut_dulint_zero;
771
	log_sys->current_flush_lsn = ut_dulint_zero;
772
	log_sys->flushed_to_disk_lsn = ut_dulint_zero;
773
774
	log_sys->written_to_some_lsn = log_sys->lsn;
775
	log_sys->written_to_all_lsn = log_sys->lsn;
776
777
	log_sys->n_pending_writes = 0;
778
779
	log_sys->no_flush_event = os_event_create(NULL);
780
781
	os_event_set(log_sys->no_flush_event);
782
783
	log_sys->one_flushed_event = os_event_create(NULL);
784
785
	os_event_set(log_sys->one_flushed_event);
786
787
	/*----------------------------*/
788
	log_sys->adm_checkpoint_interval = ULINT_MAX;
789
790
	log_sys->next_checkpoint_no = ut_dulint_zero;
791
	log_sys->last_checkpoint_lsn = log_sys->lsn;
792
	log_sys->n_pending_checkpoint_writes = 0;
793
794
	rw_lock_create(&log_sys->checkpoint_lock, SYNC_NO_ORDER_CHECK);
795
796
	log_sys->checkpoint_buf
797
		= ut_align(mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE),
798
			   OS_FILE_LOG_BLOCK_SIZE);
799
	memset(log_sys->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
800
	/*----------------------------*/
801
802
#ifdef UNIV_LOG_ARCHIVE
803
	/* Under MySQL, log archiving is always off */
804
	log_sys->archiving_state = LOG_ARCH_OFF;
805
	log_sys->archived_lsn = log_sys->lsn;
806
	log_sys->next_archived_lsn = ut_dulint_zero;
807
808
	log_sys->n_pending_archive_ios = 0;
809
810
	rw_lock_create(&log_sys->archive_lock, SYNC_NO_ORDER_CHECK);
811
812
	log_sys->archive_buf = NULL;
813
814
	/* ut_align(
815
	ut_malloc(LOG_ARCHIVE_BUF_SIZE
816
	+ OS_FILE_LOG_BLOCK_SIZE),
817
	OS_FILE_LOG_BLOCK_SIZE); */
818
	log_sys->archive_buf_size = 0;
819
820
	/* memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE); */
821
822
	log_sys->archiving_on = os_event_create(NULL);
823
#endif /* UNIV_LOG_ARCHIVE */
824
825
	/*----------------------------*/
826
827
	log_block_init(log_sys->buf, log_sys->lsn);
828
	log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);
829
830
	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
831
	log_sys->lsn = ut_dulint_add(LOG_START_LSN, LOG_BLOCK_HDR_SIZE);
832
833
	mutex_exit(&(log_sys->mutex));
834
835
#ifdef UNIV_LOG_DEBUG
836
	recv_sys_create();
837
	recv_sys_init(FALSE, buf_pool_get_curr_size());
838
839
	recv_sys->parse_start_lsn = log_sys->lsn;
840
	recv_sys->scanned_lsn = log_sys->lsn;
841
	recv_sys->scanned_checkpoint_no = 0;
842
	recv_sys->recovered_lsn = log_sys->lsn;
843
	recv_sys->limit_lsn = ut_dulint_max;
844
#endif
845
}
846
847
/**********************************************************************
848
Inits a log group to the log system. */
849
850
void
851
log_group_init(
852
/*===========*/
853
	ulint	id,			/* in: group id */
854
	ulint	n_files,		/* in: number of log files */
855
	ulint	file_size,		/* in: log file size in bytes */
856
	ulint	space_id,		/* in: space id of the file space
857
					which contains the log files of this
858
					group */
859
	ulint	archive_space_id __attribute__((unused)))
860
					/* in: space id of the file space
861
					which contains some archived log
862
					files for this group; currently, only
863
					for the first log group this is
864
					used */
865
{
866
	ulint	i;
867
868
	log_group_t*	group;
869
870
	group = mem_alloc(sizeof(log_group_t));
871
872
	group->id = id;
873
	group->n_files = n_files;
874
	group->file_size = file_size;
875
	group->space_id = space_id;
876
	group->state = LOG_GROUP_OK;
877
	group->lsn = LOG_START_LSN;
878
	group->lsn_offset = LOG_FILE_HDR_SIZE;
879
	group->n_pending_writes = 0;
880
881
	group->file_header_bufs = mem_alloc(sizeof(byte*) * n_files);
882
#ifdef UNIV_LOG_ARCHIVE
883
	group->archive_file_header_bufs = mem_alloc(sizeof(byte*) * n_files);
884
#endif /* UNIV_LOG_ARCHIVE */
885
886
	for (i = 0; i < n_files; i++) {
887
		*(group->file_header_bufs + i) = ut_align(
888
			mem_alloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE),
889
			OS_FILE_LOG_BLOCK_SIZE);
890
891
		memset(*(group->file_header_bufs + i), '\0',
892
		       LOG_FILE_HDR_SIZE);
893
894
#ifdef UNIV_LOG_ARCHIVE
895
		*(group->archive_file_header_bufs + i) = ut_align(
896
			mem_alloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE),
897
			OS_FILE_LOG_BLOCK_SIZE);
898
		memset(*(group->archive_file_header_bufs + i), '\0',
899
		       LOG_FILE_HDR_SIZE);
900
#endif /* UNIV_LOG_ARCHIVE */
901
	}
902
903
#ifdef UNIV_LOG_ARCHIVE
904
	group->archive_space_id = archive_space_id;
905
906
	group->archived_file_no = 0;
907
	group->archived_offset = 0;
908
#endif /* UNIV_LOG_ARCHIVE */
909
910
	group->checkpoint_buf = ut_align(
911
		mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE), OS_FILE_LOG_BLOCK_SIZE);
912
913
	memset(group->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
914
915
	UT_LIST_ADD_LAST(log_groups, log_sys->log_groups, group);
916
917
	ut_a(log_calc_max_ages());
918
}
919
920
/**********************************************************************
921
Does the unlockings needed in flush i/o completion. */
922
UNIV_INLINE
923
void
924
log_flush_do_unlocks(
925
/*=================*/
926
	ulint	code)	/* in: any ORed combination of LOG_UNLOCK_FLUSH_LOCK
927
			and LOG_UNLOCK_NONE_FLUSHED_LOCK */
928
{
929
	ut_ad(mutex_own(&(log_sys->mutex)));
930
931
	/* NOTE that we must own the log mutex when doing the setting of the
932
	events: this is because transactions will wait for these events to
933
	be set, and at that moment the log flush they were waiting for must
934
	have ended. If the log mutex were not reserved here, the i/o-thread
935
	calling this function might be preempted for a while, and when it
936
	resumed execution, it might be that a new flush had been started, and
937
	this function would erroneously signal the NEW flush as completed.
938
	Thus, the changes in the state of these events are performed
939
	atomically in conjunction with the changes in the state of
940
	log_sys->n_pending_writes etc. */
941
942
	if (code & LOG_UNLOCK_NONE_FLUSHED_LOCK) {
943
		os_event_set(log_sys->one_flushed_event);
944
	}
945
946
	if (code & LOG_UNLOCK_FLUSH_LOCK) {
947
		os_event_set(log_sys->no_flush_event);
948
	}
949
}
950
951
/**********************************************************************
952
Checks if a flush is completed for a log group and does the completion
953
routine if yes. */
954
UNIV_INLINE
955
ulint
956
log_group_check_flush_completion(
957
/*=============================*/
958
				/* out: LOG_UNLOCK_NONE_FLUSHED_LOCK or 0 */
959
	log_group_t*	group)	/* in: log group */
960
{
961
	ut_ad(mutex_own(&(log_sys->mutex)));
962
963
	if (!log_sys->one_flushed && group->n_pending_writes == 0) {
964
#ifdef UNIV_DEBUG
965
		if (log_debug_writes) {
966
			fprintf(stderr,
967
				"Log flushed first to group %lu\n",
968
				(ulong) group->id);
969
		}
970
#endif /* UNIV_DEBUG */
971
		log_sys->written_to_some_lsn = log_sys->write_lsn;
972
		log_sys->one_flushed = TRUE;
973
974
		return(LOG_UNLOCK_NONE_FLUSHED_LOCK);
975
	}
976
977
#ifdef UNIV_DEBUG
978
	if (log_debug_writes && (group->n_pending_writes == 0)) {
979
980
		fprintf(stderr, "Log flushed to group %lu\n",
981
			(ulong) group->id);
982
	}
983
#endif /* UNIV_DEBUG */
984
	return(0);
985
}
986
987
/**********************************************************
988
Checks if a flush is completed and does the completion routine if yes. */
989
static
990
ulint
991
log_sys_check_flush_completion(void)
992
/*================================*/
993
			/* out: LOG_UNLOCK_FLUSH_LOCK or 0 */
994
{
995
	ulint	move_start;
996
	ulint	move_end;
997
998
	ut_ad(mutex_own(&(log_sys->mutex)));
999
1000
	if (log_sys->n_pending_writes == 0) {
1001
1002
		log_sys->written_to_all_lsn = log_sys->write_lsn;
1003
		log_sys->buf_next_to_write = log_sys->write_end_offset;
1004
1005
		if (log_sys->write_end_offset > log_sys->max_buf_free / 2) {
1006
			/* Move the log buffer content to the start of the
1007
			buffer */
1008
1009
			move_start = ut_calc_align_down(
1010
				log_sys->write_end_offset,
1011
				OS_FILE_LOG_BLOCK_SIZE);
1012
			move_end = ut_calc_align(log_sys->buf_free,
1013
						 OS_FILE_LOG_BLOCK_SIZE);
1014
1015
			ut_memmove(log_sys->buf, log_sys->buf + move_start,
1016
				   move_end - move_start);
1017
			log_sys->buf_free -= move_start;
1018
1019
			log_sys->buf_next_to_write -= move_start;
1020
		}
1021
1022
		return(LOG_UNLOCK_FLUSH_LOCK);
1023
	}
1024
1025
	return(0);
1026
}
1027
1028
/**********************************************************
1029
Completes an i/o to a log file. */
1030
1031
void
1032
log_io_complete(
1033
/*============*/
1034
	log_group_t*	group)	/* in: log group or a dummy pointer */
1035
{
1036
	ulint	unlock;
1037
1038
#ifdef UNIV_LOG_ARCHIVE
1039
	if ((byte*)group == &log_archive_io) {
1040
		/* It was an archive write */
1041
1042
		log_io_complete_archive();
1043
1044
		return;
1045
	}
1046
#endif /* UNIV_LOG_ARCHIVE */
1047
1048
	if ((ulint)group & 0x1UL) {
1049
		/* It was a checkpoint write */
1050
		group = (log_group_t*)((ulint)group - 1);
1051
1052
		if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
1053
		    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
1054
1055
			fil_flush(group->space_id);
1056
		}
1057
1058
#ifdef UNIV_DEBUG
1059
		if (log_debug_writes) {
1060
			fprintf(stderr,
1061
				"Checkpoint info written to group %lu\n",
1062
				group->id);
1063
		}
1064
#endif /* UNIV_DEBUG */
1065
		log_io_complete_checkpoint();
1066
1067
		return;
1068
	}
1069
1070
	ut_error;	/* We currently use synchronous writing of the
1071
			logs and cannot end up here! */
1072
1073
	if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
1074
	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
1075
	    && srv_flush_log_at_trx_commit != 2) {
1076
1077
		fil_flush(group->space_id);
1078
	}
1079
1080
	mutex_enter(&(log_sys->mutex));
1081
1082
	ut_a(group->n_pending_writes > 0);
1083
	ut_a(log_sys->n_pending_writes > 0);
1084
1085
	group->n_pending_writes--;
1086
	log_sys->n_pending_writes--;
1087
1088
	unlock = log_group_check_flush_completion(group);
1089
	unlock = unlock | log_sys_check_flush_completion();
1090
1091
	log_flush_do_unlocks(unlock);
1092
1093
	mutex_exit(&(log_sys->mutex));
1094
}
1095
1096
/**********************************************************
1097
Writes a log file header to a log file space. */
1098
static
1099
void
1100
log_group_file_header_flush(
1101
/*========================*/
1102
	log_group_t*	group,		/* in: log group */
1103
	ulint		nth_file,	/* in: header to the nth file in the
1104
					log file space */
1105
	dulint		start_lsn)	/* in: log file data starts at this
1106
					lsn */
1107
{
1108
	byte*	buf;
1109
	ulint	dest_offset;
1110
1111
	ut_ad(mutex_own(&(log_sys->mutex)));
1112
	ut_a(nth_file < group->n_files);
1113
1114
	buf = *(group->file_header_bufs + nth_file);
1115
1116
	mach_write_to_4(buf + LOG_GROUP_ID, group->id);
1117
	mach_write_to_8(buf + LOG_FILE_START_LSN, start_lsn);
1118
1119
	/* Wipe over possible label of ibbackup --restore */
1120
	memcpy(buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, "    ", 4);
1121
1122
	dest_offset = nth_file * group->file_size;
1123
1124
#ifdef UNIV_DEBUG
1125
	if (log_debug_writes) {
1126
		fprintf(stderr,
1127
			"Writing log file header to group %lu file %lu\n",
1128
			(ulong) group->id, (ulong) nth_file);
1129
	}
1130
#endif /* UNIV_DEBUG */
1131
	if (log_do_write) {
1132
		log_sys->n_log_ios++;
1133
1134
		srv_os_log_pending_writes++;
1135
1136
		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id,
1137
		       dest_offset / UNIV_PAGE_SIZE,
1138
		       dest_offset % UNIV_PAGE_SIZE,
1139
		       OS_FILE_LOG_BLOCK_SIZE,
1140
		       buf, group);
1141
1142
		srv_os_log_pending_writes--;
1143
	}
1144
}
1145
1146
/**********************************************************
1147
Stores a 4-byte checksum to the trailer checksum field of a log block
1148
before writing it to a log file. This checksum is used in recovery to
1149
check the consistency of a log block. */
1150
static
1151
void
1152
log_block_store_checksum(
1153
/*=====================*/
1154
	byte*	block)	/* in/out: pointer to a log block */
1155
{
1156
	log_block_set_checksum(block, log_block_calc_checksum(block));
1157
}
1158
1159
/**********************************************************
1160
Writes a buffer to a log file group. */
1161
1162
void
1163
log_group_write_buf(
1164
/*================*/
1165
	log_group_t*	group,		/* in: log group */
1166
	byte*		buf,		/* in: buffer */
1167
	ulint		len,		/* in: buffer len; must be divisible
1168
					by OS_FILE_LOG_BLOCK_SIZE */
1169
	dulint		start_lsn,	/* in: start lsn of the buffer; must
1170
					be divisible by
1171
					OS_FILE_LOG_BLOCK_SIZE */
1172
	ulint		new_data_offset)/* in: start offset of new data in
1173
					buf: this parameter is used to decide
1174
					if we have to write a new log file
1175
					header */
1176
{
1177
	ulint	write_len;
1178
	ibool	write_header;
1179
	ulint	next_offset;
1180
	ulint	i;
1181
1182
	ut_ad(mutex_own(&(log_sys->mutex)));
1183
	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
1184
	ut_a(ut_dulint_get_low(start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0);
1185
1186
	if (new_data_offset == 0) {
1187
		write_header = TRUE;
1188
	} else {
1189
		write_header = FALSE;
1190
	}
1191
loop:
1192
	if (len == 0) {
1193
1194
		return;
1195
	}
1196
1197
	next_offset = log_group_calc_lsn_offset(start_lsn, group);
1198
1199
	if ((next_offset % group->file_size == LOG_FILE_HDR_SIZE)
1200
	    && write_header) {
1201
		/* We start to write a new log file instance in the group */
1202
1203
		log_group_file_header_flush(group,
1204
					    next_offset / group->file_size,
1205
					    start_lsn);
1206
		srv_os_log_written+= OS_FILE_LOG_BLOCK_SIZE;
1207
		srv_log_writes++;
1208
	}
1209
1210
	if ((next_offset % group->file_size) + len > group->file_size) {
1211
1212
		write_len = group->file_size
1213
			- (next_offset % group->file_size);
1214
	} else {
1215
		write_len = len;
1216
	}
1217
1218
#ifdef UNIV_DEBUG
1219
	if (log_debug_writes) {
1220
1221
		fprintf(stderr,
1222
			"Writing log file segment to group %lu"
1223
			" offset %lu len %lu\n"
1224
			"start lsn %lu %lu\n"
1225
			"First block n:o %lu last block n:o %lu\n",
1226
			(ulong) group->id, (ulong) next_offset,
1227
			(ulong) write_len,
1228
			(ulong) ut_dulint_get_high(start_lsn),
1229
			(ulong) ut_dulint_get_low(start_lsn),
1230
			(ulong) log_block_get_hdr_no(buf),
1231
			(ulong) log_block_get_hdr_no(
1232
				buf + write_len - OS_FILE_LOG_BLOCK_SIZE));
1233
		ut_a(log_block_get_hdr_no(buf)
1234
		     == log_block_convert_lsn_to_no(start_lsn));
1235
1236
		for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
1237
1238
			ut_a(log_block_get_hdr_no(buf) + i
1239
			     == log_block_get_hdr_no(
1240
				     buf + i * OS_FILE_LOG_BLOCK_SIZE));
1241
		}
1242
	}
1243
#endif /* UNIV_DEBUG */
1244
	/* Calculate the checksums for each log block and write them to
1245
	the trailer fields of the log blocks */
1246
1247
	for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
1248
		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
1249
	}
1250
1251
	if (log_do_write) {
1252
		log_sys->n_log_ios++;
1253
1254
		srv_os_log_pending_writes++;
1255
1256
		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id,
1257
		       next_offset / UNIV_PAGE_SIZE,
1258
		       next_offset % UNIV_PAGE_SIZE, write_len, buf, group);
1259
1260
		srv_os_log_pending_writes--;
1261
1262
		srv_os_log_written+= write_len;
1263
		srv_log_writes++;
1264
	}
1265
1266
	if (write_len < len) {
1267
		start_lsn = ut_dulint_add(start_lsn, write_len);
1268
		len -= write_len;
1269
		buf += write_len;
1270
1271
		write_header = TRUE;
1272
1273
		goto loop;
1274
	}
1275
}
1276
1277
/**********************************************************
1278
This function is called, e.g., when a transaction wants to commit. It checks
1279
that the log has been written to the log file up to the last log entry written
1280
by the transaction. If there is a flush running, it waits and checks if the
1281
flush flushed enough. If not, starts a new flush. */
1282
1283
void
1284
log_write_up_to(
1285
/*============*/
1286
	dulint	lsn,	/* in: log sequence number up to which the log should
1287
			be written, ut_dulint_max if not specified */
1288
	ulint	wait,	/* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
1289
			or LOG_WAIT_ALL_GROUPS */
1290
	ibool	flush_to_disk)
1291
			/* in: TRUE if we want the written log also to be
1292
			flushed to disk */
1293
{
1294
	log_group_t*	group;
1295
	ulint		start_offset;
1296
	ulint		end_offset;
1297
	ulint		area_start;
1298
	ulint		area_end;
1299
#ifdef UNIV_DEBUG
1300
	ulint		loop_count	= 0;
1301
#endif /* UNIV_DEBUG */
1302
	ulint		unlock;
1303
1304
	if (recv_no_ibuf_operations) {
1305
		/* Recovery is running and no operations on the log files are
1306
		allowed yet (the variable name .._no_ibuf_.. is misleading) */
1307
1308
		return;
1309
	}
1310
1311
loop:
1312
#ifdef UNIV_DEBUG
1313
	loop_count++;
1314
1315
	ut_ad(loop_count < 5);
1316
1317
# if 0
1318
	if (loop_count > 2) {
1319
		fprintf(stderr, "Log loop count %lu\n", loop_count);
1320
	}
1321
# endif
1322
#endif
1323
1324
	mutex_enter(&(log_sys->mutex));
1325
1326
	if (flush_to_disk
1327
	    && ut_dulint_cmp(log_sys->flushed_to_disk_lsn, lsn) >= 0) {
1328
1329
		mutex_exit(&(log_sys->mutex));
1330
1331
		return;
1332
	}
1333
1334
	if (!flush_to_disk
1335
	    && (ut_dulint_cmp(log_sys->written_to_all_lsn, lsn) >= 0
1336
		|| (ut_dulint_cmp(log_sys->written_to_some_lsn, lsn)
1337
		    >= 0
1338
		    && wait != LOG_WAIT_ALL_GROUPS))) {
1339
1340
		mutex_exit(&(log_sys->mutex));
1341
1342
		return;
1343
	}
1344
1345
	if (log_sys->n_pending_writes > 0) {
1346
		/* A write (+ possibly flush to disk) is running */
1347
1348
		if (flush_to_disk
1349
		    && ut_dulint_cmp(log_sys->current_flush_lsn, lsn)
1350
		    >= 0) {
1351
			/* The write + flush will write enough: wait for it to
1352
			complete  */
1353
1354
			goto do_waits;
1355
		}
1356
1357
		if (!flush_to_disk
1358
		    && ut_dulint_cmp(log_sys->write_lsn, lsn) >= 0) {
1359
			/* The write will write enough: wait for it to
1360
			complete  */
1361
1362
			goto do_waits;
1363
		}
1364
1365
		mutex_exit(&(log_sys->mutex));
1366
1367
		/* Wait for the write to complete and try to start a new
1368
		write */
1369
1370
		os_event_wait(log_sys->no_flush_event);
1371
1372
		goto loop;
1373
	}
1374
1375
	if (!flush_to_disk
1376
	    && log_sys->buf_free == log_sys->buf_next_to_write) {
1377
		/* Nothing to write and no flush to disk requested */
1378
1379
		mutex_exit(&(log_sys->mutex));
1380
1381
		return;
1382
	}
1383
1384
#ifdef UNIV_DEBUG
1385
	if (log_debug_writes) {
1386
		fprintf(stderr,
1387
			"Writing log from %lu %lu up to lsn %lu %lu\n",
1388
			(ulong) ut_dulint_get_high(
1389
				log_sys->written_to_all_lsn),
1390
			(ulong) ut_dulint_get_low(
1391
				log_sys->written_to_all_lsn),
1392
			(ulong) ut_dulint_get_high(log_sys->lsn),
1393
			(ulong)	ut_dulint_get_low(log_sys->lsn));
1394
	}
1395
#endif /* UNIV_DEBUG */
1396
	log_sys->n_pending_writes++;
1397
1398
	group = UT_LIST_GET_FIRST(log_sys->log_groups);
1399
	group->n_pending_writes++;	/* We assume here that we have only
1400
					one log group! */
1401
1402
	os_event_reset(log_sys->no_flush_event);
1403
	os_event_reset(log_sys->one_flushed_event);
1404
1405
	start_offset = log_sys->buf_next_to_write;
1406
	end_offset = log_sys->buf_free;
1407
1408
	area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE);
1409
	area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE);
1410
1411
	ut_ad(area_end - area_start > 0);
1412
1413
	log_sys->write_lsn = log_sys->lsn;
1414
1415
	if (flush_to_disk) {
1416
		log_sys->current_flush_lsn = log_sys->lsn;
1417
	}
1418
1419
	log_sys->one_flushed = FALSE;
1420
1421
	log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
1422
	log_block_set_checkpoint_no(
1423
		log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
1424
		log_sys->next_checkpoint_no);
1425
1426
	/* Copy the last, incompletely written, log block a log block length
1427
	up, so that when the flush operation writes from the log buffer, the
1428
	segment to write will not be changed by writers to the log */
1429
1430
	ut_memcpy(log_sys->buf + area_end,
1431
		  log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
1432
		  OS_FILE_LOG_BLOCK_SIZE);
1433
1434
	log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE;
1435
	log_sys->write_end_offset = log_sys->buf_free;
1436
1437
	group = UT_LIST_GET_FIRST(log_sys->log_groups);
1438
1439
	/* Do the write to the log files */
1440
1441
	while (group) {
1442
		log_group_write_buf(
1443
			group, log_sys->buf + area_start,
1444
			area_end - area_start,
1445
			ut_dulint_align_down(log_sys->written_to_all_lsn,
1446
					     OS_FILE_LOG_BLOCK_SIZE),
1447
			start_offset - area_start);
1448
1449
		log_group_set_fields(group, log_sys->write_lsn);
1450
1451
		group = UT_LIST_GET_NEXT(log_groups, group);
1452
	}
1453
1454
	mutex_exit(&(log_sys->mutex));
1455
1456
	if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1457
		/* O_DSYNC means the OS did not buffer the log file at all:
1458
		so we have also flushed to disk what we have written */
1459
1460
		log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
1461
1462
	} else if (flush_to_disk) {
1463
1464
		group = UT_LIST_GET_FIRST(log_sys->log_groups);
1465
1466
		fil_flush(group->space_id);
1467
		log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
1468
	}
1469
1470
	mutex_enter(&(log_sys->mutex));
1471
1472
	group = UT_LIST_GET_FIRST(log_sys->log_groups);
1473
1474
	ut_a(group->n_pending_writes == 1);
1475
	ut_a(log_sys->n_pending_writes == 1);
1476
1477
	group->n_pending_writes--;
1478
	log_sys->n_pending_writes--;
1479
1480
	unlock = log_group_check_flush_completion(group);
1481
	unlock = unlock | log_sys_check_flush_completion();
1482
1483
	log_flush_do_unlocks(unlock);
1484
1485
	mutex_exit(&(log_sys->mutex));
1486
1487
	return;
1488
1489
do_waits:
1490
	mutex_exit(&(log_sys->mutex));
1491
1492
	if (wait == LOG_WAIT_ONE_GROUP) {
1493
		os_event_wait(log_sys->one_flushed_event);
1494
	} else if (wait == LOG_WAIT_ALL_GROUPS) {
1495
		os_event_wait(log_sys->no_flush_event);
1496
	} else {
1497
		ut_ad(wait == LOG_NO_WAIT);
1498
	}
1499
}
1500
1501
/********************************************************************
1502
Does a syncronous flush of the log buffer to disk. */
1503
1504
void
1505
log_buffer_flush_to_disk(void)
1506
/*==========================*/
1507
{
1508
	dulint	lsn;
1509
1510
	mutex_enter(&(log_sys->mutex));
1511
1512
	lsn = log_sys->lsn;
1513
1514
	mutex_exit(&(log_sys->mutex));
1515
1516
	log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE);
1517
}
1518
1519
/********************************************************************
1520
Tries to establish a big enough margin of free space in the log buffer, such
1521
that a new log entry can be catenated without an immediate need for a flush. */
1522
static
1523
void
1524
log_flush_margin(void)
1525
/*==================*/
1526
{
1527
	ibool	do_flush	= FALSE;
1528
	log_t*	log		= log_sys;
1529
	dulint	lsn;
1530
1531
	mutex_enter(&(log->mutex));
1532
1533
	if (log->buf_free > log->max_buf_free) {
1534
1535
		if (log->n_pending_writes > 0) {
1536
			/* A flush is running: hope that it will provide enough
1537
			free space */
1538
		} else {
1539
			do_flush = TRUE;
1540
			lsn = log->lsn;
1541
		}
1542
	}
1543
1544
	mutex_exit(&(log->mutex));
1545
1546
	if (do_flush) {
1547
		log_write_up_to(lsn, LOG_NO_WAIT, FALSE);
1548
	}
1549
}
1550
1551
/********************************************************************
1552
Advances the smallest lsn for which there are unflushed dirty blocks in the
1553
buffer pool. NOTE: this function may only be called if the calling thread owns
1554
no synchronization objects! */
1555
1556
ibool
1557
log_preflush_pool_modified_pages(
1558
/*=============================*/
1559
				/* out: FALSE if there was a flush batch of
1560
				the same type running, which means that we
1561
				could not start this flush batch */
1562
	dulint	new_oldest,	/* in: try to advance oldest_modified_lsn
1563
				at least to this lsn */
1564
	ibool	sync)		/* in: TRUE if synchronous operation is
1565
				desired */
1566
{
1567
	ulint	n_pages;
1568
1569
	if (recv_recovery_on) {
1570
		/* If the recovery is running, we must first apply all
1571
		log records to their respective file pages to get the
1572
		right modify lsn values to these pages: otherwise, there
1573
		might be pages on disk which are not yet recovered to the
1574
		current lsn, and even after calling this function, we could
1575
		not know how up-to-date the disk version of the database is,
1576
		and we could not make a new checkpoint on the basis of the
1577
		info on the buffer pool only. */
1578
1579
		recv_apply_hashed_log_recs(TRUE);
1580
	}
1581
1582
	n_pages = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, new_oldest);
1583
1584
	if (sync) {
1585
		buf_flush_wait_batch_end(BUF_FLUSH_LIST);
1586
	}
1587
1588
	if (n_pages == ULINT_UNDEFINED) {
1589
1590
		return(FALSE);
1591
	}
1592
1593
	return(TRUE);
1594
}
1595
1596
/**********************************************************
1597
Completes a checkpoint. */
1598
static
1599
void
1600
log_complete_checkpoint(void)
1601
/*=========================*/
1602
{
1603
	ut_ad(mutex_own(&(log_sys->mutex)));
1604
	ut_ad(log_sys->n_pending_checkpoint_writes == 0);
1605
1606
	log_sys->next_checkpoint_no
1607
		= ut_dulint_add(log_sys->next_checkpoint_no, 1);
1608
1609
	log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
1610
1611
	rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT);
1612
}
1613
1614
/**********************************************************
1615
Completes an asynchronous checkpoint info write i/o to a log file. */
1616
static
1617
void
1618
log_io_complete_checkpoint(void)
1619
/*============================*/
1620
{
1621
	mutex_enter(&(log_sys->mutex));
1622
1623
	ut_ad(log_sys->n_pending_checkpoint_writes > 0);
1624
1625
	log_sys->n_pending_checkpoint_writes--;
1626
1627
	if (log_sys->n_pending_checkpoint_writes == 0) {
1628
		log_complete_checkpoint();
1629
	}
1630
1631
	mutex_exit(&(log_sys->mutex));
1632
}
1633
1634
/***********************************************************************
1635
Writes info to a checkpoint about a log group. */
1636
static
1637
void
1638
log_checkpoint_set_nth_group_info(
1639
/*==============================*/
1640
	byte*	buf,	/* in: buffer for checkpoint info */
1641
	ulint	n,	/* in: nth slot */
1642
	ulint	file_no,/* in: archived file number */
1643
	ulint	offset)	/* in: archived file offset */
1644
{
1645
	ut_ad(n < LOG_MAX_N_GROUPS);
1646
1647
	mach_write_to_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
1648
			+ 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO, file_no);
1649
	mach_write_to_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
1650
			+ 8 * n + LOG_CHECKPOINT_ARCHIVED_OFFSET, offset);
1651
}
1652
1653
/***********************************************************************
1654
Gets info from a checkpoint about a log group. */
1655
1656
void
1657
log_checkpoint_get_nth_group_info(
1658
/*==============================*/
1659
	byte*	buf,	/* in: buffer containing checkpoint info */
1660
	ulint	n,	/* in: nth slot */
1661
	ulint*	file_no,/* out: archived file number */
1662
	ulint*	offset)	/* out: archived file offset */
1663
{
1664
	ut_ad(n < LOG_MAX_N_GROUPS);
1665
1666
	*file_no = mach_read_from_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
1667
				    + 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO);
1668
	*offset = mach_read_from_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
1669
				   + 8 * n + LOG_CHECKPOINT_ARCHIVED_OFFSET);
1670
}
1671
1672
/**********************************************************
1673
Writes the checkpoint info to a log group header. */
1674
static
1675
void
1676
log_group_checkpoint(
1677
/*=================*/
1678
	log_group_t*	group)	/* in: log group */
1679
{
1680
	log_group_t*	group2;
1681
#ifdef UNIV_LOG_ARCHIVE
1682
	dulint	archived_lsn;
1683
	dulint	next_archived_lsn;
1684
#endif /* UNIV_LOG_ARCHIVE */
1685
	ulint	write_offset;
1686
	ulint	fold;
1687
	byte*	buf;
1688
	ulint	i;
1689
1690
	ut_ad(mutex_own(&(log_sys->mutex)));
1691
#if LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE
1692
# error "LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE"
1693
#endif
1694
1695
	buf = group->checkpoint_buf;
1696
1697
	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
1698
	mach_write_to_8(buf + LOG_CHECKPOINT_LSN,
1699
			log_sys->next_checkpoint_lsn);
1700
1701
	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
1702
			log_group_calc_lsn_offset(
1703
				log_sys->next_checkpoint_lsn, group));
1704
1705
	mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
1706
1707
#ifdef UNIV_LOG_ARCHIVE
1708
	if (log_sys->archiving_state == LOG_ARCH_OFF) {
1709
		archived_lsn = ut_dulint_max;
1710
	} else {
1711
		archived_lsn = log_sys->archived_lsn;
1712
1713
		if (0 != ut_dulint_cmp(archived_lsn,
1714
				       log_sys->next_archived_lsn)) {
1715
			next_archived_lsn = log_sys->next_archived_lsn;
1716
			/* For debugging only */
1717
		}
1718
	}
1719
1720
	mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, archived_lsn);
1721
#else /* UNIV_LOG_ARCHIVE */
1722
	mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, ut_dulint_max);
1723
#endif /* UNIV_LOG_ARCHIVE */
1724
1725
	for (i = 0; i < LOG_MAX_N_GROUPS; i++) {
1726
		log_checkpoint_set_nth_group_info(buf, i, 0, 0);
1727
	}
1728
1729
	group2 = UT_LIST_GET_FIRST(log_sys->log_groups);
1730
1731
	while (group2) {
1732
		log_checkpoint_set_nth_group_info(buf, group2->id,
1733
#ifdef UNIV_LOG_ARCHIVE
1734
						  group2->archived_file_no,
1735
						  group2->archived_offset
1736
#else /* UNIV_LOG_ARCHIVE */
1737
						  0, 0
1738
#endif /* UNIV_LOG_ARCHIVE */
1739
						  );
1740
1741
		group2 = UT_LIST_GET_NEXT(log_groups, group2);
1742
	}
1743
1744
	fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
1745
	mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold);
1746
1747
	fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
1748
			      LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
1749
	mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold);
1750
1751
	/* Starting from InnoDB-3.23.50, we also write info on allocated
1752
	size in the tablespace */
1753
1754
	mach_write_to_4(buf + LOG_CHECKPOINT_FSP_FREE_LIMIT,
1755
			log_fsp_current_free_limit);
1756
1757
	mach_write_to_4(buf + LOG_CHECKPOINT_FSP_MAGIC_N,
1758
			LOG_CHECKPOINT_FSP_MAGIC_N_VAL);
1759
1760
	/* We alternate the physical place of the checkpoint info in the first
1761
	log file */
1762
1763
	if (ut_dulint_get_low(log_sys->next_checkpoint_no) % 2 == 0) {
1764
		write_offset = LOG_CHECKPOINT_1;
1765
	} else {
1766
		write_offset = LOG_CHECKPOINT_2;
1767
	}
1768
1769
	if (log_do_write) {
1770
		if (log_sys->n_pending_checkpoint_writes == 0) {
1771
1772
			rw_lock_x_lock_gen(&(log_sys->checkpoint_lock),
1773
					   LOG_CHECKPOINT);
1774
		}
1775
1776
		log_sys->n_pending_checkpoint_writes++;
1777
1778
		log_sys->n_log_ios++;
1779
1780
		/* We send as the last parameter the group machine address
1781
		added with 1, as we want to distinguish between a normal log
1782
		file write and a checkpoint field write */
1783
1784
		fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->space_id,
1785
		       write_offset / UNIV_PAGE_SIZE,
1786
		       write_offset % UNIV_PAGE_SIZE,
1787
		       OS_FILE_LOG_BLOCK_SIZE,
1788
		       buf, ((byte*)group + 1));
1789
1790
		ut_ad(((ulint)group & 0x1UL) == 0);
1791
	}
1792
}
1793
1794
/**********************************************************
1795
Writes info to a buffer of a log group when log files are created in
1796
backup restoration. */
1797
1798
void
1799
log_reset_first_header_and_checkpoint(
1800
/*==================================*/
1801
	byte*	hdr_buf,/* in: buffer which will be written to the start
1802
			of the first log file */
1803
	dulint	start)	/* in: lsn of the start of the first log file;
1804
			we pretend that there is a checkpoint at
1805
			start + LOG_BLOCK_HDR_SIZE */
1806
{
1807
	ulint	fold;
1808
	byte*	buf;
1809
	dulint	lsn;
1810
1811
	mach_write_to_4(hdr_buf + LOG_GROUP_ID, 0);
1812
	mach_write_to_8(hdr_buf + LOG_FILE_START_LSN, start);
1813
1814
	lsn = ut_dulint_add(start, LOG_BLOCK_HDR_SIZE);
1815
1816
	/* Write the label of ibbackup --restore */
1817
	strcpy((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
1818
	       "ibbackup ");
1819
	ut_sprintf_timestamp((char*) hdr_buf
1820
			     + (LOG_FILE_WAS_CREATED_BY_HOT_BACKUP
1821
				+ (sizeof "ibbackup ") - 1));
1822
	buf = hdr_buf + LOG_CHECKPOINT_1;
1823
1824
	mach_write_to_8(buf + LOG_CHECKPOINT_NO, ut_dulint_zero);
1825
	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn);
1826
1827
	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
1828
			LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
1829
1830
	mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024);
1831
1832
	mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, ut_dulint_max);
1833
1834
	fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
1835
	mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold);
1836
1837
	fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
1838
			      LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
1839
	mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold);
1840
1841
	/* Starting from InnoDB-3.23.50, we should also write info on
1842
	allocated size in the tablespace, but unfortunately we do not
1843
	know it here */
1844
}
1845
1846
/**********************************************************
1847
Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. */
1848
1849
void
1850
log_group_read_checkpoint_info(
1851
/*===========================*/
1852
	log_group_t*	group,	/* in: log group */
1853
	ulint		field)	/* in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
1854
{
1855
	ut_ad(mutex_own(&(log_sys->mutex)));
1856
1857
	log_sys->n_log_ios++;
1858
1859
	fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->space_id,
1860
	       field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE,
1861
	       OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
1862
}
1863
1864
/**********************************************************
1865
Writes checkpoint info to groups. */
1866
1867
void
1868
log_groups_write_checkpoint_info(void)
1869
/*==================================*/
1870
{
1871
	log_group_t*	group;
1872
1873
	ut_ad(mutex_own(&(log_sys->mutex)));
1874
1875
	group = UT_LIST_GET_FIRST(log_sys->log_groups);
1876
1877
	while (group) {
1878
		log_group_checkpoint(group);
1879
1880
		group = UT_LIST_GET_NEXT(log_groups, group);
1881
	}
1882
}
1883
1884
/**********************************************************
1885
Makes a checkpoint. Note that this function does not flush dirty
1886
blocks from the buffer pool: it only checks what is lsn of the oldest
1887
modification in the pool, and writes information about the lsn in
1888
log files. Use log_make_checkpoint_at to flush also the pool. */
1889
1890
ibool
1891
log_checkpoint(
1892
/*===========*/
1893
				/* out: TRUE if success, FALSE if a checkpoint
1894
				write was already running */
1895
	ibool	sync,		/* in: TRUE if synchronous operation is
1896
				desired */
1897
	ibool	write_always)	/* in: the function normally checks if the
1898
				the new checkpoint would have a greater
1899
				lsn than the previous one: if not, then no
1900
				physical write is done; by setting this
1901
				parameter TRUE, a physical write will always be
1902
				made to log files */
1903
{
1904
	dulint	oldest_lsn;
1905
1906
	if (recv_recovery_is_on()) {
1907
		recv_apply_hashed_log_recs(TRUE);
1908
	}
1909
1910
	if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
1911
		fil_flush_file_spaces(FIL_TABLESPACE);
1912
	}
1913
1914
	mutex_enter(&(log_sys->mutex));
1915
1916
	oldest_lsn = log_buf_pool_get_oldest_modification();
1917
1918
	mutex_exit(&(log_sys->mutex));
1919
1920
	/* Because log also contains headers and dummy log records,
1921
	if the buffer pool contains no dirty buffers, oldest_lsn
1922
	gets the value log_sys->lsn from the previous function,
1923
	and we must make sure that the log is flushed up to that
1924
	lsn. If there are dirty buffers in the buffer pool, then our
1925
	write-ahead-logging algorithm ensures that the log has been flushed
1926
	up to oldest_lsn. */
1927
1928
	log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
1929
1930
	mutex_enter(&(log_sys->mutex));
1931
1932
	if (!write_always
1933
	    && ut_dulint_cmp(log_sys->last_checkpoint_lsn, oldest_lsn) >= 0) {
1934
1935
		mutex_exit(&(log_sys->mutex));
1936
1937
		return(TRUE);
1938
	}
1939
1940
	ut_ad(ut_dulint_cmp(log_sys->written_to_all_lsn, oldest_lsn) >= 0);
1941
1942
	if (log_sys->n_pending_checkpoint_writes > 0) {
1943
		/* A checkpoint write is running */
1944
1945
		mutex_exit(&(log_sys->mutex));
1946
1947
		if (sync) {
1948
			/* Wait for the checkpoint write to complete */
1949
			rw_lock_s_lock(&(log_sys->checkpoint_lock));
1950
			rw_lock_s_unlock(&(log_sys->checkpoint_lock));
1951
		}
1952
1953
		return(FALSE);
1954
	}
1955
1956
	log_sys->next_checkpoint_lsn = oldest_lsn;
1957
1958
#ifdef UNIV_DEBUG
1959
	if (log_debug_writes) {
1960
		fprintf(stderr, "Making checkpoint no %lu at lsn %lu %lu\n",
1961
			(ulong) ut_dulint_get_low(log_sys->next_checkpoint_no),
1962
			(ulong) ut_dulint_get_high(oldest_lsn),
1963
			(ulong) ut_dulint_get_low(oldest_lsn));
1964
	}
1965
#endif /* UNIV_DEBUG */
1966
1967
	log_groups_write_checkpoint_info();
1968
1969
	mutex_exit(&(log_sys->mutex));
1970
1971
	if (sync) {
1972
		/* Wait for the checkpoint write to complete */
1973
		rw_lock_s_lock(&(log_sys->checkpoint_lock));
1974
		rw_lock_s_unlock(&(log_sys->checkpoint_lock));
1975
	}
1976
1977
	return(TRUE);
1978
}
1979
1980
/********************************************************************
1981
Makes a checkpoint at a given lsn or later. */
1982
1983
void
1984
log_make_checkpoint_at(
1985
/*===================*/
1986
	dulint	lsn,		/* in: make a checkpoint at this or a later
1987
				lsn, if ut_dulint_max, makes a checkpoint at
1988
				the latest lsn */
1989
	ibool	write_always)	/* in: the function normally checks if the
1990
				the new checkpoint would have a greater
1991
				lsn than the previous one: if not, then no
1992
				physical write is done; by setting this
1993
				parameter TRUE, a physical write will always be
1994
				made to log files */
1995
{
1996
	ibool	success;
1997
1998
	/* Preflush pages synchronously */
1999
2000
	success = FALSE;
2001
2002
	while (!success) {
2003
		success = log_preflush_pool_modified_pages(lsn, TRUE);
2004
	}
2005
2006
	success = FALSE;
2007
2008
	while (!success) {
2009
		success = log_checkpoint(TRUE, write_always);
2010
	}
2011
}
2012
2013
/********************************************************************
2014
Tries to establish a big enough margin of free space in the log groups, such
2015
that a new log entry can be catenated without an immediate need for a
2016
checkpoint. NOTE: this function may only be called if the calling thread
2017
owns no synchronization objects! */
2018
static
2019
void
2020
log_checkpoint_margin(void)
2021
/*=======================*/
2022
{
2023
	log_t*	log		= log_sys;
2024
	ulint	age;
2025
	ulint	checkpoint_age;
2026
	ulint	advance;
2027
	dulint	oldest_lsn;
2028
	ibool	sync;
2029
	ibool	checkpoint_sync;
2030
	ibool	do_checkpoint;
2031
	ibool	success;
2032
loop:
2033
	sync = FALSE;
2034
	checkpoint_sync = FALSE;
2035
	do_checkpoint = FALSE;
2036
2037
	mutex_enter(&(log->mutex));
2038
2039
	if (log->check_flush_or_checkpoint == FALSE) {
2040
		mutex_exit(&(log->mutex));
2041
2042
		return;
2043
	}
2044
2045
	oldest_lsn = log_buf_pool_get_oldest_modification();
2046
2047
	age = ut_dulint_minus(log->lsn, oldest_lsn);
2048
2049
	if (age > log->max_modified_age_sync) {
2050
2051
		/* A flush is urgent: we have to do a synchronous preflush */
2052
2053
		sync = TRUE;
2054
		advance = 2 * (age - log->max_modified_age_sync);
2055
	} else if (age > log->max_modified_age_async) {
2056
2057
		/* A flush is not urgent: we do an asynchronous preflush */
2058
		advance = age - log->max_modified_age_async;
2059
	} else {
2060
		advance = 0;
2061
	}
2062
2063
	checkpoint_age = ut_dulint_minus(log->lsn, log->last_checkpoint_lsn);
2064
2065
	if (checkpoint_age > log->max_checkpoint_age) {
2066
		/* A checkpoint is urgent: we do it synchronously */
2067
2068
		checkpoint_sync = TRUE;
2069
2070
		do_checkpoint = TRUE;
2071
2072
	} else if (checkpoint_age > log->max_checkpoint_age_async) {
2073
		/* A checkpoint is not urgent: do it asynchronously */
2074
2075
		do_checkpoint = TRUE;
2076
2077
		log->check_flush_or_checkpoint = FALSE;
2078
	} else {
2079
		log->check_flush_or_checkpoint = FALSE;
2080
	}
2081
2082
	mutex_exit(&(log->mutex));
2083
2084
	if (advance) {
2085
		dulint	new_oldest = ut_dulint_add(oldest_lsn, advance);
2086
2087
		success = log_preflush_pool_modified_pages(new_oldest, sync);
2088
2089
		/* If the flush succeeded, this thread has done its part
2090
		and can proceed. If it did not succeed, there was another
2091
		thread doing a flush at the same time. If sync was FALSE,
2092
		the flush was not urgent, and we let this thread proceed.
2093
		Otherwise, we let it start from the beginning again. */
2094
2095
		if (sync && !success) {
2096
			mutex_enter(&(log->mutex));
2097
2098
			log->check_flush_or_checkpoint = TRUE;
2099
2100
			mutex_exit(&(log->mutex));
2101
			goto loop;
2102
		}
2103
	}
2104
2105
	if (do_checkpoint) {
2106
		log_checkpoint(checkpoint_sync, FALSE);
2107
2108
		if (checkpoint_sync) {
2109
2110
			goto loop;
2111
		}
2112
	}
2113
}
2114
2115
/**********************************************************
2116
Reads a specified log segment to a buffer. */
2117
2118
void
2119
log_group_read_log_seg(
2120
/*===================*/
2121
	ulint		type,		/* in: LOG_ARCHIVE or LOG_RECOVER */
2122
	byte*		buf,		/* in: buffer where to read */
2123
	log_group_t*	group,		/* in: log group */
2124
	dulint		start_lsn,	/* in: read area start */
2125
	dulint		end_lsn)	/* in: read area end */
2126
{
2127
	ulint	len;
2128
	ulint	source_offset;
2129
	ibool	sync;
2130
2131
	ut_ad(mutex_own(&(log_sys->mutex)));
2132
2133
	sync = FALSE;
2134
2135
	if (type == LOG_RECOVER) {
2136
		sync = TRUE;
2137
	}
2138
loop:
2139
	source_offset = log_group_calc_lsn_offset(start_lsn, group);
2140
2141
	len = ut_dulint_minus(end_lsn, start_lsn);
2142
2143
	ut_ad(len != 0);
2144
2145
	if ((source_offset % group->file_size) + len > group->file_size) {
2146
2147
		len = group->file_size - (source_offset % group->file_size);
2148
	}
2149
2150
#ifdef UNIV_LOG_ARCHIVE
2151
	if (type == LOG_ARCHIVE) {
2152
2153
		log_sys->n_pending_archive_ios++;
2154
	}
2155
#endif /* UNIV_LOG_ARCHIVE */
2156
2157
	log_sys->n_log_ios++;
2158
2159
	fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id,
2160
	       source_offset / UNIV_PAGE_SIZE, source_offset % UNIV_PAGE_SIZE,
2161
	       len, buf, NULL);
2162
2163
	start_lsn = ut_dulint_add(start_lsn, len);
2164
	buf += len;
2165
2166
	if (ut_dulint_cmp(start_lsn, end_lsn) != 0) {
2167
2168
		goto loop;
2169
	}
2170
}
2171
2172
#ifdef UNIV_LOG_ARCHIVE
2173
/**********************************************************
2174
Generates an archived log file name. */
2175
2176
void
2177
log_archived_file_name_gen(
2178
/*=======================*/
2179
	char*	buf,	/* in: buffer where to write */
2180
	ulint	id __attribute__((unused)),
2181
			/* in: group id;
2182
			currently we only archive the first group */
2183
	ulint	file_no)/* in: file number */
2184
{
2185
	sprintf(buf, "%sib_arch_log_%010lu", srv_arch_dir, (ulong) file_no);
2186
}
2187
2188
/**********************************************************
2189
Writes a log file header to a log file space. */
2190
static
2191
void
2192
log_group_archive_file_header_write(
2193
/*================================*/
2194
	log_group_t*	group,		/* in: log group */
2195
	ulint		nth_file,	/* in: header to the nth file in the
2196
					archive log file space */
2197
	ulint		file_no,	/* in: archived file number */
2198
	dulint		start_lsn)	/* in: log file data starts at this
2199
					lsn */
2200
{
2201
	byte*	buf;
2202
	ulint	dest_offset;
2203
2204
	ut_ad(mutex_own(&(log_sys->mutex)));
2205
2206
	ut_a(nth_file < group->n_files);
2207
2208
	buf = *(group->archive_file_header_bufs + nth_file);
2209
2210
	mach_write_to_4(buf + LOG_GROUP_ID, group->id);
2211
	mach_write_to_8(buf + LOG_FILE_START_LSN, start_lsn);
2212
	mach_write_to_4(buf + LOG_FILE_NO, file_no);
2213
2214
	mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, FALSE);
2215
2216
	dest_offset = nth_file * group->file_size;
2217
2218
	log_sys->n_log_ios++;
2219
2220
	fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id,
2221
	       dest_offset / UNIV_PAGE_SIZE,
2222
	       dest_offset % UNIV_PAGE_SIZE,
2223
	       2 * OS_FILE_LOG_BLOCK_SIZE,
2224
	       buf, &log_archive_io);
2225
}
2226
2227
/**********************************************************
2228
Writes a log file header to a completed archived log file. */
2229
static
2230
void
2231
log_group_archive_completed_header_write(
2232
/*=====================================*/
2233
	log_group_t*	group,		/* in: log group */
2234
	ulint		nth_file,	/* in: header to the nth file in the
2235
					archive log file space */
2236
	dulint		end_lsn)	/* in: end lsn of the file */
2237
{
2238
	byte*	buf;
2239
	ulint	dest_offset;
2240
2241
	ut_ad(mutex_own(&(log_sys->mutex)));
2242
	ut_a(nth_file < group->n_files);
2243
2244
	buf = *(group->archive_file_header_bufs + nth_file);
2245
2246
	mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, TRUE);
2247
	mach_write_to_8(buf + LOG_FILE_END_LSN, end_lsn);
2248
2249
	dest_offset = nth_file * group->file_size + LOG_FILE_ARCH_COMPLETED;
2250
2251
	log_sys->n_log_ios++;
2252
2253
	fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id,
2254
	       dest_offset / UNIV_PAGE_SIZE,
2255
	       dest_offset % UNIV_PAGE_SIZE,
2256
	       OS_FILE_LOG_BLOCK_SIZE,
2257
	       buf + LOG_FILE_ARCH_COMPLETED,
2258
	       &log_archive_io);
2259
}
2260
2261
/**********************************************************
2262
Does the archive writes for a single log group. */
2263
static
2264
void
2265
log_group_archive(
2266
/*==============*/
2267
	log_group_t*	group)	/* in: log group */
2268
{
2269
	os_file_t file_handle;
2270
	dulint	start_lsn;
2271
	dulint	end_lsn;
2272
	char	name[1024];
2273
	byte*	buf;
2274
	ulint	len;
2275
	ibool	ret;
2276
	ulint	next_offset;
2277
	ulint	n_files;
2278
	ulint	open_mode;
2279
2280
	ut_ad(mutex_own(&(log_sys->mutex)));
2281
2282
	start_lsn = log_sys->archived_lsn;
2283
2284
	ut_a(ut_dulint_get_low(start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0);
2285
2286
	end_lsn = log_sys->next_archived_lsn;
2287
2288
	ut_a(ut_dulint_get_low(end_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0);
2289
2290
	buf = log_sys->archive_buf;
2291
2292
	n_files = 0;
2293
2294
	next_offset = group->archived_offset;
2295
loop:
2296
	if ((next_offset % group->file_size == 0)
2297
	    || (fil_space_get_size(group->archive_space_id) == 0)) {
2298
2299
		/* Add the file to the archive file space; create or open the
2300
		file */
2301
2302
		if (next_offset % group->file_size == 0) {
2303
			open_mode = OS_FILE_CREATE;
2304
		} else {
2305
			open_mode = OS_FILE_OPEN;
2306
		}
2307
2308
		log_archived_file_name_gen(name, group->id,
2309
					   group->archived_file_no + n_files);
2310
2311
		file_handle = os_file_create(name, open_mode, OS_FILE_AIO,
2312
					     OS_DATA_FILE, &ret);
2313
2314
		if (!ret && (open_mode == OS_FILE_CREATE)) {
2315
			file_handle = os_file_create(
2316
				name, OS_FILE_OPEN, OS_FILE_AIO,
2317
				OS_DATA_FILE, &ret);
2318
		}
2319
2320
		if (!ret) {
2321
			fprintf(stderr,
2322
				"InnoDB: Cannot create or open"
2323
				" archive log file %s.\n"
2324
				"InnoDB: Cannot continue operation.\n"
2325
				"InnoDB: Check that the log archive"
2326
				" directory exists,\n"
2327
				"InnoDB: you have access rights to it, and\n"
2328
				"InnoDB: there is space available.\n", name);
2329
			exit(1);
2330
		}
2331
2332
#ifdef UNIV_DEBUG
2333
		if (log_debug_writes) {
2334
			fprintf(stderr, "Created archive file %s\n", name);
2335
		}
2336
#endif /* UNIV_DEBUG */
2337
2338
		ret = os_file_close(file_handle);
2339
2340
		ut_a(ret);
2341
2342
		/* Add the archive file as a node to the space */
2343
2344
		fil_node_create(name, group->file_size / UNIV_PAGE_SIZE,
2345
				group->archive_space_id, FALSE);
2346
2347
		if (next_offset % group->file_size == 0) {
2348
			log_group_archive_file_header_write(
2349
				group, n_files,
2350
				group->archived_file_no + n_files,
2351
				start_lsn);
2352
2353
			next_offset += LOG_FILE_HDR_SIZE;
2354
		}
2355
	}
2356
2357
	len = ut_dulint_minus(end_lsn, start_lsn);
2358
2359
	if (group->file_size < (next_offset % group->file_size) + len) {
2360
2361
		len = group->file_size - (next_offset % group->file_size);
2362
	}
2363
2364
#ifdef UNIV_DEBUG
2365
	if (log_debug_writes) {
2366
		fprintf(stderr,
2367
			"Archiving starting at lsn %lu %lu, len %lu"
2368
			" to group %lu\n",
2369
			(ulong) ut_dulint_get_high(start_lsn),
2370
			(ulong) ut_dulint_get_low(start_lsn),
2371
			(ulong) len, (ulong) group->id);
2372
	}
2373
#endif /* UNIV_DEBUG */
2374
2375
	log_sys->n_pending_archive_ios++;
2376
2377
	log_sys->n_log_ios++;
2378
2379
	fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->archive_space_id,
2380
	       next_offset / UNIV_PAGE_SIZE, next_offset % UNIV_PAGE_SIZE,
2381
	       ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf,
2382
	       &log_archive_io);
2383
2384
	start_lsn = ut_dulint_add(start_lsn, len);
2385
	next_offset += len;
2386
	buf += len;
2387
2388
	if (next_offset % group->file_size == 0) {
2389
		n_files++;
2390
	}
2391
2392
	if (ut_dulint_cmp(end_lsn, start_lsn) != 0) {
2393
2394
		goto loop;
2395
	}
2396
2397
	group->next_archived_file_no = group->archived_file_no + n_files;
2398
	group->next_archived_offset = next_offset % group->file_size;
2399
2400
	ut_a(group->next_archived_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
2401
}
2402
2403
/*********************************************************
2404
(Writes to the archive of each log group.) Currently, only the first
2405
group is archived. */
2406
static
2407
void
2408
log_archive_groups(void)
2409
/*====================*/
2410
{
2411
	log_group_t*	group;
2412
2413
	ut_ad(mutex_own(&(log_sys->mutex)));
2414
2415
	group = UT_LIST_GET_FIRST(log_sys->log_groups);
2416
2417
	log_group_archive(group);
2418
}
2419
2420
/*********************************************************
2421
Completes the archiving write phase for (each log group), currently,
2422
the first log group. */
2423
static
2424
void
2425
log_archive_write_complete_groups(void)
2426
/*===================================*/
2427
{
2428
	log_group_t*	group;
2429
	ulint		end_offset;
2430
	ulint		trunc_files;
2431
	ulint		n_files;
2432
	dulint		start_lsn;
2433
	dulint		end_lsn;
2434
	ulint		i;
2435
2436
	ut_ad(mutex_own(&(log_sys->mutex)));
2437
2438
	group = UT_LIST_GET_FIRST(log_sys->log_groups);
2439
2440
	group->archived_file_no = group->next_archived_file_no;
2441
	group->archived_offset = group->next_archived_offset;
2442
2443
	/* Truncate from the archive file space all but the last
2444
	file, or if it has been written full, all files */
2445
2446
	n_files = (UNIV_PAGE_SIZE
2447
		   * fil_space_get_size(group->archive_space_id))
2448
		/ group->file_size;
2449
	ut_ad(n_files > 0);
2450
2451
	end_offset = group->archived_offset;
2452
2453
	if (end_offset % group->file_size == 0) {
2454
2455
		trunc_files = n_files;
2456
	} else {
2457
		trunc_files = n_files - 1;
2458
	}
2459
2460
#ifdef UNIV_DEBUG
2461
	if (log_debug_writes && trunc_files) {
2462
		fprintf(stderr,
2463
			"Complete file(s) archived to group %lu\n",
2464
			(ulong) group->id);
2465
	}
2466
#endif /* UNIV_DEBUG */
2467
2468
	/* Calculate the archive file space start lsn */
2469
	start_lsn = ut_dulint_subtract(
2470
		log_sys->next_archived_lsn,
2471
		end_offset - LOG_FILE_HDR_SIZE + trunc_files
2472
		* (group->file_size - LOG_FILE_HDR_SIZE));
2473
	end_lsn = start_lsn;
2474
2475
	for (i = 0; i < trunc_files; i++) {
2476
2477
		end_lsn = ut_dulint_add(end_lsn,
2478
					group->file_size - LOG_FILE_HDR_SIZE);
2479
2480
		/* Write a notice to the headers of archived log
2481
		files that the file write has been completed */
2482
2483
		log_group_archive_completed_header_write(group, i, end_lsn);
2484
	}
2485
2486
	fil_space_truncate_start(group->archive_space_id,
2487
				 trunc_files * group->file_size);
2488
2489
#ifdef UNIV_DEBUG
2490
	if (log_debug_writes) {
2491
		fputs("Archiving writes completed\n", stderr);
2492
	}
2493
#endif /* UNIV_DEBUG */
2494
}
2495
2496
/**********************************************************
2497
Completes an archiving i/o. */
2498
static
2499
void
2500
log_archive_check_completion_low(void)
2501
/*==================================*/
2502
{
2503
	ut_ad(mutex_own(&(log_sys->mutex)));
2504
2505
	if (log_sys->n_pending_archive_ios == 0
2506
	    && log_sys->archiving_phase == LOG_ARCHIVE_READ) {
2507
2508
#ifdef UNIV_DEBUG
2509
		if (log_debug_writes) {
2510
			fputs("Archiving read completed\n", stderr);
2511
		}
2512
#endif /* UNIV_DEBUG */
2513
2514
		/* Archive buffer has now been read in: start archive writes */
2515
2516
		log_sys->archiving_phase = LOG_ARCHIVE_WRITE;
2517
2518
		log_archive_groups();
2519
	}
2520
2521
	if (log_sys->n_pending_archive_ios == 0
2522
	    && log_sys->archiving_phase == LOG_ARCHIVE_WRITE) {
2523
2524
		log_archive_write_complete_groups();
2525
2526
		log_sys->archived_lsn = log_sys->next_archived_lsn;
2527
2528
		rw_lock_x_unlock_gen(&(log_sys->archive_lock), LOG_ARCHIVE);
2529
	}
2530
}
2531
2532
/**********************************************************
2533
Completes an archiving i/o. */
2534
static
2535
void
2536
log_io_complete_archive(void)
2537
/*=========================*/
2538
{
2539
	log_group_t*	group;
2540
2541
	mutex_enter(&(log_sys->mutex));
2542
2543
	group = UT_LIST_GET_FIRST(log_sys->log_groups);
2544
2545
	mutex_exit(&(log_sys->mutex));
2546
2547
	fil_flush(group->archive_space_id);
2548
2549
	mutex_enter(&(log_sys->mutex));
2550
2551
	ut_ad(log_sys->n_pending_archive_ios > 0);
2552
2553
	log_sys->n_pending_archive_ios--;
2554
2555
	log_archive_check_completion_low();
2556
2557
	mutex_exit(&(log_sys->mutex));
2558
}
2559
2560
/************************************************************************
2561
Starts an archiving operation. */
2562
2563
ibool
2564
log_archive_do(
2565
/*===========*/
2566
			/* out: TRUE if succeed, FALSE if an archiving
2567
			operation was already running */
2568
	ibool	sync,	/* in: TRUE if synchronous operation is desired */
2569
	ulint*	n_bytes)/* out: archive log buffer size, 0 if nothing to
2570
			archive */
2571
{
2572
	ibool	calc_new_limit;
2573
	dulint	start_lsn;
2574
	dulint	limit_lsn;
2575
2576
	calc_new_limit = TRUE;
2577
loop:
2578
	mutex_enter(&(log_sys->mutex));
2579
2580
	if (log_sys->archiving_state == LOG_ARCH_OFF) {
2581
		mutex_exit(&(log_sys->mutex));
2582
2583
		*n_bytes = 0;
2584
2585
		return(TRUE);
2586
2587
	} else if (log_sys->archiving_state == LOG_ARCH_STOPPED
2588
		   || log_sys->archiving_state == LOG_ARCH_STOPPING2) {
2589
2590
		mutex_exit(&(log_sys->mutex));
2591
2592
		os_event_wait(log_sys->archiving_on);
2593
2594
		mutex_enter(&(log_sys->mutex));
2595
2596
		goto loop;
2597
	}
2598
2599
	start_lsn = log_sys->archived_lsn;
2600
2601
	if (calc_new_limit) {
2602
		ut_a(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0);
2603
		limit_lsn = ut_dulint_add(start_lsn,
2604
					  log_sys->archive_buf_size);
2605
2606
		*n_bytes = log_sys->archive_buf_size;
2607
2608
		if (ut_dulint_cmp(limit_lsn, log_sys->lsn) >= 0) {
2609
2610
			limit_lsn = ut_dulint_align_down(
2611
				log_sys->lsn, OS_FILE_LOG_BLOCK_SIZE);
2612
		}
2613
	}
2614
2615
	if (ut_dulint_cmp(log_sys->archived_lsn, limit_lsn) >= 0) {
2616
2617
		mutex_exit(&(log_sys->mutex));
2618
2619
		*n_bytes = 0;
2620
2621
		return(TRUE);
2622
	}
2623
2624
	if (ut_dulint_cmp(log_sys->written_to_all_lsn, limit_lsn) < 0) {
2625
2626
		mutex_exit(&(log_sys->mutex));
2627
2628
		log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
2629
2630
		calc_new_limit = FALSE;
2631
2632
		goto loop;
2633
	}
2634
2635
	if (log_sys->n_pending_archive_ios > 0) {
2636
		/* An archiving operation is running */
2637
2638
		mutex_exit(&(log_sys->mutex));
2639
2640
		if (sync) {
2641
			rw_lock_s_lock(&(log_sys->archive_lock));
2642
			rw_lock_s_unlock(&(log_sys->archive_lock));
2643
		}
2644
2645
		*n_bytes = log_sys->archive_buf_size;
2646
2647
		return(FALSE);
2648
	}
2649
2650
	rw_lock_x_lock_gen(&(log_sys->archive_lock), LOG_ARCHIVE);
2651
2652
	log_sys->archiving_phase = LOG_ARCHIVE_READ;
2653
2654
	log_sys->next_archived_lsn = limit_lsn;
2655
2656
#ifdef UNIV_DEBUG
2657
	if (log_debug_writes) {
2658
		fprintf(stderr,
2659
			"Archiving from lsn %lu %lu to lsn %lu %lu\n",
2660
			(ulong) ut_dulint_get_high(log_sys->archived_lsn),
2661
			(ulong) ut_dulint_get_low(log_sys->archived_lsn),
2662
			(ulong) ut_dulint_get_high(limit_lsn),
2663
			(ulong) ut_dulint_get_low(limit_lsn));
2664
	}
2665
#endif /* UNIV_DEBUG */
2666
2667
	/* Read the log segment to the archive buffer */
2668
2669
	log_group_read_log_seg(LOG_ARCHIVE, log_sys->archive_buf,
2670
			       UT_LIST_GET_FIRST(log_sys->log_groups),
2671
			       start_lsn, limit_lsn);
2672
2673
	mutex_exit(&(log_sys->mutex));
2674
2675
	if (sync) {
2676
		rw_lock_s_lock(&(log_sys->archive_lock));
2677
		rw_lock_s_unlock(&(log_sys->archive_lock));
2678
	}
2679
2680
	*n_bytes = log_sys->archive_buf_size;
2681
2682
	return(TRUE);
2683
}
2684
2685
/********************************************************************
2686
Writes the log contents to the archive at least up to the lsn when this
2687
function was called. */
2688
static
2689
void
2690
log_archive_all(void)
2691
/*=================*/
2692
{
2693
	dulint	present_lsn;
2694
	ulint	dummy;
2695
2696
	mutex_enter(&(log_sys->mutex));
2697
2698
	if (log_sys->archiving_state == LOG_ARCH_OFF) {
2699
		mutex_exit(&(log_sys->mutex));
2700
2701
		return;
2702
	}
2703
2704
	present_lsn = log_sys->lsn;
2705
2706
	mutex_exit(&(log_sys->mutex));
2707
2708
	log_pad_current_log_block();
2709
2710
	for (;;) {
2711
		mutex_enter(&(log_sys->mutex));
2712
2713
		if (ut_dulint_cmp(present_lsn, log_sys->archived_lsn) <= 0) {
2714
2715
			mutex_exit(&(log_sys->mutex));
2716
2717
			return;
2718
		}
2719
2720
		mutex_exit(&(log_sys->mutex));
2721
2722
		log_archive_do(TRUE, &dummy);
2723
	}
2724
}
2725
2726
/*********************************************************
2727
Closes the possible open archive log file (for each group) the first group,
2728
and if it was open, increments the group file count by 2, if desired. */
2729
static
2730
void
2731
log_archive_close_groups(
2732
/*=====================*/
2733
	ibool	increment_file_count)	/* in: TRUE if we want to increment
2734
					the file count */
2735
{
2736
	log_group_t*	group;
2737
	ulint		trunc_len;
2738
2739
	ut_ad(mutex_own(&(log_sys->mutex)));
2740
2741
	if (log_sys->archiving_state == LOG_ARCH_OFF) {
2742
2743
		return;
2744
	}
2745
2746
	group = UT_LIST_GET_FIRST(log_sys->log_groups);
2747
2748
	trunc_len = UNIV_PAGE_SIZE
2749
		* fil_space_get_size(group->archive_space_id);
2750
	if (trunc_len > 0) {
2751
		ut_a(trunc_len == group->file_size);
2752
2753
		/* Write a notice to the headers of archived log
2754
		files that the file write has been completed */
2755
2756
		log_group_archive_completed_header_write(
2757
			group, 0, log_sys->archived_lsn);
2758
2759
		fil_space_truncate_start(group->archive_space_id,
2760
					 trunc_len);
2761
		if (increment_file_count) {
2762
			group->archived_offset = 0;
2763
			group->archived_file_no += 2;
2764
		}
2765
2766
#ifdef UNIV_DEBUG
2767
		if (log_debug_writes) {
2768
			fprintf(stderr,
2769
				"Incrementing arch file no to %lu"
2770
				" in log group %lu\n",
2771
				(ulong) group->archived_file_no + 2,
2772
				(ulong) group->id);
2773
		}
2774
#endif /* UNIV_DEBUG */
2775
	}
2776
}
2777
2778
/********************************************************************
2779
Writes the log contents to the archive up to the lsn when this function was
2780
called, and stops the archiving. When archiving is started again, the archived
2781
log file numbers start from 2 higher, so that the archiving will not write
2782
again to the archived log files which exist when this function returns. */
2783
2784
ulint
2785
log_archive_stop(void)
2786
/*==================*/
2787
			/* out: DB_SUCCESS or DB_ERROR */
2788
{
2789
	ibool	success;
2790
2791
	mutex_enter(&(log_sys->mutex));
2792
2793
	if (log_sys->archiving_state != LOG_ARCH_ON) {
2794
2795
		mutex_exit(&(log_sys->mutex));
2796
2797
		return(DB_ERROR);
2798
	}
2799
2800
	log_sys->archiving_state = LOG_ARCH_STOPPING;
2801
2802
	mutex_exit(&(log_sys->mutex));
2803
2804
	log_archive_all();
2805
2806
	mutex_enter(&(log_sys->mutex));
2807
2808
	log_sys->archiving_state = LOG_ARCH_STOPPING2;
2809
	os_event_reset(log_sys->archiving_on);
2810
2811
	mutex_exit(&(log_sys->mutex));
2812
2813
	/* Wait for a possible archiving operation to end */
2814
2815
	rw_lock_s_lock(&(log_sys->archive_lock));
2816
	rw_lock_s_unlock(&(log_sys->archive_lock));
2817
2818
	mutex_enter(&(log_sys->mutex));
2819
2820
	/* Close all archived log files, incrementing the file count by 2,
2821
	if appropriate */
2822
2823
	log_archive_close_groups(TRUE);
2824
2825
	mutex_exit(&(log_sys->mutex));
2826
2827
	/* Make a checkpoint, so that if recovery is needed, the file numbers
2828
	of new archived log files will start from the right value */
2829
2830
	success = FALSE;
2831
2832
	while (!success) {
2833
		success = log_checkpoint(TRUE, TRUE);
2834
	}
2835
2836
	mutex_enter(&(log_sys->mutex));
2837
2838
	log_sys->archiving_state = LOG_ARCH_STOPPED;
2839
2840
	mutex_exit(&(log_sys->mutex));
2841
2842
	return(DB_SUCCESS);
2843
}
2844
2845
/********************************************************************
2846
Starts again archiving which has been stopped. */
2847
2848
ulint
2849
log_archive_start(void)
2850
/*===================*/
2851
			/* out: DB_SUCCESS or DB_ERROR */
2852
{
2853
	mutex_enter(&(log_sys->mutex));
2854
2855
	if (log_sys->archiving_state != LOG_ARCH_STOPPED) {
2856
2857
		mutex_exit(&(log_sys->mutex));
2858
2859
		return(DB_ERROR);
2860
	}
2861
2862
	log_sys->archiving_state = LOG_ARCH_ON;
2863
2864
	os_event_set(log_sys->archiving_on);
2865
2866
	mutex_exit(&(log_sys->mutex));
2867
2868
	return(DB_SUCCESS);
2869
}
2870
2871
/********************************************************************
2872
Stop archiving the log so that a gap may occur in the archived log files. */
2873
2874
ulint
2875
log_archive_noarchivelog(void)
2876
/*==========================*/
2877
			/* out: DB_SUCCESS or DB_ERROR */
2878
{
2879
loop:
2880
	mutex_enter(&(log_sys->mutex));
2881
2882
	if (log_sys->archiving_state == LOG_ARCH_STOPPED
2883
	    || log_sys->archiving_state == LOG_ARCH_OFF) {
2884
2885
		log_sys->archiving_state = LOG_ARCH_OFF;
2886
2887
		os_event_set(log_sys->archiving_on);
2888
2889
		mutex_exit(&(log_sys->mutex));
2890
2891
		return(DB_SUCCESS);
2892
	}
2893
2894
	mutex_exit(&(log_sys->mutex));
2895
2896
	log_archive_stop();
2897
2898
	os_thread_sleep(500000);
2899
2900
	goto loop;
2901
}
2902
2903
/********************************************************************
2904
Start archiving the log so that a gap may occur in the archived log files. */
2905
2906
ulint
2907
log_archive_archivelog(void)
2908
/*========================*/
2909
			/* out: DB_SUCCESS or DB_ERROR */
2910
{
2911
	mutex_enter(&(log_sys->mutex));
2912
2913
	if (log_sys->archiving_state == LOG_ARCH_OFF) {
2914
2915
		log_sys->archiving_state = LOG_ARCH_ON;
2916
2917
		log_sys->archived_lsn
2918
			= ut_dulint_align_down(log_sys->lsn,
2919
					       OS_FILE_LOG_BLOCK_SIZE);
2920
		mutex_exit(&(log_sys->mutex));
2921
2922
		return(DB_SUCCESS);
2923
	}
2924
2925
	mutex_exit(&(log_sys->mutex));
2926
2927
	return(DB_ERROR);
2928
}
2929
2930
/********************************************************************
2931
Tries to establish a big enough margin of free space in the log groups, such
2932
that a new log entry can be catenated without an immediate need for
2933
archiving. */
2934
static
2935
void
2936
log_archive_margin(void)
2937
/*====================*/
2938
{
2939
	log_t*	log		= log_sys;
2940
	ulint	age;
2941
	ibool	sync;
2942
	ulint	dummy;
2943
loop:
2944
	mutex_enter(&(log->mutex));
2945
2946
	if (log->archiving_state == LOG_ARCH_OFF) {
2947
		mutex_exit(&(log->mutex));
2948
2949
		return;
2950
	}
2951
2952
	age = ut_dulint_minus(log->lsn, log->archived_lsn);
2953
2954
	if (age > log->max_archived_lsn_age) {
2955
2956
		/* An archiving is urgent: we have to do synchronous i/o */
2957
2958
		sync = TRUE;
2959
2960
	} else if (age > log->max_archived_lsn_age_async) {
2961
2962
		/* An archiving is not urgent: we do asynchronous i/o */
2963
2964
		sync = FALSE;
2965
	} else {
2966
		/* No archiving required yet */
2967
2968
		mutex_exit(&(log->mutex));
2969
2970
		return;
2971
	}
2972
2973
	mutex_exit(&(log->mutex));
2974
2975
	log_archive_do(sync, &dummy);
2976
2977
	if (sync == TRUE) {
2978
		/* Check again that enough was written to the archive */
2979
2980
		goto loop;
2981
	}
2982
}
2983
#endif /* UNIV_LOG_ARCHIVE */
2984
2985
/************************************************************************
2986
Checks that there is enough free space in the log to start a new query step.
2987
Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
2988
function may only be called if the calling thread owns no synchronization
2989
objects! */
2990
2991
void
2992
log_check_margins(void)
2993
/*===================*/
2994
{
2995
loop:
2996
	log_flush_margin();
2997
2998
	log_checkpoint_margin();
2999
3000
#ifdef UNIV_LOG_ARCHIVE
3001
	log_archive_margin();
3002
#endif /* UNIV_LOG_ARCHIVE */
3003
3004
	mutex_enter(&(log_sys->mutex));
3005
3006
	if (log_sys->check_flush_or_checkpoint) {
3007
3008
		mutex_exit(&(log_sys->mutex));
3009
3010
		goto loop;
3011
	}
3012
3013
	mutex_exit(&(log_sys->mutex));
3014
}
3015
3016
/********************************************************************
3017
Makes a checkpoint at the latest lsn and writes it to first page of each
3018
data file in the database, so that we know that the file spaces contain
3019
all modifications up to that lsn. This can only be called at database
3020
shutdown. This function also writes all log in log files to the log archive. */
3021
3022
void
3023
logs_empty_and_mark_files_at_shutdown(void)
3024
/*=======================================*/
3025
{
3026
	dulint	lsn;
3027
	ulint	arch_log_no;
3028
3029
	if (srv_print_verbose_log) {
3030
		ut_print_timestamp(stderr);
3031
		fprintf(stderr, "  InnoDB: Starting shutdown...\n");
3032
	}
3033
	/* Wait until the master thread and all other operations are idle: our
3034
	algorithm only works if the server is idle at shutdown */
3035
3036
	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
3037
loop:
3038
	os_thread_sleep(100000);
3039
3040
	mutex_enter(&kernel_mutex);
3041
3042
	/* We need the monitor threads to stop before we proceed with a
3043
	normal shutdown. In case of very fast shutdown, however, we can
3044
	proceed without waiting for monitor threads. */
3045
3046
	if (srv_fast_shutdown < 2
3047
	   && (srv_error_monitor_active
3048
	      || srv_lock_timeout_and_monitor_active)) {
3049
3050
		mutex_exit(&kernel_mutex);
3051
3052
		goto loop;
3053
	}
3054
3055
	/* Check that there are no longer transactions. We need this wait even
3056
	for the 'very fast' shutdown, because the InnoDB layer may have
3057
	committed or prepared transactions and we don't want to lose them. */
3058
3059
	if (trx_n_mysql_transactions > 0
3060
	    || UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
3061
3062
		mutex_exit(&kernel_mutex);
3063
3064
		goto loop;
3065
	}
3066
3067
	if (srv_fast_shutdown == 2) {
3068
		/* In this fastest shutdown we do not flush the buffer pool:
3069
		it is essentially a 'crash' of the InnoDB server. Make sure
3070
		that the log is all flushed to disk, so that we can recover
3071
		all committed transactions in a crash recovery. We must not
3072
		write the lsn stamps to the data files, since at a startup
3073
		InnoDB deduces from the stamps if the previous shutdown was
3074
		clean. */
3075
3076
		log_buffer_flush_to_disk();
3077
3078
		return; /* We SKIP ALL THE REST !! */
3079
	}
3080
3081
	/* Check that the master thread is suspended */
3082
3083
	if (srv_n_threads_active[SRV_MASTER] != 0) {
3084
3085
		mutex_exit(&kernel_mutex);
3086
3087
		goto loop;
3088
	}
3089
3090
	mutex_exit(&kernel_mutex);
3091
3092
	mutex_enter(&(log_sys->mutex));
3093
3094
	if (log_sys->n_pending_checkpoint_writes
3095
#ifdef UNIV_LOG_ARCHIVE
3096
	    || log_sys->n_pending_archive_ios
3097
#endif /* UNIV_LOG_ARCHIVE */
3098
	    || log_sys->n_pending_writes) {
3099
3100
		mutex_exit(&(log_sys->mutex));
3101
3102
		goto loop;
3103
	}
3104
3105
	mutex_exit(&(log_sys->mutex));
3106
3107
	if (!buf_pool_check_no_pending_io()) {
3108
3109
		goto loop;
3110
	}
3111
3112
#ifdef UNIV_LOG_ARCHIVE
3113
	log_archive_all();
3114
#endif /* UNIV_LOG_ARCHIVE */
3115
3116
	log_make_checkpoint_at(ut_dulint_max, TRUE);
3117
3118
	mutex_enter(&(log_sys->mutex));
3119
3120
	lsn = log_sys->lsn;
3121
3122
	if ((ut_dulint_cmp(lsn, log_sys->last_checkpoint_lsn) != 0)
3123
#ifdef UNIV_LOG_ARCHIVE
3124
	    || (srv_log_archive_on
3125
		&& ut_dulint_cmp(lsn,
3126
				 ut_dulint_add(log_sys->archived_lsn,
3127
					       LOG_BLOCK_HDR_SIZE))
3128
		!= 0)
3129
#endif /* UNIV_LOG_ARCHIVE */
3130
	    ) {
3131
3132
		mutex_exit(&(log_sys->mutex));
3133
3134
		goto loop;
3135
	}
3136
3137
	arch_log_no = 0;
3138
3139
#ifdef UNIV_LOG_ARCHIVE
3140
	UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no;
3141
3142
	if (0 == UT_LIST_GET_FIRST(log_sys->log_groups)->archived_offset) {
3143
3144
		arch_log_no--;
3145
	}
3146
3147
	log_archive_close_groups(TRUE);
3148
#endif /* UNIV_LOG_ARCHIVE */
3149
3150
	mutex_exit(&(log_sys->mutex));
3151
3152
	mutex_enter(&kernel_mutex);
3153
	/* Check that the master thread has stayed suspended */
3154
	if (srv_n_threads_active[SRV_MASTER] != 0) {
3155
		fprintf(stderr,
3156
			"InnoDB: Warning: the master thread woke up"
3157
			" during shutdown\n");
3158
3159
		mutex_exit(&kernel_mutex);
3160
3161
		goto loop;
3162
	}
3163
	mutex_exit(&kernel_mutex);
3164
3165
	fil_flush_file_spaces(FIL_TABLESPACE);
3166
	fil_flush_file_spaces(FIL_LOG);
3167
3168
	/* The call fil_write_flushed_lsn_to_data_files() will pass the buffer
3169
	pool: therefore it is essential that the buffer pool has been
3170
	completely flushed to disk! (We do not call fil_write... if the
3171
	'very fast' shutdown is enabled.) */
3172
3173
	if (!buf_all_freed()) {
3174
3175
		goto loop;
3176
	}
3177
3178
	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
3179
3180
	/* Make some checks that the server really is quiet */
3181
	ut_a(srv_n_threads_active[SRV_MASTER] == 0);
3182
	ut_a(buf_all_freed());
3183
	ut_a(0 == ut_dulint_cmp(lsn, log_sys->lsn));
3184
3185
	if (ut_dulint_cmp(lsn, srv_start_lsn) < 0) {
3186
		fprintf(stderr,
3187
			"InnoDB: Error: log sequence number"
3188
			" at shutdown %lu %lu\n"
3189
			"InnoDB: is lower than at startup %lu %lu!\n",
3190
			(ulong) ut_dulint_get_high(lsn),
3191
			(ulong) ut_dulint_get_low(lsn),
3192
			(ulong) ut_dulint_get_high(srv_start_lsn),
3193
			(ulong) ut_dulint_get_low(srv_start_lsn));
3194
	}
3195
3196
	srv_shutdown_lsn = lsn;
3197
3198
	fil_write_flushed_lsn_to_data_files(lsn, arch_log_no);
3199
3200
	fil_flush_file_spaces(FIL_TABLESPACE);
3201
3202
	fil_close_all_files();
3203
3204
	/* Make some checks that the server really is quiet */
3205
	ut_a(srv_n_threads_active[SRV_MASTER] == 0);
3206
	ut_a(buf_all_freed());
3207
	ut_a(0 == ut_dulint_cmp(lsn, log_sys->lsn));
3208
}
3209
3210
/**********************************************************
3211
Checks by parsing that the catenated log segment for a single mtr is
3212
consistent. */
3213
3214
ibool
3215
log_check_log_recs(
3216
/*===============*/
3217
	byte*	buf,		/* in: pointer to the start of the log segment
3218
				in the log_sys->buf log buffer */
3219
	ulint	len,		/* in: segment length in bytes */
3220
	dulint	buf_start_lsn)	/* in: buffer start lsn */
3221
{
3222
	dulint	contiguous_lsn;
3223
	dulint	scanned_lsn;
3224
	byte*	start;
3225
	byte*	end;
3226
	byte*	buf1;
3227
	byte*	scan_buf;
3228
3229
	ut_ad(mutex_own(&(log_sys->mutex)));
3230
3231
	if (len == 0) {
3232
3233
		return(TRUE);
3234
	}
3235
3236
	start = ut_align_down(buf, OS_FILE_LOG_BLOCK_SIZE);
3237
	end = ut_align(buf + len, OS_FILE_LOG_BLOCK_SIZE);
3238
3239
	buf1 = mem_alloc((end - start) + OS_FILE_LOG_BLOCK_SIZE);
3240
	scan_buf = ut_align(buf1, OS_FILE_LOG_BLOCK_SIZE);
3241
3242
	ut_memcpy(scan_buf, start, end - start);
3243
3244
	recv_scan_log_recs(TRUE,
3245
			   (buf_pool->n_frames
3246
			    - recv_n_pool_free_frames) * UNIV_PAGE_SIZE,
3247
			   FALSE, scan_buf, end - start,
3248
			   ut_dulint_align_down(buf_start_lsn,
3249
						OS_FILE_LOG_BLOCK_SIZE),
3250
			   &contiguous_lsn, &scanned_lsn);
3251
3252
	ut_a(ut_dulint_cmp(scanned_lsn, ut_dulint_add(buf_start_lsn, len))
3253
	     == 0);
3254
	ut_a(ut_dulint_cmp(recv_sys->recovered_lsn, scanned_lsn) == 0);
3255
3256
	mem_free(buf1);
3257
3258
	return(TRUE);
3259
}
3260
3261
/**********************************************************
3262
Peeks the current lsn. */
3263
3264
ibool
3265
log_peek_lsn(
3266
/*=========*/
3267
			/* out: TRUE if success, FALSE if could not get the
3268
			log system mutex */
3269
	dulint*	lsn)	/* out: if returns TRUE, current lsn is here */
3270
{
3271
	if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
3272
		*lsn = log_sys->lsn;
3273
3274
		mutex_exit(&(log_sys->mutex));
3275
3276
		return(TRUE);
3277
	}
3278
3279
	return(FALSE);
3280
}
3281
3282
/**********************************************************
3283
Prints info of the log. */
3284
3285
void
3286
log_print(
3287
/*======*/
3288
	FILE*	file)	/* in: file where to print */
3289
{
3290
	double	time_elapsed;
3291
	time_t	current_time;
3292
3293
	mutex_enter(&(log_sys->mutex));
3294
3295
	fprintf(file,
3296
		"Log sequence number %lu %lu\n"
3297
		"Log flushed up to   %lu %lu\n"
3298
		"Last checkpoint at  %lu %lu\n",
3299
		(ulong) ut_dulint_get_high(log_sys->lsn),
3300
		(ulong) ut_dulint_get_low(log_sys->lsn),
3301
		(ulong) ut_dulint_get_high(log_sys->flushed_to_disk_lsn),
3302
		(ulong) ut_dulint_get_low(log_sys->flushed_to_disk_lsn),
3303
		(ulong) ut_dulint_get_high(log_sys->last_checkpoint_lsn),
3304
		(ulong) ut_dulint_get_low(log_sys->last_checkpoint_lsn));
3305
3306
	current_time = time(NULL);
3307
3308
	time_elapsed = 0.001 + difftime(current_time,
3309
					log_sys->last_printout_time);
3310
	fprintf(file,
3311
		"%lu pending log writes, %lu pending chkp writes\n"
3312
		"%lu log i/o's done, %.2f log i/o's/second\n",
3313
		(ulong) log_sys->n_pending_writes,
3314
		(ulong) log_sys->n_pending_checkpoint_writes,
3315
		(ulong) log_sys->n_log_ios,
3316
		((log_sys->n_log_ios - log_sys->n_log_ios_old)
3317
		 / time_elapsed));
3318
3319
	log_sys->n_log_ios_old = log_sys->n_log_ios;
3320
	log_sys->last_printout_time = current_time;
3321
3322
	mutex_exit(&(log_sys->mutex));
3323
}
3324
3325
/**************************************************************************
3326
Refreshes the statistics used to print per-second averages. */
3327
3328
void
3329
log_refresh_stats(void)
3330
/*===================*/
3331
{
3332
	log_sys->n_log_ios_old = log_sys->n_log_ios;
3333
	log_sys->last_printout_time = time(NULL);
3334
}