~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/******************************************************
2
The transaction
3
4
(c) 1996 Innobase Oy
5
6
Created 3/26/1996 Heikki Tuuri
7
*******************************************************/
8
9
#include "trx0trx.h"
10
11
#ifdef UNIV_NONINL
12
#include "trx0trx.ic"
13
#endif
14
15
#include "trx0undo.h"
16
#include "trx0rseg.h"
17
#include "log0log.h"
18
#include "que0que.h"
19
#include "lock0lock.h"
20
#include "trx0roll.h"
21
#include "usr0sess.h"
22
#include "read0read.h"
23
#include "srv0srv.h"
24
#include "thr0loc.h"
25
#include "btr0sea.h"
26
#include "os0proc.h"
27
#include "trx0xa.h"
28
#include "ha_prototypes.h"
29
30
/* Copy of the prototype for innobase_mysql_print_thd: this
31
copy MUST be equal to the one in mysql/sql/ha_innodb.cc ! */
32
33
void innobase_mysql_print_thd(
34
	FILE*	f,
35
	void*	thd,
36
	ulint	max_query_len);
37
38
/* Dummy session used currently in MySQL interface */
39
sess_t*		trx_dummy_sess = NULL;
40
41
/* Number of transactions currently allocated for MySQL: protected by
42
the kernel mutex */
43
ulint	trx_n_mysql_transactions = 0;
44
45
/*****************************************************************
46
Starts the transaction if it is not yet started. */
47
48
void
49
trx_start_if_not_started_noninline(
50
/*===============================*/
51
	trx_t*	trx) /* in: transaction */
52
{
53
	trx_start_if_not_started(trx);
54
}
55
56
/*****************************************************************
57
Set detailed error message for the transaction. */
58
59
void
60
trx_set_detailed_error(
61
/*===================*/
62
	trx_t*		trx,	/* in: transaction struct */
63
	const char*	msg)	/* in: detailed error message */
64
{
65
	ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
66
}
67
68
/*****************************************************************
69
Set detailed error message for the transaction from a file. Note that the
70
file is rewinded before reading from it. */
71
72
void
73
trx_set_detailed_error_from_file(
74
/*=============================*/
75
	trx_t*	trx,	/* in: transaction struct */
76
	FILE*	file)	/* in: file to read message from */
77
{
78
	os_file_read_string(file, trx->detailed_error,
79
			    sizeof(trx->detailed_error));
80
}
81
82
/********************************************************************
83
Retrieves the error_info field from a trx. */
84
85
void*
86
trx_get_error_info(
87
/*===============*/
88
			/* out: the error info */
89
	trx_t*	trx)	/* in: trx object */
90
{
91
	return(trx->error_info);
92
}
93
94
/********************************************************************
95
Creates and initializes a transaction object. */
96
97
trx_t*
98
trx_create(
99
/*=======*/
100
			/* out, own: the transaction */
101
	sess_t*	sess)	/* in: session or NULL */
102
{
103
	trx_t*	trx;
104
105
	ut_ad(mutex_own(&kernel_mutex));
106
107
	trx = mem_alloc(sizeof(trx_t));
108
109
	trx->magic_n = TRX_MAGIC_N;
110
111
	trx->op_info = "";
112
113
	trx->is_purge = 0;
114
	trx->conc_state = TRX_NOT_STARTED;
115
	trx->start_time = time(NULL);
116
117
	trx->isolation_level = TRX_ISO_REPEATABLE_READ;
118
119
	trx->id = ut_dulint_zero;
120
	trx->no = ut_dulint_max;
121
122
	trx->support_xa = TRUE;
123
124
	trx->check_foreigns = TRUE;
125
	trx->check_unique_secondary = TRUE;
126
127
	trx->flush_log_later = FALSE;
128
	trx->must_flush_log_later = FALSE;
129
130
	trx->dict_operation = FALSE;
131
132
	trx->mysql_thd = NULL;
133
	trx->mysql_query_str = NULL;
134
	trx->active_trans = 0;
135
	trx->duplicates = 0;
136
137
	trx->n_mysql_tables_in_use = 0;
138
	trx->mysql_n_tables_locked = 0;
139
140
	trx->mysql_log_file_name = NULL;
141
	trx->mysql_log_offset = 0;
142
143
	mutex_create(&trx->undo_mutex, SYNC_TRX_UNDO);
144
145
	trx->rseg = NULL;
146
147
	trx->undo_no = ut_dulint_zero;
148
	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
149
	trx->insert_undo = NULL;
150
	trx->update_undo = NULL;
151
	trx->undo_no_arr = NULL;
152
153
	trx->error_state = DB_SUCCESS;
154
	trx->detailed_error[0] = '\0';
155
156
	trx->sess = sess;
157
	trx->que_state = TRX_QUE_RUNNING;
158
	trx->n_active_thrs = 0;
159
160
	trx->handling_signals = FALSE;
161
162
	UT_LIST_INIT(trx->signals);
163
	UT_LIST_INIT(trx->reply_signals);
164
165
	trx->graph = NULL;
166
167
	trx->wait_lock = NULL;
168
	trx->was_chosen_as_deadlock_victim = FALSE;
169
	UT_LIST_INIT(trx->wait_thrs);
170
171
	trx->lock_heap = mem_heap_create_in_buffer(256);
172
	UT_LIST_INIT(trx->trx_locks);
173
174
	UT_LIST_INIT(trx->trx_savepoints);
175
176
	trx->dict_operation_lock_mode = 0;
177
	trx->has_search_latch = FALSE;
178
	trx->search_latch_timeout = BTR_SEA_TIMEOUT;
179
180
	trx->declared_to_be_inside_innodb = FALSE;
181
	trx->n_tickets_to_enter_innodb = 0;
182
183
	trx->auto_inc_lock = NULL;
184
185
	trx->global_read_view_heap = mem_heap_create(256);
186
	trx->global_read_view = NULL;
187
	trx->read_view = NULL;
188
189
	/* Set X/Open XA transaction identification to NULL */
190
	memset(&trx->xid, 0, sizeof(trx->xid));
191
	trx->xid.formatID = -1;
192
193
	trx->n_autoinc_rows = 0;
194
195
	trx_reset_new_rec_lock_info(trx);
196
197
	return(trx);
198
}
199
200
/************************************************************************
201
Creates a transaction object for MySQL. */
202
203
trx_t*
204
trx_allocate_for_mysql(void)
205
/*========================*/
206
				/* out, own: transaction object */
207
{
208
	trx_t*	trx;
209
210
	mutex_enter(&kernel_mutex);
211
212
	/* Open a dummy session */
213
214
	if (!trx_dummy_sess) {
215
		trx_dummy_sess = sess_open();
216
	}
217
218
	trx = trx_create(trx_dummy_sess);
219
220
	trx_n_mysql_transactions++;
221
222
	UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
223
224
	mutex_exit(&kernel_mutex);
225
226
	trx->mysql_thread_id = os_thread_get_curr_id();
227
228
	trx->mysql_process_no = os_proc_get_number();
229
230
	return(trx);
231
}
232
233
/************************************************************************
234
Creates a transaction object for background operations by the master thread. */
235
236
trx_t*
237
trx_allocate_for_background(void)
238
/*=============================*/
239
				/* out, own: transaction object */
240
{
241
	trx_t*	trx;
242
243
	mutex_enter(&kernel_mutex);
244
245
	/* Open a dummy session */
246
247
	if (!trx_dummy_sess) {
248
		trx_dummy_sess = sess_open();
249
	}
250
251
	trx = trx_create(trx_dummy_sess);
252
253
	mutex_exit(&kernel_mutex);
254
255
	return(trx);
256
}
257
258
/************************************************************************
259
Releases the search latch if trx has reserved it. */
260
261
void
262
trx_search_latch_release_if_reserved(
263
/*=================================*/
264
	trx_t*	   trx) /* in: transaction */
265
{
266
	if (trx->has_search_latch) {
267
		rw_lock_s_unlock(&btr_search_latch);
268
269
		trx->has_search_latch = FALSE;
270
	}
271
}
272
273
/************************************************************************
274
Frees a transaction object. */
275
276
void
277
trx_free(
278
/*=====*/
279
	trx_t*	trx)	/* in, own: trx object */
280
{
281
	ut_ad(mutex_own(&kernel_mutex));
282
283
	if (trx->declared_to_be_inside_innodb) {
284
		ut_print_timestamp(stderr);
285
		fputs("  InnoDB: Error: Freeing a trx which is declared"
286
		      " to be processing\n"
287
		      "InnoDB: inside InnoDB.\n", stderr);
288
		trx_print(stderr, trx, 600);
289
		putc('\n', stderr);
290
	}
291
292
	if (trx->n_mysql_tables_in_use != 0
293
	    || trx->mysql_n_tables_locked != 0) {
294
295
		ut_print_timestamp(stderr);
296
		fprintf(stderr,
297
			"  InnoDB: Error: MySQL is freeing a thd\n"
298
			"InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
299
			"InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
300
			(ulong)trx->n_mysql_tables_in_use,
301
			(ulong)trx->mysql_n_tables_locked);
302
303
		trx_print(stderr, trx, 600);
304
305
		ut_print_buf(stderr, trx, sizeof(trx_t));
306
	}
307
308
	ut_a(trx->magic_n == TRX_MAGIC_N);
309
310
	trx->magic_n = 11112222;
311
312
	ut_a(trx->conc_state == TRX_NOT_STARTED);
313
314
	mutex_free(&(trx->undo_mutex));
315
316
	ut_a(trx->insert_undo == NULL);
317
	ut_a(trx->update_undo == NULL);
318
319
	if (trx->undo_no_arr) {
320
		trx_undo_arr_free(trx->undo_no_arr);
321
	}
322
323
	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
324
	ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
325
326
	ut_a(trx->wait_lock == NULL);
327
	ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
328
329
	ut_a(!trx->has_search_latch);
330
	ut_a(!trx->auto_inc_lock);
331
332
	ut_a(trx->dict_operation_lock_mode == 0);
333
334
	if (trx->lock_heap) {
335
		mem_heap_free(trx->lock_heap);
336
	}
337
338
	ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
339
340
	if (trx->global_read_view_heap) {
341
		mem_heap_free(trx->global_read_view_heap);
342
	}
343
344
	trx->global_read_view = NULL;
345
346
	ut_a(trx->read_view == NULL);
347
348
	mem_free(trx);
349
}
350
351
/************************************************************************
352
Frees a transaction object for MySQL. */
353
354
void
355
trx_free_for_mysql(
356
/*===============*/
357
	trx_t*	trx)	/* in, own: trx object */
358
{
359
	mutex_enter(&kernel_mutex);
360
361
	UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
362
363
	trx_free(trx);
364
365
	ut_a(trx_n_mysql_transactions > 0);
366
367
	trx_n_mysql_transactions--;
368
369
	mutex_exit(&kernel_mutex);
370
}
371
372
/************************************************************************
373
Frees a transaction object of a background operation of the master thread. */
374
375
void
376
trx_free_for_background(
377
/*====================*/
378
	trx_t*	trx)	/* in, own: trx object */
379
{
380
	mutex_enter(&kernel_mutex);
381
382
	trx_free(trx);
383
384
	mutex_exit(&kernel_mutex);
385
}
386
387
/********************************************************************
388
Inserts the trx handle in the trx system trx list in the right position.
389
The list is sorted on the trx id so that the biggest id is at the list
390
start. This function is used at the database startup to insert incomplete
391
transactions to the list. */
392
static
393
void
394
trx_list_insert_ordered(
395
/*====================*/
396
	trx_t*	trx)	/* in: trx handle */
397
{
398
	trx_t*	trx2;
399
400
	ut_ad(mutex_own(&kernel_mutex));
401
402
	trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list);
403
404
	while (trx2 != NULL) {
405
		if (ut_dulint_cmp(trx->id, trx2->id) >= 0) {
406
407
			ut_ad(ut_dulint_cmp(trx->id, trx2->id) == 1);
408
			break;
409
		}
410
		trx2 = UT_LIST_GET_NEXT(trx_list, trx2);
411
	}
412
413
	if (trx2 != NULL) {
414
		trx2 = UT_LIST_GET_PREV(trx_list, trx2);
415
416
		if (trx2 == NULL) {
417
			UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
418
		} else {
419
			UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
420
					     trx2, trx);
421
		}
422
	} else {
423
		UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
424
	}
425
}
426
427
/********************************************************************
428
Creates trx objects for transactions and initializes the trx list of
429
trx_sys at database start. Rollback segment and undo log lists must
430
already exist when this function is called, because the lists of
431
transactions to be rolled back or cleaned up are built based on the
432
undo log lists. */
433
434
void
435
trx_lists_init_at_db_start(void)
436
/*============================*/
437
{
438
	trx_rseg_t*	rseg;
439
	trx_undo_t*	undo;
440
	trx_t*		trx;
441
442
	UT_LIST_INIT(trx_sys->trx_list);
443
444
	/* Look from the rollback segments if there exist undo logs for
445
	transactions */
446
447
	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
448
449
	while (rseg != NULL) {
450
		undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
451
452
		while (undo != NULL) {
453
454
			trx = trx_create(NULL);
455
456
			trx->id = undo->trx_id;
457
			trx->xid = undo->xid;
458
			trx->insert_undo = undo;
459
			trx->rseg = rseg;
460
461
			if (undo->state != TRX_UNDO_ACTIVE) {
462
463
				/* Prepared transactions are left in
464
				the prepared state waiting for a
465
				commit or abort decision from MySQL */
466
467
				if (undo->state == TRX_UNDO_PREPARED) {
468
469
					fprintf(stderr,
470
						"InnoDB: Transaction %lu %lu"
471
						" was in the"
472
						" XA prepared state.\n",
473
						ut_dulint_get_high(trx->id),
474
						ut_dulint_get_low(trx->id));
475
476
					if (srv_force_recovery == 0) {
477
478
						trx->conc_state = TRX_PREPARED;
479
					} else {
480
						fprintf(stderr,
481
							"InnoDB: Since"
482
							" innodb_force_recovery"
483
							" > 0, we will"
484
							" rollback it"
485
							" anyway.\n");
486
487
						trx->conc_state = TRX_ACTIVE;
488
					}
489
				} else {
490
					trx->conc_state
491
						= TRX_COMMITTED_IN_MEMORY;
492
				}
493
494
				/* We give a dummy value for the trx no;
495
				this should have no relevance since purge
496
				is not interested in committed transaction
497
				numbers, unless they are in the history
498
				list, in which case it looks the number
499
				from the disk based undo log structure */
500
501
				trx->no = trx->id;
502
			} else {
503
				trx->conc_state = TRX_ACTIVE;
504
505
				/* A running transaction always has the number
506
				field inited to ut_dulint_max */
507
508
				trx->no = ut_dulint_max;
509
			}
510
511
			if (undo->dict_operation) {
512
				trx->dict_operation = undo->dict_operation;
513
				trx->table_id = undo->table_id;
514
			}
515
516
			if (!undo->empty) {
517
				trx->undo_no = ut_dulint_add(undo->top_undo_no,
518
							     1);
519
			}
520
521
			trx_list_insert_ordered(trx);
522
523
			undo = UT_LIST_GET_NEXT(undo_list, undo);
524
		}
525
526
		undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
527
528
		while (undo != NULL) {
529
			trx = trx_get_on_id(undo->trx_id);
530
531
			if (NULL == trx) {
532
				trx = trx_create(NULL);
533
534
				trx->id = undo->trx_id;
535
				trx->xid = undo->xid;
536
537
				if (undo->state != TRX_UNDO_ACTIVE) {
538
539
					/* Prepared transactions are left in
540
					the prepared state waiting for a
541
					commit or abort decision from MySQL */
542
543
					if (undo->state == TRX_UNDO_PREPARED) {
544
						fprintf(stderr,
545
							"InnoDB: Transaction"
546
							" %lu %lu was in the"
547
							" XA prepared state.\n",
548
							ut_dulint_get_high(
549
								trx->id),
550
							ut_dulint_get_low(
551
								trx->id));
552
553
						if (srv_force_recovery == 0) {
554
555
							trx->conc_state
556
								= TRX_PREPARED;
557
						} else {
558
							fprintf(stderr,
559
								"InnoDB: Since"
560
								" innodb_force_recovery"
561
								" > 0, we will"
562
								" rollback it"
563
								" anyway.\n");
564
565
							trx->conc_state
566
								= TRX_ACTIVE;
567
						}
568
					} else {
569
						trx->conc_state
570
							= TRX_COMMITTED_IN_MEMORY;
571
					}
572
573
					/* We give a dummy value for the trx
574
					number */
575
576
					trx->no = trx->id;
577
				} else {
578
					trx->conc_state = TRX_ACTIVE;
579
580
					/* A running transaction always has
581
					the number field inited to
582
					ut_dulint_max */
583
584
					trx->no = ut_dulint_max;
585
				}
586
587
				trx->rseg = rseg;
588
				trx_list_insert_ordered(trx);
589
590
				if (undo->dict_operation) {
591
					trx->dict_operation
592
						= undo->dict_operation;
593
					trx->table_id = undo->table_id;
594
				}
595
			}
596
597
			trx->update_undo = undo;
598
599
			if ((!undo->empty)
600
			    && (ut_dulint_cmp(undo->top_undo_no,
601
					      trx->undo_no) >= 0)) {
602
603
				trx->undo_no = ut_dulint_add(undo->top_undo_no,
604
							     1);
605
			}
606
607
			undo = UT_LIST_GET_NEXT(undo_list, undo);
608
		}
609
610
		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
611
	}
612
}
613
614
/**********************************************************************
615
Assigns a rollback segment to a transaction in a round-robin fashion.
616
Skips the SYSTEM rollback segment if another is available. */
617
UNIV_INLINE
618
ulint
619
trx_assign_rseg(void)
620
/*=================*/
621
			/* out: assigned rollback segment id */
622
{
623
	trx_rseg_t*	rseg	= trx_sys->latest_rseg;
624
625
	ut_ad(mutex_own(&kernel_mutex));
626
loop:
627
	/* Get next rseg in a round-robin fashion */
628
629
	rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
630
631
	if (rseg == NULL) {
632
		rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
633
	}
634
635
	/* If it is the SYSTEM rollback segment, and there exist others, skip
636
	it */
637
638
	if ((rseg->id == TRX_SYS_SYSTEM_RSEG_ID)
639
	    && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1)) {
640
		goto loop;
641
	}
642
643
	trx_sys->latest_rseg = rseg;
644
645
	return(rseg->id);
646
}
647
648
/********************************************************************
649
Starts a new transaction. */
650
651
ibool
652
trx_start_low(
653
/*==========*/
654
			/* out: TRUE */
655
	trx_t*	trx,	/* in: transaction */
656
	ulint	rseg_id)/* in: rollback segment id; if ULINT_UNDEFINED
657
			is passed, the system chooses the rollback segment
658
			automatically in a round-robin fashion */
659
{
660
	trx_rseg_t*	rseg;
661
662
	ut_ad(mutex_own(&kernel_mutex));
663
	ut_ad(trx->rseg == NULL);
664
665
	if (trx->is_purge) {
666
		trx->id = ut_dulint_zero;
667
		trx->conc_state = TRX_ACTIVE;
668
		trx->start_time = time(NULL);
669
670
		return(TRUE);
671
	}
672
673
	ut_ad(trx->conc_state != TRX_ACTIVE);
674
675
	if (rseg_id == ULINT_UNDEFINED) {
676
677
		rseg_id = trx_assign_rseg();
678
	}
679
680
	rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
681
682
	trx->id = trx_sys_get_new_trx_id();
683
684
	/* The initial value for trx->no: ut_dulint_max is used in
685
	read_view_open_now: */
686
687
	trx->no = ut_dulint_max;
688
689
	trx->rseg = rseg;
690
691
	trx->conc_state = TRX_ACTIVE;
692
	trx->start_time = time(NULL);
693
694
	UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
695
696
	return(TRUE);
697
}
698
699
/********************************************************************
700
Starts a new transaction. */
701
702
ibool
703
trx_start(
704
/*======*/
705
			/* out: TRUE */
706
	trx_t*	trx,	/* in: transaction */
707
	ulint	rseg_id)/* in: rollback segment id; if ULINT_UNDEFINED
708
			is passed, the system chooses the rollback segment
709
			automatically in a round-robin fashion */
710
{
711
	ibool	ret;
712
713
	mutex_enter(&kernel_mutex);
714
715
	ret = trx_start_low(trx, rseg_id);
716
717
	mutex_exit(&kernel_mutex);
718
719
	return(ret);
720
}
721
722
/********************************************************************
723
Commits a transaction. */
724
725
void
726
trx_commit_off_kernel(
727
/*==================*/
728
	trx_t*	trx)	/* in: transaction */
729
{
730
	page_t*		update_hdr_page;
731
	dulint		lsn;
732
	trx_rseg_t*	rseg;
733
	trx_undo_t*	undo;
734
	ibool		must_flush_log	= FALSE;
735
	mtr_t		mtr;
736
737
	ut_ad(mutex_own(&kernel_mutex));
738
739
	trx->must_flush_log_later = FALSE;
740
741
	rseg = trx->rseg;
742
743
	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
744
745
		mutex_exit(&kernel_mutex);
746
747
		mtr_start(&mtr);
748
749
		must_flush_log = TRUE;
750
751
		/* Change the undo log segment states from TRX_UNDO_ACTIVE
752
		to some other state: these modifications to the file data
753
		structure define the transaction as committed in the file
754
		based world, at the serialization point of the log sequence
755
		number lsn obtained below. */
756
757
		mutex_enter(&(rseg->mutex));
758
759
		if (trx->insert_undo != NULL) {
760
			trx_undo_set_state_at_finish(
761
				rseg, trx, trx->insert_undo, &mtr);
762
		}
763
764
		undo = trx->update_undo;
765
766
		if (undo) {
767
			mutex_enter(&kernel_mutex);
768
			trx->no = trx_sys_get_new_trx_no();
769
770
			mutex_exit(&kernel_mutex);
771
772
			/* It is not necessary to obtain trx->undo_mutex here
773
			because only a single OS thread is allowed to do the
774
			transaction commit for this transaction. */
775
776
			update_hdr_page = trx_undo_set_state_at_finish(
777
				rseg, trx, undo, &mtr);
778
779
			/* We have to do the cleanup for the update log while
780
			holding the rseg mutex because update log headers
781
			have to be put to the history list in the order of
782
			the trx number. */
783
784
			trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
785
		}
786
787
		mutex_exit(&(rseg->mutex));
788
789
		/* Update the latest MySQL binlog name and offset info
790
		in trx sys header if MySQL binlogging is on or the database
791
		server is a MySQL replication slave */
792
793
		if (trx->mysql_log_file_name
794
		    && trx->mysql_log_file_name[0] != '\0') {
795
			trx_sys_update_mysql_binlog_offset(
796
				trx->mysql_log_file_name,
797
				trx->mysql_log_offset,
798
				TRX_SYS_MYSQL_LOG_INFO, &mtr);
799
			trx->mysql_log_file_name = NULL;
800
		}
801
802
		/* The following call commits the mini-transaction, making the
803
		whole transaction committed in the file-based world, at this
804
		log sequence number. The transaction becomes 'durable' when
805
		we write the log to disk, but in the logical sense the commit
806
		in the file-based data structures (undo logs etc.) happens
807
		here.
808
809
		NOTE that transaction numbers, which are assigned only to
810
		transactions with an update undo log, do not necessarily come
811
		in exactly the same order as commit lsn's, if the transactions
812
		have different rollback segments. To get exactly the same
813
		order we should hold the kernel mutex up to this point,
814
		adding to to the contention of the kernel mutex. However, if
815
		a transaction T2 is able to see modifications made by
816
		a transaction T1, T2 will always get a bigger transaction
817
		number and a bigger commit lsn than T1. */
818
819
		/*--------------*/
820
		mtr_commit(&mtr);
821
		/*--------------*/
822
		lsn = mtr.end_lsn;
823
824
		mutex_enter(&kernel_mutex);
825
	}
826
827
	ut_ad(trx->conc_state == TRX_ACTIVE
828
	      || trx->conc_state == TRX_PREPARED);
829
	ut_ad(mutex_own(&kernel_mutex));
830
831
	/* The following assignment makes the transaction committed in memory
832
	and makes its changes to data visible to other transactions.
833
	NOTE that there is a small discrepancy from the strict formal
834
	visibility rules here: a human user of the database can see
835
	modifications made by another transaction T even before the necessary
836
	log segment has been flushed to the disk. If the database happens to
837
	crash before the flush, the user has seen modifications from T which
838
	will never be a committed transaction. However, any transaction T2
839
	which sees the modifications of the committing transaction T, and
840
	which also itself makes modifications to the database, will get an lsn
841
	larger than the committing transaction T. In the case where the log
842
	flush fails, and T never gets committed, also T2 will never get
843
	committed. */
844
845
	/*--------------------------------------*/
846
	trx->conc_state = TRX_COMMITTED_IN_MEMORY;
847
	/*--------------------------------------*/
848
849
	lock_release_off_kernel(trx);
850
851
	if (trx->global_read_view) {
852
		read_view_close(trx->global_read_view);
853
		mem_heap_empty(trx->global_read_view_heap);
854
		trx->global_read_view = NULL;
855
	}
856
857
	trx->read_view = NULL;
858
859
	if (must_flush_log) {
860
861
		mutex_exit(&kernel_mutex);
862
863
		if (trx->insert_undo != NULL) {
864
865
			trx_undo_insert_cleanup(trx);
866
		}
867
868
		/* NOTE that we could possibly make a group commit more
869
		efficient here: call os_thread_yield here to allow also other
870
		trxs to come to commit! */
871
872
		/*-------------------------------------*/
873
874
		/* Depending on the my.cnf options, we may now write the log
875
		buffer to the log files, making the transaction durable if
876
		the OS does not crash. We may also flush the log files to
877
		disk, making the transaction durable also at an OS crash or a
878
		power outage.
879
880
		The idea in InnoDB's group commit is that a group of
881
		transactions gather behind a trx doing a physical disk write
882
		to log files, and when that physical write has been completed,
883
		one of those transactions does a write which commits the whole
884
		group. Note that this group commit will only bring benefit if
885
		there are > 2 users in the database. Then at least 2 users can
886
		gather behind one doing the physical log write to disk.
887
888
		If we are calling trx_commit() under MySQL's binlog mutex, we
889
		will delay possible log write and flush to a separate function
890
		trx_commit_complete_for_mysql(), which is only called when the
891
		thread has released the binlog mutex. This is to make the
892
		group commit algorithm to work. Otherwise, the MySQL binlog
893
		mutex would serialize all commits and prevent a group of
894
		transactions from gathering. */
895
896
		if (trx->flush_log_later) {
897
			/* Do nothing yet */
898
			trx->must_flush_log_later = TRUE;
899
		} else if (srv_flush_log_at_trx_commit == 0) {
900
			/* Do nothing */
901
		} else if (srv_flush_log_at_trx_commit == 1) {
902
			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
903
				/* Write the log but do not flush it to disk */
904
905
				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
906
						FALSE);
907
			} else {
908
				/* Write the log to the log files AND flush
909
				them to disk */
910
911
				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
912
			}
913
		} else if (srv_flush_log_at_trx_commit == 2) {
914
915
			/* Write the log but do not flush it to disk */
916
917
			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
918
		} else {
919
			ut_error;
920
		}
921
922
		trx->commit_lsn = lsn;
923
924
		/*-------------------------------------*/
925
926
		mutex_enter(&kernel_mutex);
927
	}
928
929
	/* Free savepoints */
930
	trx_roll_savepoints_free(trx, NULL);
931
932
	trx->conc_state = TRX_NOT_STARTED;
933
	trx->rseg = NULL;
934
	trx->undo_no = ut_dulint_zero;
935
	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
936
937
	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
938
	ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
939
940
	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
941
}
942
943
/********************************************************************
944
Cleans up a transaction at database startup. The cleanup is needed if
945
the transaction already got to the middle of a commit when the database
946
crashed, andf we cannot roll it back. */
947
948
void
949
trx_cleanup_at_db_startup(
950
/*======================*/
951
	trx_t*	trx)	/* in: transaction */
952
{
953
	if (trx->insert_undo != NULL) {
954
955
		trx_undo_insert_cleanup(trx);
956
	}
957
958
	trx->conc_state = TRX_NOT_STARTED;
959
	trx->rseg = NULL;
960
	trx->undo_no = ut_dulint_zero;
961
	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
962
963
	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
964
}
965
966
/************************************************************************
967
Assigns a read view for a consistent read query. All the consistent reads
968
within the same transaction will get the same read view, which is created
969
when this function is first called for a new started transaction. */
970
971
read_view_t*
972
trx_assign_read_view(
973
/*=================*/
974
			/* out: consistent read view */
975
	trx_t*	trx)	/* in: active transaction */
976
{
977
	ut_ad(trx->conc_state == TRX_ACTIVE);
978
979
	if (trx->read_view) {
980
		return(trx->read_view);
981
	}
982
983
	mutex_enter(&kernel_mutex);
984
985
	if (!trx->read_view) {
986
		trx->read_view = read_view_open_now(
987
			trx->id, trx->global_read_view_heap);
988
		trx->global_read_view = trx->read_view;
989
	}
990
991
	mutex_exit(&kernel_mutex);
992
993
	return(trx->read_view);
994
}
995
996
/********************************************************************
997
Commits a transaction. NOTE that the kernel mutex is temporarily released. */
998
static
999
void
1000
trx_handle_commit_sig_off_kernel(
1001
/*=============================*/
1002
	trx_t*		trx,		/* in: transaction */
1003
	que_thr_t**	next_thr)	/* in/out: next query thread to run;
1004
					if the value which is passed in is
1005
					a pointer to a NULL pointer, then the
1006
					calling function can start running
1007
					a new query thread */
1008
{
1009
	trx_sig_t*	sig;
1010
	trx_sig_t*	next_sig;
1011
1012
	ut_ad(mutex_own(&kernel_mutex));
1013
1014
	trx->que_state = TRX_QUE_COMMITTING;
1015
1016
	trx_commit_off_kernel(trx);
1017
1018
	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
1019
1020
	/* Remove all TRX_SIG_COMMIT signals from the signal queue and send
1021
	reply messages to them */
1022
1023
	sig = UT_LIST_GET_FIRST(trx->signals);
1024
1025
	while (sig != NULL) {
1026
		next_sig = UT_LIST_GET_NEXT(signals, sig);
1027
1028
		if (sig->type == TRX_SIG_COMMIT) {
1029
1030
			trx_sig_reply(sig, next_thr);
1031
			trx_sig_remove(trx, sig);
1032
		}
1033
1034
		sig = next_sig;
1035
	}
1036
1037
	trx->que_state = TRX_QUE_RUNNING;
1038
}
1039
1040
/***************************************************************
1041
The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
1042
the TRX_QUE_RUNNING state and releases query threads which were
1043
waiting for a lock in the wait_thrs list. */
1044
1045
void
1046
trx_end_lock_wait(
1047
/*==============*/
1048
	trx_t*	trx)	/* in: transaction */
1049
{
1050
	que_thr_t*	thr;
1051
1052
	ut_ad(mutex_own(&kernel_mutex));
1053
	ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
1054
1055
	thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1056
1057
	while (thr != NULL) {
1058
		que_thr_end_wait_no_next_thr(thr);
1059
1060
		UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
1061
1062
		thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1063
	}
1064
1065
	trx->que_state = TRX_QUE_RUNNING;
1066
}
1067
1068
/***************************************************************
1069
Moves the query threads in the lock wait list to the SUSPENDED state and puts
1070
the transaction to the TRX_QUE_RUNNING state. */
1071
static
1072
void
1073
trx_lock_wait_to_suspended(
1074
/*=======================*/
1075
	trx_t*	trx)	/* in: transaction in the TRX_QUE_LOCK_WAIT state */
1076
{
1077
	que_thr_t*	thr;
1078
1079
	ut_ad(mutex_own(&kernel_mutex));
1080
	ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
1081
1082
	thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1083
1084
	while (thr != NULL) {
1085
		thr->state = QUE_THR_SUSPENDED;
1086
1087
		UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
1088
1089
		thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1090
	}
1091
1092
	trx->que_state = TRX_QUE_RUNNING;
1093
}
1094
1095
/***************************************************************
1096
Moves the query threads in the sig reply wait list of trx to the SUSPENDED
1097
state. */
1098
static
1099
void
1100
trx_sig_reply_wait_to_suspended(
1101
/*============================*/
1102
	trx_t*	trx)	/* in: transaction */
1103
{
1104
	trx_sig_t*	sig;
1105
	que_thr_t*	thr;
1106
1107
	ut_ad(mutex_own(&kernel_mutex));
1108
1109
	sig = UT_LIST_GET_FIRST(trx->reply_signals);
1110
1111
	while (sig != NULL) {
1112
		thr = sig->receiver;
1113
1114
		ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT);
1115
1116
		thr->state = QUE_THR_SUSPENDED;
1117
1118
		sig->receiver = NULL;
1119
1120
		UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig);
1121
1122
		sig = UT_LIST_GET_FIRST(trx->reply_signals);
1123
	}
1124
}
1125
1126
/*********************************************************************
1127
Checks the compatibility of a new signal with the other signals in the
1128
queue. */
1129
static
1130
ibool
1131
trx_sig_is_compatible(
1132
/*==================*/
1133
			/* out: TRUE if the signal can be queued */
1134
	trx_t*	trx,	/* in: trx handle */
1135
	ulint	type,	/* in: signal type */
1136
	ulint	sender)	/* in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */
1137
{
1138
	trx_sig_t*	sig;
1139
1140
	ut_ad(mutex_own(&kernel_mutex));
1141
1142
	if (UT_LIST_GET_LEN(trx->signals) == 0) {
1143
1144
		return(TRUE);
1145
	}
1146
1147
	if (sender == TRX_SIG_SELF) {
1148
		if (type == TRX_SIG_ERROR_OCCURRED) {
1149
1150
			return(TRUE);
1151
1152
		} else if (type == TRX_SIG_BREAK_EXECUTION) {
1153
1154
			return(TRUE);
1155
		} else {
1156
			return(FALSE);
1157
		}
1158
	}
1159
1160
	ut_ad(sender == TRX_SIG_OTHER_SESS);
1161
1162
	sig = UT_LIST_GET_FIRST(trx->signals);
1163
1164
	if (type == TRX_SIG_COMMIT) {
1165
		while (sig != NULL) {
1166
1167
			if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
1168
1169
				return(FALSE);
1170
			}
1171
1172
			sig = UT_LIST_GET_NEXT(signals, sig);
1173
		}
1174
1175
		return(TRUE);
1176
1177
	} else if (type == TRX_SIG_TOTAL_ROLLBACK) {
1178
		while (sig != NULL) {
1179
1180
			if (sig->type == TRX_SIG_COMMIT) {
1181
1182
				return(FALSE);
1183
			}
1184
1185
			sig = UT_LIST_GET_NEXT(signals, sig);
1186
		}
1187
1188
		return(TRUE);
1189
1190
	} else if (type == TRX_SIG_BREAK_EXECUTION) {
1191
1192
		return(TRUE);
1193
	} else {
1194
		ut_error;
1195
1196
		return(FALSE);
1197
	}
1198
}
1199
1200
/********************************************************************
1201
Sends a signal to a trx object. */
1202
1203
void
1204
trx_sig_send(
1205
/*=========*/
1206
	trx_t*		trx,		/* in: trx handle */
1207
	ulint		type,		/* in: signal type */
1208
	ulint		sender,		/* in: TRX_SIG_SELF or
1209
					TRX_SIG_OTHER_SESS */
1210
	que_thr_t*	receiver_thr,	/* in: query thread which wants the
1211
					reply, or NULL; if type is
1212
					TRX_SIG_END_WAIT, this must be NULL */
1213
	trx_savept_t*	savept,		/* in: possible rollback savepoint, or
1214
					NULL */
1215
	que_thr_t**	next_thr)	/* in/out: next query thread to run;
1216
					if the value which is passed in is
1217
					a pointer to a NULL pointer, then the
1218
					calling function can start running
1219
					a new query thread; if the parameter
1220
					is NULL, it is ignored */
1221
{
1222
	trx_sig_t*	sig;
1223
	trx_t*		receiver_trx;
1224
1225
	ut_ad(trx);
1226
	ut_ad(mutex_own(&kernel_mutex));
1227
1228
	if (!trx_sig_is_compatible(trx, type, sender)) {
1229
		/* The signal is not compatible with the other signals in
1230
		the queue: die */
1231
1232
		ut_error;
1233
	}
1234
1235
	/* Queue the signal object */
1236
1237
	if (UT_LIST_GET_LEN(trx->signals) == 0) {
1238
1239
		/* The signal list is empty: the 'sig' slot must be unused
1240
		(we improve performance a bit by avoiding mem_alloc) */
1241
		sig = &(trx->sig);
1242
	} else {
1243
		/* It might be that the 'sig' slot is unused also in this
1244
		case, but we choose the easy way of using mem_alloc */
1245
1246
		sig = mem_alloc(sizeof(trx_sig_t));
1247
	}
1248
1249
	UT_LIST_ADD_LAST(signals, trx->signals, sig);
1250
1251
	sig->type = type;
1252
	sig->sender = sender;
1253
	sig->receiver = receiver_thr;
1254
1255
	if (savept) {
1256
		sig->savept = *savept;
1257
	}
1258
1259
	if (receiver_thr) {
1260
		receiver_trx = thr_get_trx(receiver_thr);
1261
1262
		UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
1263
				 sig);
1264
	}
1265
1266
	if (trx->sess->state == SESS_ERROR) {
1267
1268
		trx_sig_reply_wait_to_suspended(trx);
1269
	}
1270
1271
	if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
1272
		ut_error;
1273
	}
1274
1275
	/* If there were no other signals ahead in the queue, try to start
1276
	handling of the signal */
1277
1278
	if (UT_LIST_GET_FIRST(trx->signals) == sig) {
1279
1280
		trx_sig_start_handle(trx, next_thr);
1281
	}
1282
}
1283
1284
/********************************************************************
1285
Ends signal handling. If the session is in the error state, and
1286
trx->graph_before_signal_handling != NULL, then returns control to the error
1287
handling routine of the graph (currently just returns the control to the
1288
graph root which then will send an error message to the client). */
1289
1290
void
1291
trx_end_signal_handling(
1292
/*====================*/
1293
	trx_t*	trx)	/* in: trx */
1294
{
1295
	ut_ad(mutex_own(&kernel_mutex));
1296
	ut_ad(trx->handling_signals == TRUE);
1297
1298
	trx->handling_signals = FALSE;
1299
1300
	trx->graph = trx->graph_before_signal_handling;
1301
1302
	if (trx->graph && (trx->sess->state == SESS_ERROR)) {
1303
1304
		que_fork_error_handle(trx, trx->graph);
1305
	}
1306
}
1307
1308
/********************************************************************
1309
Starts handling of a trx signal. */
1310
1311
void
1312
trx_sig_start_handle(
1313
/*=================*/
1314
	trx_t*		trx,		/* in: trx handle */
1315
	que_thr_t**	next_thr)	/* in/out: next query thread to run;
1316
					if the value which is passed in is
1317
					a pointer to a NULL pointer, then the
1318
					calling function can start running
1319
					a new query thread; if the parameter
1320
					is NULL, it is ignored */
1321
{
1322
	trx_sig_t*	sig;
1323
	ulint		type;
1324
loop:
1325
	/* We loop in this function body as long as there are queued signals
1326
	we can process immediately */
1327
1328
	ut_ad(trx);
1329
	ut_ad(mutex_own(&kernel_mutex));
1330
1331
	if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
1332
1333
		trx_end_signal_handling(trx);
1334
1335
		return;
1336
	}
1337
1338
	if (trx->conc_state == TRX_NOT_STARTED) {
1339
1340
		trx_start_low(trx, ULINT_UNDEFINED);
1341
	}
1342
1343
	/* If the trx is in a lock wait state, moves the waiting query threads
1344
	to the suspended state */
1345
1346
	if (trx->que_state == TRX_QUE_LOCK_WAIT) {
1347
1348
		trx_lock_wait_to_suspended(trx);
1349
	}
1350
1351
	/* If the session is in the error state and this trx has threads
1352
	waiting for reply from signals, moves these threads to the suspended
1353
	state, canceling wait reservations; note that if the transaction has
1354
	sent a commit or rollback signal to itself, and its session is not in
1355
	the error state, then nothing is done here. */
1356
1357
	if (trx->sess->state == SESS_ERROR) {
1358
		trx_sig_reply_wait_to_suspended(trx);
1359
	}
1360
1361
	/* If there are no running query threads, we can start processing of a
1362
	signal, otherwise we have to wait until all query threads of this
1363
	transaction are aware of the arrival of the signal. */
1364
1365
	if (trx->n_active_thrs > 0) {
1366
1367
		return;
1368
	}
1369
1370
	if (trx->handling_signals == FALSE) {
1371
		trx->graph_before_signal_handling = trx->graph;
1372
1373
		trx->handling_signals = TRUE;
1374
	}
1375
1376
	sig = UT_LIST_GET_FIRST(trx->signals);
1377
	type = sig->type;
1378
1379
	if (type == TRX_SIG_COMMIT) {
1380
1381
		trx_handle_commit_sig_off_kernel(trx, next_thr);
1382
1383
	} else if ((type == TRX_SIG_TOTAL_ROLLBACK)
1384
		   || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
1385
1386
		trx_rollback(trx, sig, next_thr);
1387
1388
		/* No further signals can be handled until the rollback
1389
		completes, therefore we return */
1390
1391
		return;
1392
1393
	} else if (type == TRX_SIG_ERROR_OCCURRED) {
1394
1395
		trx_rollback(trx, sig, next_thr);
1396
1397
		/* No further signals can be handled until the rollback
1398
		completes, therefore we return */
1399
1400
		return;
1401
1402
	} else if (type == TRX_SIG_BREAK_EXECUTION) {
1403
1404
		trx_sig_reply(sig, next_thr);
1405
		trx_sig_remove(trx, sig);
1406
	} else {
1407
		ut_error;
1408
	}
1409
1410
	goto loop;
1411
}
1412
1413
/********************************************************************
1414
Send the reply message when a signal in the queue of the trx has been
1415
handled. */
1416
1417
void
1418
trx_sig_reply(
1419
/*==========*/
1420
	trx_sig_t*	sig,		/* in: signal */
1421
	que_thr_t**	next_thr)	/* in/out: next query thread to run;
1422
					if the value which is passed in is
1423
					a pointer to a NULL pointer, then the
1424
					calling function can start running
1425
					a new query thread */
1426
{
1427
	trx_t*	receiver_trx;
1428
1429
	ut_ad(sig);
1430
	ut_ad(mutex_own(&kernel_mutex));
1431
1432
	if (sig->receiver != NULL) {
1433
		ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT);
1434
1435
		receiver_trx = thr_get_trx(sig->receiver);
1436
1437
		UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals,
1438
			       sig);
1439
		ut_ad(receiver_trx->sess->state != SESS_ERROR);
1440
1441
		que_thr_end_wait(sig->receiver, next_thr);
1442
1443
		sig->receiver = NULL;
1444
1445
	}
1446
}
1447
1448
/********************************************************************
1449
Removes a signal object from the trx signal queue. */
1450
1451
void
1452
trx_sig_remove(
1453
/*===========*/
1454
	trx_t*		trx,	/* in: trx handle */
1455
	trx_sig_t*	sig)	/* in, own: signal */
1456
{
1457
	ut_ad(trx && sig);
1458
	ut_ad(mutex_own(&kernel_mutex));
1459
1460
	ut_ad(sig->receiver == NULL);
1461
1462
	UT_LIST_REMOVE(signals, trx->signals, sig);
1463
	sig->type = 0;	/* reset the field to catch possible bugs */
1464
1465
	if (sig != &(trx->sig)) {
1466
		mem_free(sig);
1467
	}
1468
}
1469
1470
/*************************************************************************
1471
Creates a commit command node struct. */
1472
1473
commit_node_t*
1474
commit_node_create(
1475
/*===============*/
1476
				/* out, own: commit node struct */
1477
	mem_heap_t*	heap)	/* in: mem heap where created */
1478
{
1479
	commit_node_t*	node;
1480
1481
	node = mem_heap_alloc(heap, sizeof(commit_node_t));
1482
	node->common.type  = QUE_NODE_COMMIT;
1483
	node->state = COMMIT_NODE_SEND;
1484
1485
	return(node);
1486
}
1487
1488
/***************************************************************
1489
Performs an execution step for a commit type node in a query graph. */
1490
1491
que_thr_t*
1492
trx_commit_step(
1493
/*============*/
1494
				/* out: query thread to run next, or NULL */
1495
	que_thr_t*	thr)	/* in: query thread */
1496
{
1497
	commit_node_t*	node;
1498
	que_thr_t*	next_thr;
1499
1500
	node = thr->run_node;
1501
1502
	ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
1503
1504
	if (thr->prev_node == que_node_get_parent(node)) {
1505
		node->state = COMMIT_NODE_SEND;
1506
	}
1507
1508
	if (node->state == COMMIT_NODE_SEND) {
1509
		mutex_enter(&kernel_mutex);
1510
1511
		node->state = COMMIT_NODE_WAIT;
1512
1513
		next_thr = NULL;
1514
1515
		thr->state = QUE_THR_SIG_REPLY_WAIT;
1516
1517
		/* Send the commit signal to the transaction */
1518
1519
		trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF,
1520
			     thr, NULL, &next_thr);
1521
1522
		mutex_exit(&kernel_mutex);
1523
1524
		return(next_thr);
1525
	}
1526
1527
	ut_ad(node->state == COMMIT_NODE_WAIT);
1528
1529
	node->state = COMMIT_NODE_SEND;
1530
1531
	thr->run_node = que_node_get_parent(node);
1532
1533
	return(thr);
1534
}
1535
1536
/**************************************************************************
1537
Does the transaction commit for MySQL. */
1538
1539
ulint
1540
trx_commit_for_mysql(
1541
/*=================*/
1542
			/* out: 0 or error number */
1543
	trx_t*	trx)	/* in: trx handle */
1544
{
1545
	/* Because we do not do the commit by sending an Innobase
1546
	sig to the transaction, we must here make sure that trx has been
1547
	started. */
1548
1549
	ut_a(trx);
1550
1551
	trx->op_info = "committing";
1552
1553
	/* If we are doing the XA recovery of prepared transactions, then
1554
	the transaction object does not have an InnoDB session object, and we
1555
	set the dummy session that we use for all MySQL transactions. */
1556
1557
	if (trx->sess == NULL) {
1558
		/* Open a dummy session */
1559
1560
		if (!trx_dummy_sess) {
1561
			mutex_enter(&kernel_mutex);
1562
1563
			if (!trx_dummy_sess) {
1564
				trx_dummy_sess = sess_open();
1565
			}
1566
1567
			mutex_exit(&kernel_mutex);
1568
		}
1569
1570
		trx->sess = trx_dummy_sess;
1571
	}
1572
1573
	trx_start_if_not_started(trx);
1574
1575
	mutex_enter(&kernel_mutex);
1576
1577
	trx_commit_off_kernel(trx);
1578
1579
	mutex_exit(&kernel_mutex);
1580
1581
	trx->op_info = "";
1582
1583
	return(0);
1584
}
1585
1586
/**************************************************************************
1587
If required, flushes the log to disk if we called trx_commit_for_mysql()
1588
with trx->flush_log_later == TRUE. */
1589
1590
ulint
1591
trx_commit_complete_for_mysql(
1592
/*==========================*/
1593
			/* out: 0 or error number */
1594
	trx_t*	trx)	/* in: trx handle */
1595
{
1596
	dulint	lsn	= trx->commit_lsn;
1597
1598
	ut_a(trx);
1599
1600
	trx->op_info = "flushing log";
1601
1602
	if (!trx->must_flush_log_later) {
1603
		/* Do nothing */
1604
	} else if (srv_flush_log_at_trx_commit == 0) {
1605
		/* Do nothing */
1606
	} else if (srv_flush_log_at_trx_commit == 1) {
1607
		if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1608
			/* Write the log but do not flush it to disk */
1609
1610
			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1611
		} else {
1612
			/* Write the log to the log files AND flush them to
1613
			disk */
1614
1615
			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1616
		}
1617
	} else if (srv_flush_log_at_trx_commit == 2) {
1618
1619
		/* Write the log but do not flush it to disk */
1620
1621
		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1622
	} else {
1623
		ut_error;
1624
	}
1625
1626
	trx->must_flush_log_later = FALSE;
1627
1628
	trx->op_info = "";
1629
1630
	return(0);
1631
}
1632
1633
/**************************************************************************
1634
Marks the latest SQL statement ended. */
1635
1636
void
1637
trx_mark_sql_stat_end(
1638
/*==================*/
1639
	trx_t*	trx)	/* in: trx handle */
1640
{
1641
	ut_a(trx);
1642
1643
	if (trx->conc_state == TRX_NOT_STARTED) {
1644
		trx->undo_no = ut_dulint_zero;
1645
	}
1646
1647
	trx->last_sql_stat_start.least_undo_no = trx->undo_no;
1648
}
1649
1650
/**************************************************************************
1651
Prints info about a transaction to the given file. The caller must own the
1652
kernel mutex and must have called
1653
innobase_mysql_prepare_print_arbitrary_thd(), unless he knows that MySQL
1654
or InnoDB cannot meanwhile change the info printed here. */
1655
1656
void
1657
trx_print(
1658
/*======*/
1659
	FILE*	f,		/* in: output stream */
1660
	trx_t*	trx,		/* in: transaction */
1661
	ulint	max_query_len)	/* in: max query length to print, or 0 to
1662
				   use the default max length */
1663
{
1664
	ibool	newline;
1665
1666
	fprintf(f, "TRANSACTION %lu %lu",
1667
		(ulong) ut_dulint_get_high(trx->id),
1668
		(ulong) ut_dulint_get_low(trx->id));
1669
1670
	switch (trx->conc_state) {
1671
	case TRX_NOT_STARTED:
1672
		fputs(", not started", f);
1673
		break;
1674
	case TRX_ACTIVE:
1675
		fprintf(f, ", ACTIVE %lu sec",
1676
			(ulong)difftime(time(NULL), trx->start_time));
1677
		break;
1678
	case TRX_PREPARED:
1679
		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
1680
			(ulong)difftime(time(NULL), trx->start_time));
1681
		break;
1682
	case TRX_COMMITTED_IN_MEMORY:
1683
		fputs(", COMMITTED IN MEMORY", f);
1684
		break;
1685
	default:
1686
		fprintf(f, " state %lu", (ulong) trx->conc_state);
1687
	}
1688
1689
#ifdef UNIV_LINUX
1690
	fprintf(f, ", process no %lu", trx->mysql_process_no);
1691
#endif
1692
	fprintf(f, ", OS thread id %lu",
1693
		(ulong) os_thread_pf(trx->mysql_thread_id));
1694
1695
	if (*trx->op_info) {
1696
		putc(' ', f);
1697
		fputs(trx->op_info, f);
1698
	}
1699
1700
	if (trx->is_purge) {
1701
		fputs(" purge trx", f);
1702
	}
1703
1704
	if (trx->declared_to_be_inside_innodb) {
1705
		fprintf(f, ", thread declared inside InnoDB %lu",
1706
			(ulong) trx->n_tickets_to_enter_innodb);
1707
	}
1708
1709
	putc('\n', f);
1710
1711
	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
1712
		fprintf(f, "mysql tables in use %lu, locked %lu\n",
1713
			(ulong) trx->n_mysql_tables_in_use,
1714
			(ulong) trx->mysql_n_tables_locked);
1715
	}
1716
1717
	newline = TRUE;
1718
1719
	switch (trx->que_state) {
1720
	case TRX_QUE_RUNNING:
1721
		newline = FALSE; break;
1722
	case TRX_QUE_LOCK_WAIT:
1723
		fputs("LOCK WAIT ", f); break;
1724
	case TRX_QUE_ROLLING_BACK:
1725
		fputs("ROLLING BACK ", f); break;
1726
	case TRX_QUE_COMMITTING:
1727
		fputs("COMMITTING ", f); break;
1728
	default:
1729
		fprintf(f, "que state %lu ", (ulong) trx->que_state);
1730
	}
1731
1732
	if (0 < UT_LIST_GET_LEN(trx->trx_locks)
1733
	    || mem_heap_get_size(trx->lock_heap) > 400) {
1734
		newline = TRUE;
1735
1736
		fprintf(f, "%lu lock struct(s), heap size %lu,"
1737
			" %lu row lock(s)",
1738
			(ulong) UT_LIST_GET_LEN(trx->trx_locks),
1739
			(ulong) mem_heap_get_size(trx->lock_heap),
1740
			(ulong) lock_number_of_rows_locked(trx));
1741
	}
1742
1743
	if (trx->has_search_latch) {
1744
		newline = TRUE;
1745
		fputs(", holds adaptive hash latch", f);
1746
	}
1747
1748
	if (ut_dulint_cmp(trx->undo_no, ut_dulint_zero) != 0) {
1749
		newline = TRUE;
1750
		fprintf(f, ", undo log entries %lu",
1751
			(ulong) ut_dulint_get_low(trx->undo_no));
1752
	}
1753
1754
	if (newline) {
1755
		putc('\n', f);
1756
	}
1757
1758
	if (trx->mysql_thd != NULL) {
1759
		innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
1760
	}
1761
}
1762
1763
/***********************************************************************
1764
Compares the "weight" (or size) of two transactions. The weight of one
1765
transaction is estimated as the number of altered rows + the number of
1766
locked rows. Transactions that have edited non-transactional tables are
1767
considered heavier than ones that have not. */
1768
1769
int
1770
trx_weight_cmp(
1771
/*===========*/
1772
			/* out: <0, 0 or >0; similar to strcmp(3) */
1773
	trx_t*	a,	/* in: the first transaction to be compared */
1774
	trx_t*	b)	/* in: the second transaction to be compared */
1775
{
1776
	ibool	a_notrans_edit;
1777
	ibool	b_notrans_edit;
1778
1779
	/* If mysql_thd is NULL for a transaction we assume that it has
1780
	not edited non-transactional tables. */
1781
1782
	a_notrans_edit = a->mysql_thd != NULL
1783
	    && thd_has_edited_nontrans_tables(a->mysql_thd);
1784
1785
	b_notrans_edit = b->mysql_thd != NULL
1786
	    && thd_has_edited_nontrans_tables(b->mysql_thd);
1787
1788
	if (a_notrans_edit && !b_notrans_edit) {
1789
1790
		return(1);
1791
	}
1792
1793
	if (!a_notrans_edit && b_notrans_edit) {
1794
1795
		return(-1);
1796
	}
1797
1798
	/* Either both had edited non-transactional tables or both had
1799
	not, we fall back to comparing the number of altered/locked
1800
	rows. */
1801
1802
#if 0
1803
	fprintf(stderr,
1804
		"%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
1805
		__func__,
1806
		ut_conv_dulint_to_longlong(a->undo_no),
1807
		UT_LIST_GET_LEN(a->trx_locks),
1808
		ut_conv_dulint_to_longlong(b->undo_no),
1809
		UT_LIST_GET_LEN(b->trx_locks));
1810
#endif
1811
1812
#define TRX_WEIGHT(t)	\
1813
	ut_dulint_add((t)->undo_no, UT_LIST_GET_LEN((t)->trx_locks))
1814
1815
	return(ut_dulint_cmp(TRX_WEIGHT(a), TRX_WEIGHT(b)));
1816
}
1817
1818
/********************************************************************
1819
Prepares a transaction. */
1820
1821
void
1822
trx_prepare_off_kernel(
1823
/*===================*/
1824
	trx_t*	trx)	/* in: transaction */
1825
{
1826
	page_t*		update_hdr_page;
1827
	trx_rseg_t*	rseg;
1828
	ibool		must_flush_log	= FALSE;
1829
	dulint		lsn;
1830
	mtr_t		mtr;
1831
1832
	ut_ad(mutex_own(&kernel_mutex));
1833
1834
	rseg = trx->rseg;
1835
1836
	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
1837
1838
		mutex_exit(&kernel_mutex);
1839
1840
		mtr_start(&mtr);
1841
1842
		must_flush_log = TRUE;
1843
1844
		/* Change the undo log segment states from TRX_UNDO_ACTIVE
1845
		to TRX_UNDO_PREPARED: these modifications to the file data
1846
		structure define the transaction as prepared in the
1847
		file-based world, at the serialization point of lsn. */
1848
1849
		mutex_enter(&(rseg->mutex));
1850
1851
		if (trx->insert_undo != NULL) {
1852
1853
			/* It is not necessary to obtain trx->undo_mutex here
1854
			because only a single OS thread is allowed to do the
1855
			transaction prepare for this transaction. */
1856
1857
			trx_undo_set_state_at_prepare(trx, trx->insert_undo,
1858
						      &mtr);
1859
		}
1860
1861
		if (trx->update_undo) {
1862
			update_hdr_page = trx_undo_set_state_at_prepare(
1863
				trx, trx->update_undo, &mtr);
1864
		}
1865
1866
		mutex_exit(&(rseg->mutex));
1867
1868
		/*--------------*/
1869
		mtr_commit(&mtr);	/* This mtr commit makes the
1870
					transaction prepared in the file-based
1871
					world */
1872
		/*--------------*/
1873
		lsn = mtr.end_lsn;
1874
1875
		mutex_enter(&kernel_mutex);
1876
	}
1877
1878
	ut_ad(mutex_own(&kernel_mutex));
1879
1880
	/*--------------------------------------*/
1881
	trx->conc_state = TRX_PREPARED;
1882
	/*--------------------------------------*/
1883
1884
	if (must_flush_log) {
1885
		/* Depending on the my.cnf options, we may now write the log
1886
		buffer to the log files, making the prepared state of the
1887
		transaction durable if the OS does not crash. We may also
1888
		flush the log files to disk, making the prepared state of the
1889
		transaction durable also at an OS crash or a power outage.
1890
1891
		The idea in InnoDB's group prepare is that a group of
1892
		transactions gather behind a trx doing a physical disk write
1893
		to log files, and when that physical write has been completed,
1894
		one of those transactions does a write which prepares the whole
1895
		group. Note that this group prepare will only bring benefit if
1896
		there are > 2 users in the database. Then at least 2 users can
1897
		gather behind one doing the physical log write to disk.
1898
1899
		TODO: find out if MySQL holds some mutex when calling this.
1900
		That would spoil our group prepare algorithm. */
1901
1902
		mutex_exit(&kernel_mutex);
1903
1904
		if (srv_flush_log_at_trx_commit == 0) {
1905
			/* Do nothing */
1906
		} else if (srv_flush_log_at_trx_commit == 1) {
1907
			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1908
				/* Write the log but do not flush it to disk */
1909
1910
				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
1911
						FALSE);
1912
			} else {
1913
				/* Write the log to the log files AND flush
1914
				them to disk */
1915
1916
				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1917
			}
1918
		} else if (srv_flush_log_at_trx_commit == 2) {
1919
1920
			/* Write the log but do not flush it to disk */
1921
1922
			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1923
		} else {
1924
			ut_error;
1925
		}
1926
1927
		mutex_enter(&kernel_mutex);
1928
	}
1929
}
1930
1931
/**************************************************************************
1932
Does the transaction prepare for MySQL. */
1933
1934
ulint
1935
trx_prepare_for_mysql(
1936
/*==================*/
1937
			/* out: 0 or error number */
1938
	trx_t*	trx)	/* in: trx handle */
1939
{
1940
	/* Because we do not do the prepare by sending an Innobase
1941
	sig to the transaction, we must here make sure that trx has been
1942
	started. */
1943
1944
	ut_a(trx);
1945
1946
	trx->op_info = "preparing";
1947
1948
	trx_start_if_not_started(trx);
1949
1950
	mutex_enter(&kernel_mutex);
1951
1952
	trx_prepare_off_kernel(trx);
1953
1954
	mutex_exit(&kernel_mutex);
1955
1956
	trx->op_info = "";
1957
1958
	return(0);
1959
}
1960
1961
/**************************************************************************
1962
This function is used to find number of prepared transactions and
1963
their transaction objects for a recovery. */
1964
1965
int
1966
trx_recover_for_mysql(
1967
/*==================*/
1968
				/* out: number of prepared transactions
1969
				stored in xid_list */
1970
	XID*	xid_list,	/* in/out: prepared transactions */
1971
	ulint	len)		/* in: number of slots in xid_list */
1972
{
1973
	trx_t*	trx;
1974
	ulint	count = 0;
1975
1976
	ut_ad(xid_list);
1977
	ut_ad(len);
1978
1979
	/* We should set those transactions which are in the prepared state
1980
	to the xid_list */
1981
1982
	mutex_enter(&kernel_mutex);
1983
1984
	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
1985
1986
	while (trx) {
1987
		if (trx->conc_state == TRX_PREPARED) {
1988
			xid_list[count] = trx->xid;
1989
1990
			if (count == 0) {
1991
				ut_print_timestamp(stderr);
1992
				fprintf(stderr,
1993
					"  InnoDB: Starting recovery for"
1994
					" XA transactions...\n");
1995
			}
1996
1997
			ut_print_timestamp(stderr);
1998
			fprintf(stderr,
1999
				"  InnoDB: Transaction %lu %lu in"
2000
				" prepared state after recovery\n",
2001
				(ulong) ut_dulint_get_high(trx->id),
2002
				(ulong) ut_dulint_get_low(trx->id));
2003
2004
			ut_print_timestamp(stderr);
2005
			fprintf(stderr,
2006
				"  InnoDB: Transaction contains changes"
2007
				" to %lu rows\n",
2008
				(ulong) ut_conv_dulint_to_longlong(
2009
					trx->undo_no));
2010
2011
			count++;
2012
2013
			if (count == len) {
2014
				break;
2015
			}
2016
		}
2017
2018
		trx = UT_LIST_GET_NEXT(trx_list, trx);
2019
	}
2020
2021
	mutex_exit(&kernel_mutex);
2022
2023
	if (count > 0){
2024
		ut_print_timestamp(stderr);
2025
		fprintf(stderr,
2026
			"  InnoDB: %lu transactions in prepared state"
2027
			" after recovery\n",
2028
			(ulong) count);
2029
	}
2030
2031
	return ((int) count);
2032
}
2033
2034
/***********************************************************************
2035
This function is used to find one X/Open XA distributed transaction
2036
which is in the prepared state */
2037
2038
trx_t*
2039
trx_get_trx_by_xid(
2040
/*===============*/
2041
			/* out: trx or NULL */
2042
	XID*	xid)	/* in: X/Open XA transaction identification */
2043
{
2044
	trx_t*	trx;
2045
2046
	if (xid == NULL) {
2047
2048
		return (NULL);
2049
	}
2050
2051
	mutex_enter(&kernel_mutex);
2052
2053
	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
2054
2055
	while (trx) {
2056
		/* Compare two X/Open XA transaction id's: their
2057
		length should be the same and binary comparison
2058
		of gtrid_lenght+bqual_length bytes should be
2059
		the same */
2060
2061
		if (xid->gtrid_length == trx->xid.gtrid_length
2062
		    && xid->bqual_length == trx->xid.bqual_length
2063
		    && memcmp(xid->data, trx->xid.data,
2064
			      xid->gtrid_length + xid->bqual_length) == 0) {
2065
			break;
2066
		}
2067
2068
		trx = UT_LIST_GET_NEXT(trx_list, trx);
2069
	}
2070
2071
	mutex_exit(&kernel_mutex);
2072
2073
	if (trx) {
2074
		if (trx->conc_state != TRX_PREPARED) {
2075
2076
			return(NULL);
2077
		}
2078
2079
		return(trx);
2080
	} else {
2081
		return(NULL);
2082
	}
2083
}