1
/******************************************************
2
The wait array used in synchronization primitives
6
Created 9/5/1995 Heikki Tuuri
7
*******************************************************/
11
#include "sync0arr.ic"
14
#include "sync0sync.h"
24
The wait array consists of cells each of which has an
25
operating system event object created for it. The threads
26
waiting for a mutex, for example, can reserve a cell
27
in the array and suspend themselves to wait for the event
28
to become signaled. When using the wait array, remember to make
29
sure that some thread holding the synchronization object
30
will eventually know that there is a waiter in the array and
31
signal the object, to prevent infinite wait.
32
Why did we choose to implement a wait array? First, to make
33
mutexes fast, we had to code our own implementation of them,
34
which only in usually uncommon cases resorts to using
35
slow operating system primitives. Then we had the choice of
36
assigning a unique OS event for each mutex, which would
37
be simpler, or using a global wait array. In some operating systems,
38
the global wait array solution is more efficient and flexible,
39
because we can do with a very small number of OS events,
40
say 200. In NT 3.51, allocating events seems to be a quadratic
41
algorithm, because 10 000 events are created fast, but
42
100 000 events take a couple of minutes to create.
44
As of 5.0.30 the above mentioned design is changed. Since now
45
OS can handle millions of wait events efficiently, we no longer
46
have this concept of each cell of wait array having one event.
47
Instead, now the event that a thread wants to wait on is embedded
48
in the wait object (mutex or rw_lock). We still keep the global
49
wait array for the sake of diagnostics and also to avoid infinite
50
wait. The error_monitor thread scans the global wait array to signal
51
any waiting threads who have missed the signal. */
53
/* A cell where an individual thread may wait suspended
54
until a resource is released. The suspending is implemented
55
using an operating system event semaphore. */
56
struct sync_cell_struct {
57
void* wait_object; /* pointer to the object the
58
thread is waiting for; if NULL
59
the cell is free for use */
60
mutex_t* old_wait_mutex; /* the latest wait mutex in cell */
61
rw_lock_t* old_wait_rw_lock;/* the latest wait rw-lock in cell */
62
ulint request_type; /* lock type requested on the
64
const char* file; /* in debug version file where
66
ulint line; /* in debug version line where
68
os_thread_id_t thread; /* thread id of this waiting
70
ibool waiting; /* TRUE if the thread has already
71
called sync_array_event_wait
73
ib_int64_t signal_count; /* We capture the signal_count
74
of the wait_object when we
75
reset the event. This value is
76
then passed on to os_event_wait
77
and we wait only if the event
78
has not been signalled in the
79
period between the reset and
81
time_t reservation_time;/* time when the thread reserved
85
/* NOTE: It is allowed for a thread to wait
86
for an event allocated for the array without owning the
87
protecting mutex (depending on the case: OS or database mutex), but
88
all changes (set or reset) to the state of the event must be made
89
while owning the mutex. */
90
struct sync_array_struct {
91
ulint n_reserved; /* number of currently reserved
92
cells in the wait array */
93
ulint n_cells; /* number of cells in the
95
sync_cell_t* array; /* pointer to wait array */
96
ulint protection; /* this flag tells which
97
mutex protects the data */
98
mutex_t mutex; /* possible database mutex
99
protecting this data structure */
100
os_mutex_t os_mutex; /* Possible operating system mutex
101
protecting the data structure.
102
As this data structure is used in
103
constructing the database mutex,
104
to prevent infinite recursion
105
in implementation, we fall back to
107
ulint sg_count; /* count of how many times an
108
object has been signalled */
109
ulint res_count; /* count of cell reservations
110
since creation of the array */
113
#ifdef UNIV_SYNC_DEBUG
114
/**********************************************************************
115
This function is called only in the debug version. Detects a deadlock
116
of one or more threads because of waits of semaphores. */
119
sync_array_detect_deadlock(
120
/*=======================*/
121
/* out: TRUE if deadlock detected */
122
sync_array_t* arr, /* in: wait array; NOTE! the caller must
123
own the mutex to array */
124
sync_cell_t* start, /* in: cell where recursive search started */
125
sync_cell_t* cell, /* in: cell to search */
126
ulint depth); /* in: recursion depth */
127
#endif /* UNIV_SYNC_DEBUG */
129
/*********************************************************************
130
Gets the nth cell in array. */
133
sync_array_get_nth_cell(
134
/*====================*/
136
sync_array_t* arr, /* in: sync array */
137
ulint n) /* in: index */
140
/* n must be a valid index: the assertion fails when n >= arr->n_cells. */
ut_a(n < arr->n_cells);
142
/* The result is a pointer into arr->array, n cells past its start. */
return(arr->array + n);
145
/**********************************************************************
146
Reserves the mutex semaphore protecting a sync array. */
151
sync_array_t* arr) /* in: sync wait array */
155
protection = arr->protection;
157
if (protection == SYNC_ARRAY_OS_MUTEX) {
158
os_mutex_enter(arr->os_mutex);
159
} else if (protection == SYNC_ARRAY_MUTEX) {
160
mutex_enter(&(arr->mutex));
166
/**********************************************************************
167
Releases the mutex semaphore protecting a sync array. */
172
sync_array_t* arr) /* in: sync wait array */
176
protection = arr->protection;
178
if (protection == SYNC_ARRAY_OS_MUTEX) {
179
os_mutex_exit(arr->os_mutex);
180
} else if (protection == SYNC_ARRAY_MUTEX) {
181
mutex_exit(&(arr->mutex));
187
/***********************************************************************
188
Creates a synchronization wait array. It is protected by a mutex
189
which is automatically reserved when the functions operating on it
195
/* out, own: created wait array */
196
ulint n_cells, /* in: number of cells in the array
198
ulint protection) /* in: either SYNC_ARRAY_OS_MUTEX or
199
SYNC_ARRAY_MUTEX: determines the type
200
of mutex protecting the data structure */
203
sync_cell_t* cell_array;
209
/* Allocate memory for the data structures */
210
arr = ut_malloc(sizeof(sync_array_t));
212
cell_array = ut_malloc(sizeof(sync_cell_t) * n_cells);
214
arr->n_cells = n_cells;
216
arr->array = cell_array;
217
arr->protection = protection;
221
/* Then create the mutex to protect the wait array complex */
222
if (protection == SYNC_ARRAY_OS_MUTEX) {
223
arr->os_mutex = os_mutex_create(NULL);
224
} else if (protection == SYNC_ARRAY_MUTEX) {
225
mutex_create(&arr->mutex, SYNC_NO_ORDER_CHECK);
230
for (i = 0; i < n_cells; i++) {
231
cell = sync_array_get_nth_cell(arr, i);
232
cell->wait_object = NULL;
233
cell->waiting = FALSE;
234
cell->signal_count = 0;
240
/**********************************************************************
241
Frees the resources in a wait array. */
246
sync_array_t* arr) /* in, own: sync wait array */
250
ut_a(arr->n_reserved == 0);
252
sync_array_validate(arr);
254
protection = arr->protection;
256
/* Release the mutex protecting the wait array complex */
258
if (protection == SYNC_ARRAY_OS_MUTEX) {
259
os_mutex_free(arr->os_mutex);
260
} else if (protection == SYNC_ARRAY_MUTEX) {
261
mutex_free(&(arr->mutex));
270
/************************************************************************
271
Validates the integrity of the wait array. Checks
272
that the number of reserved cells equals the count variable. */
277
sync_array_t* arr) /* in: sync wait array */
283
sync_array_enter(arr);
285
for (i = 0; i < arr->n_cells; i++) {
286
cell = sync_array_get_nth_cell(arr, i);
287
if (cell->wait_object != NULL) {
292
ut_a(count == arr->n_reserved);
294
sync_array_exit(arr);
297
/***********************************************************************
298
Puts the cell event in reset state. */
301
sync_cell_event_reset(
302
/*==================*/
303
/* out: value of signal_count
304
at the time of reset. */
305
ulint type, /* in: lock type mutex/rw_lock */
306
void* object) /* in: the rw_lock/mutex object */
308
/* object is cast according to type, and whatever os_event_reset()
returns for the selected event is handed back to the caller. */
if (type == SYNC_MUTEX) {
309
return(os_event_reset(((mutex_t *) object)->event));
311
/* A wait-exclusive request uses the rw-lock's separate wait_ex_event. */
} else if (type == RW_LOCK_WAIT_EX) {
312
return(os_event_reset(
313
((rw_lock_t *) object)->wait_ex_event));
316
/* Remaining request types: reset the ordinary event of the rw-lock. */
return(os_event_reset(((rw_lock_t *) object)->event));
320
/**********************************************************************
321
Reserves a wait array cell for waiting for an object.
322
The event of the cell is reset to nonsignalled state. */
325
sync_array_reserve_cell(
326
/*====================*/
327
sync_array_t* arr, /* in: wait array */
328
void* object, /* in: pointer to the object to wait for */
329
ulint type, /* in: lock request type */
330
const char* file, /* in: file where requested */
331
ulint line, /* in: line where requested */
332
ulint* index) /* out: index of the reserved cell */
340
/* The scan for a free cell runs while the array mutex is held. */
sync_array_enter(arr);
344
/* Reserve a new cell. */
345
for (i = 0; i < arr->n_cells; i++) {
346
cell = sync_array_get_nth_cell(arr, i);
348
/* A cell whose wait_object is NULL is free and can be taken. */
if (cell->wait_object == NULL) {
350
cell->waiting = FALSE;
351
cell->wait_object = object;
353
/* The object is also remembered in a type-specific field;
sync_array_cell_print() reads old_wait_mutex / old_wait_rw_lock
rather than wait_object. */
if (type == SYNC_MUTEX) {
354
cell->old_wait_mutex = object;
356
cell->old_wait_rw_lock = object;
359
cell->request_type = type;
368
sync_array_exit(arr);
370
/* Make sure the event is reset and also store
371
the value of signal_count at which the event
373
cell->signal_count = sync_cell_event_reset(type,
376
cell->reservation_time = time(NULL);
378
cell->thread = os_thread_get_curr_id();
384
ut_error; /* No free cell found */
389
/**********************************************************************
390
This function should be called when a thread starts to wait on
391
a wait array cell. In the debug version this function checks
392
if the wait for a semaphore will result in a deadlock, in which
393
case prints info and asserts. */
396
sync_array_wait_event(
397
/*==================*/
398
sync_array_t* arr, /* in: wait array */
399
ulint index) /* in: index of the reserved cell */
406
sync_array_enter(arr);
408
cell = sync_array_get_nth_cell(arr, index);
410
/* The cell must be reserved and not yet marked as waiting; the ut_ad
check additionally requires that it was recorded for the calling thread. */
ut_a(cell->wait_object);
411
ut_a(!cell->waiting);
412
ut_ad(os_thread_get_curr_id() == cell->thread);
414
/* Pick the event embedded in the waited-for object, according to the
request type stored in the cell. */
if (cell->request_type == SYNC_MUTEX) {
415
event = ((mutex_t*) cell->wait_object)->event;
417
/* On windows if the thread about to wait is the one which
418
has set the state of the rw_lock to RW_LOCK_WAIT_EX, then
419
it waits on a special event i.e.: wait_ex_event. */
420
} else if (cell->request_type == RW_LOCK_WAIT_EX) {
421
event = ((rw_lock_t*) cell->wait_object)->wait_ex_event;
424
event = ((rw_lock_t*) cell->wait_object)->event;
427
/* From here on the cell is flagged as having a waiting thread. */
cell->waiting = TRUE;
429
#ifdef UNIV_SYNC_DEBUG
431
/* We use simple enter to the mutex below, because if
432
we cannot acquire it at once, mutex_enter would call
433
recursively sync_array routines, leading to trouble.
434
rw_lock_debug_mutex freezes the debug lists. */
436
rw_lock_debug_mutex_enter();
438
/* The search starts at this thread's own cell (passed as both start and
cell) with recursion depth 0. */
if (TRUE == sync_array_detect_deadlock(arr, cell, cell, 0)) {
440
fputs("########################################\n", stderr);
444
rw_lock_debug_mutex_exit();
446
/* The array mutex is released before the thread blocks on the event. */
sync_array_exit(arr);
448
/* cell->signal_count is the value stored for this cell when its event
was reset; it is passed to the wait call together with the event. */
os_event_wait_low(event, cell->signal_count);
450
/* After the wait returns, the cell is given back to the array. */
sync_array_free_cell(arr, index);
453
/**********************************************************************
454
Reports info of a wait array cell. */
457
sync_array_cell_print(
458
/*==================*/
459
FILE* file, /* in: file where to print */
460
sync_cell_t* cell) /* in: sync cell */
466
/* The request type decides below how the waited-for object is described. */
type = cell->request_type;
469
"--Thread %lu has waited at %s line %lu"
470
" for %.2f seconds the semaphore:\n",
471
(ulong) os_thread_pf(cell->thread), cell->file,
473
difftime(time(NULL), cell->reservation_time));
475
/* Mutex wait: report creation site, lock word and waiters flag. */
if (type == SYNC_MUTEX) {
476
/* We use old_wait_mutex in case the cell has already
477
been freed meanwhile */
478
mutex = cell->old_wait_mutex;
481
"Mutex at %p created file %s line %lu, lock var %lu\n"
482
#ifdef UNIV_SYNC_DEBUG
483
"Last time reserved in file %s line %lu, "
484
#endif /* UNIV_SYNC_DEBUG */
485
"waiters flag %lu\n",
486
(void*) mutex, mutex->cfile_name, (ulong) mutex->cline,
487
(ulong) mutex->lock_word,
488
#ifdef UNIV_SYNC_DEBUG
489
mutex->file_name, (ulong) mutex->line,
490
#endif /* UNIV_SYNC_DEBUG */
491
(ulong) mutex->waiters);
493
/* Rw-lock wait (exclusive, wait-exclusive or shared request). */
} else if (type == RW_LOCK_EX
495
|| type == RW_LOCK_WAIT_EX
497
|| type == RW_LOCK_SHARED) {
499
/* NOTE(review): only RW_LOCK_EX is labelled "X-lock on"; a
RW_LOCK_WAIT_EX request falls into the other arm and is printed as
"S-lock on", the same label as RW_LOCK_SHARED. */
fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file);
501
/* As in the mutex case, the old_wait_* copy of the pointer is used. */
rwlock = cell->old_wait_rw_lock;
504
" RW-latch at %p created in file %s line %lu\n",
505
(void*) rwlock, rwlock->cfile_name,
506
(ulong) rwlock->cline);
507
/* Writer details are printed only when the writer state of the lock is
something other than RW_LOCK_NOT_LOCKED. */
if (rwlock->writer != RW_LOCK_NOT_LOCKED) {
509
"a writer (thread id %lu) has"
510
" reserved it in mode %s",
511
(ulong) os_thread_pf(rwlock->writer_thread),
512
rwlock->writer == RW_LOCK_EX
514
: " wait exclusive\n");
518
"number of readers %lu, waiters flag %lu\n"
519
"Last time read locked in file %s line %lu\n"
520
"Last time write locked in file %s line %lu\n",
521
(ulong) rwlock->reader_count,
522
(ulong) rwlock->waiters,
523
rwlock->last_s_file_name,
524
(ulong) rwlock->last_s_line,
525
rwlock->last_x_file_name,
526
(ulong) rwlock->last_x_line);
531
/* A cell whose waiting flag is clear is reported as a finished wait. */
if (!cell->waiting) {
532
fputs("wait has ended\n", file);
536
#ifdef UNIV_SYNC_DEBUG
537
/**********************************************************************
538
Looks for a cell with the given thread id. */
541
sync_array_find_thread(
542
/*===================*/
543
/* out: pointer to cell or NULL
545
sync_array_t* arr, /* in: wait array */
546
os_thread_id_t thread) /* in: thread id */
551
for (i = 0; i < arr->n_cells; i++) {
553
cell = sync_array_get_nth_cell(arr, i);
555
if (cell->wait_object != NULL
556
&& os_thread_eq(cell->thread, thread)) {
558
return(cell); /* Found */
562
return(NULL); /* Not found */
565
/**********************************************************************
566
Recursion step for deadlock detection. */
569
sync_array_deadlock_step(
570
/*=====================*/
571
/* out: TRUE if deadlock detected */
572
sync_array_t* arr, /* in: wait array; NOTE! the caller must
573
own the mutex to array */
574
sync_cell_t* start, /* in: cell where recursive search
576
os_thread_id_t thread, /* in: thread to look at */
577
ulint pass, /* in: pass value */
578
ulint depth) /* in: recursion depth */
586
/* If pass != 0, then we do not know which threads are
587
responsible of releasing the lock, and no deadlock can
593
new = sync_array_find_thread(arr, thread);
596
/* Stop running of other threads */
598
ut_dbg_stop_threads = TRUE;
601
fputs("########################################\n"
602
"DEADLOCK of threads detected!\n", stderr);
607
ret = sync_array_detect_deadlock(arr, start, new, depth);
616
/**********************************************************************
617
This function is called only in the debug version. Detects a deadlock
618
of one or more threads because of waits of semaphores. */
621
sync_array_detect_deadlock(
622
/*=======================*/
623
/* out: TRUE if deadlock detected */
624
sync_array_t* arr, /* in: wait array; NOTE! the caller must
625
own the mutex to array */
626
sync_cell_t* start, /* in: cell where recursive search started */
627
sync_cell_t* cell, /* in: cell to search */
628
ulint depth) /* in: recursion depth */
632
os_thread_id_t thread;
634
rw_lock_debug_t*debug;
639
ut_ad(cell->wait_object);
640
ut_ad(os_thread_get_curr_id() == start->thread);
645
if (!cell->waiting) {
647
return(FALSE); /* No deadlock here */
650
if (cell->request_type == SYNC_MUTEX) {
652
mutex = cell->wait_object;
654
if (mutex_get_lock_word(mutex) != 0) {
656
thread = mutex->thread_id;
658
/* Note that mutex->thread_id above may be
659
also OS_THREAD_ID_UNDEFINED, because the
660
thread which held the mutex maybe has not
661
yet updated the value, or it has already
662
released the mutex: in this case no deadlock
663
can occur, as the wait array cannot contain
664
a thread with ID_UNDEFINED value. */
666
ret = sync_array_deadlock_step(arr, start, thread, 0,
670
"Mutex %p owned by thread %lu file %s line %lu\n",
671
mutex, (ulong) os_thread_pf(mutex->thread_id),
672
mutex->file_name, (ulong) mutex->line);
673
sync_array_cell_print(stderr, cell);
679
return(FALSE); /* No deadlock */
681
} else if (cell->request_type == RW_LOCK_EX
682
|| cell->request_type == RW_LOCK_WAIT_EX) {
684
lock = cell->wait_object;
686
debug = UT_LIST_GET_FIRST(lock->debug_list);
688
while (debug != NULL) {
690
thread = debug->thread_id;
692
if (((debug->lock_type == RW_LOCK_EX)
693
&& !os_thread_eq(thread, cell->thread))
694
|| ((debug->lock_type == RW_LOCK_WAIT_EX)
695
&& !os_thread_eq(thread, cell->thread))
696
|| (debug->lock_type == RW_LOCK_SHARED)) {
698
/* The (wait) x-lock request can block
699
infinitely only if someone (can be also cell
700
thread) is holding s-lock, or someone
701
(cannot be cell thread) (wait) x-lock, and
702
he is blocked by start thread */
704
ret = sync_array_deadlock_step(
705
arr, start, thread, debug->pass,
709
fprintf(stderr, "rw-lock %p ",
711
sync_array_cell_print(stderr, cell);
712
rw_lock_debug_print(debug);
717
debug = UT_LIST_GET_NEXT(list, debug);
722
} else if (cell->request_type == RW_LOCK_SHARED) {
724
lock = cell->wait_object;
725
debug = UT_LIST_GET_FIRST(lock->debug_list);
727
while (debug != NULL) {
729
thread = debug->thread_id;
731
if ((debug->lock_type == RW_LOCK_EX)
732
|| (debug->lock_type == RW_LOCK_WAIT_EX)) {
734
/* The s-lock request can block infinitely
735
only if someone (can also be cell thread) is
736
holding (wait) x-lock, and he is blocked by
739
ret = sync_array_deadlock_step(
740
arr, start, thread, debug->pass,
747
debug = UT_LIST_GET_NEXT(list, debug);
756
return(TRUE); /* Execution never reaches this line: for compiler
759
#endif /* UNIV_SYNC_DEBUG */
761
/**********************************************************************
762
Determines if we can wake up the thread waiting for a semaphore. */
765
sync_arr_cell_can_wake_up(
766
/*======================*/
767
sync_cell_t* cell) /* in: cell to search */
772
/* Mutex request: the test is whether the mutex lock word is 0. */
if (cell->request_type == SYNC_MUTEX) {
774
mutex = cell->wait_object;
776
if (mutex_get_lock_word(mutex) == 0) {
781
/* Exclusive or wait-exclusive rw-lock request: two separate tests. */
} else if (cell->request_type == RW_LOCK_EX
782
|| cell->request_type == RW_LOCK_WAIT_EX) {
784
lock = cell->wait_object;
786
/* Test 1: the lock has no readers and no writer at all. */
if (rw_lock_get_reader_count(lock) == 0
787
&& rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) {
792
/* Test 2: the lock has no readers and its writer state is
RW_LOCK_WAIT_EX, set by the same thread that is recorded in this cell. */
if (rw_lock_get_reader_count(lock) == 0
793
&& rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX
794
&& os_thread_eq(lock->writer_thread, cell->thread)) {
799
/* Shared rw-lock request: the test is that the lock has no writer. */
} else if (cell->request_type == RW_LOCK_SHARED) {
800
lock = cell->wait_object;
802
if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) {
811
/**********************************************************************
812
Frees the cell. NOTE! sync_array_wait_event frees the cell
816
sync_array_free_cell(
817
/*=================*/
818
sync_array_t* arr, /* in: wait array */
819
ulint index) /* in: index of the cell in array */
823
/* The cell is modified while the array mutex is held. */
sync_array_enter(arr);
825
cell = sync_array_get_nth_cell(arr, index);
827
/* The cell must currently be reserved, i.e. have a non-NULL wait_object. */
ut_a(cell->wait_object != NULL);
829
/* Put the cell back into its free state; a NULL wait_object is what
marks a cell as free for use. */
cell->waiting = FALSE;
830
cell->wait_object = NULL;
831
cell->signal_count = 0;
833
/* The reserved-cell counter of the array must still be positive here. */
ut_a(arr->n_reserved > 0);
836
sync_array_exit(arr);
839
/**************************************************************************
840
Increments the signalled count. */
843
sync_array_object_signalled(
844
/*========================*/
845
sync_array_t* arr) /* in: wait array */
847
sync_array_enter(arr);
851
sync_array_exit(arr);
854
/**************************************************************************
855
If the wakeup algorithm does not work perfectly at semaphore releases,
856
this function will do the waking (see the comment in mutex_exit). This
857
function should be called about every 1 second in the server.
859
Note that there's a race condition between this thread and mutex_exit
860
changing the lock_word and calling signal_object, so sometimes this finds
861
threads to wake up even when nothing has gone wrong. */
864
sync_arr_wake_threads_if_sema_free(void)
865
/*====================================*/
867
/* The function works on the primary wait array. */
sync_array_t* arr = sync_primary_wait_array;
872
/* The whole scan is done while holding the array mutex. */
sync_array_enter(arr);
877
/* The loop condition compares count with the number of reserved cells. */
while (count < arr->n_reserved) {
879
cell = sync_array_get_nth_cell(arr, i);
881
/* Only reserved cells (non-NULL wait_object) are examined. */
if (cell->wait_object != NULL) {
885
if (sync_arr_cell_can_wake_up(cell)) {
887
/* Set the event that corresponds to the request type of the
cell: the mutex event, the rw-lock's wait_ex_event, or the
rw-lock's ordinary event. */
if (cell->request_type == SYNC_MUTEX) {
890
mutex = cell->wait_object;
891
os_event_set(mutex->event);
893
} else if (cell->request_type
894
== RW_LOCK_WAIT_EX) {
897
lock = cell->wait_object;
898
os_event_set(lock->wait_ex_event);
903
lock = cell->wait_object;
904
os_event_set(lock->event);
912
sync_array_exit(arr);
915
/**************************************************************************
916
Prints warnings of long semaphore waits to stderr. */
919
sync_array_print_long_waits(void)
920
/*=============================*/
921
/* out: TRUE if fatal semaphore wait threshold
926
ibool noticed = FALSE;
928
ulint fatal_timeout = srv_fatal_semaphore_wait_threshold;
931
for (i = 0; i < sync_primary_wait_array->n_cells; i++) {
933
cell = sync_array_get_nth_cell(sync_primary_wait_array, i);
935
if (cell->wait_object != NULL && cell->waiting
936
&& difftime(time(NULL), cell->reservation_time) > 240) {
937
fputs("InnoDB: Warning: a long semaphore wait:\n",
939
sync_array_cell_print(stderr, cell);
943
if (cell->wait_object != NULL && cell->waiting
944
&& difftime(time(NULL), cell->reservation_time)
952
"InnoDB: ###### Starts InnoDB Monitor"
953
" for 30 secs to print diagnostic info:\n");
954
old_val = srv_print_innodb_monitor;
956
/* If some crucial semaphore is reserved, then also the InnoDB
957
Monitor can hang, and we do not get diagnostics. Since in
958
many cases an InnoDB hang is caused by a pwrite() or a pread()
959
call hanging inside the operating system, let us print right
960
now the values of pending calls of these. */
963
"InnoDB: Pending preads %lu, pwrites %lu\n",
964
(ulong)os_file_n_pending_preads,
965
(ulong)os_file_n_pending_pwrites);
967
srv_print_innodb_monitor = TRUE;
968
os_event_set(srv_lock_timeout_thread_event);
970
os_thread_sleep(30000000);
972
srv_print_innodb_monitor = old_val;
974
"InnoDB: ###### Diagnostic info printed"
975
" to the standard error stream\n");
981
/**************************************************************************
982
Prints info of the wait array. */
985
sync_array_output_info(
986
/*===================*/
987
FILE* file, /* in: file where to print */
988
sync_array_t* arr) /* in: wait array; NOTE! caller must own the
996
"OS WAIT ARRAY INFO: reservation count %ld, signal count %ld\n",
997
(long) arr->res_count, (long) arr->sg_count);
1001
while (count < arr->n_reserved) {
1003
cell = sync_array_get_nth_cell(arr, i);
1005
if (cell->wait_object != NULL) {
1007
sync_array_cell_print(file, cell);
1014
/**************************************************************************
1015
Prints info of the wait array. */
1018
sync_array_print_info(
1019
/*==================*/
1020
FILE* file, /* in: file where to print */
1021
sync_array_t* arr) /* in: wait array */
1023
/* Wrapper around sync_array_output_info(): the array mutex is taken
before the call and released right after it. */
sync_array_enter(arr);
1025
sync_array_output_info(file, arr);
1027
sync_array_exit(arr);