84
91
/* In simulated aio, merge at most this many consecutive i/os */
85
92
#define OS_AIO_MERGE_N_CONSECUTIVE 64
87
/** If this flag is TRUE, then we will use the native aio of the
88
OS (provided we compiled Innobase with it in), otherwise we will
89
use simulated aio we build below with threads */
91
UNIV_INTERN ibool os_aio_use_native_aio = FALSE;
94
/**********************************************************************
96
InnoDB AIO Implementation:
97
=========================
99
We support native AIO for Windows and Linux. For the rest of the platforms
100
we simulate AIO by special io-threads servicing the IO-requests.
105
On platforms where we 'simulate' AIO, the following is a rough explanation
106
of the high level design.
107
There are four io-threads (for ibuf, log, read, write).
108
All synchronous IO requests are serviced by the calling thread using
109
os_file_write/os_file_read. The Asynchronous requests are queued up
110
in an array (there are four such arrays) by the calling thread.
111
Later these requests are picked up by the io-thread and are serviced
117
If srv_use_native_aio is not set then Windows follows the same
118
code as simulated AIO. If the flag is set then native AIO interface
119
is used. On Windows, one of the limitations is that if a file is opened
120
for AIO no synchronous IO can be done on it. Therefore we have an
121
extra fifth array to queue up synchronous IO requests.
122
There are innodb_file_io_threads helper threads. These threads work
123
on the four arrays mentioned above in Simulated AIO. No thread is
124
required for the sync array.
125
If a synchronous IO request is made, it is first queued in the sync
126
array. Then the calling thread itself waits on the request, thus
127
making the call synchronous.
128
If an AIO request is made the calling thread not only queues it in the
129
array but also submits the requests. The helper thread then collects
130
the completed IO request and calls completion routine on it.
135
If we have libaio installed on the system and innodb_use_native_aio
136
is set to TRUE we follow the code path of native AIO, otherwise we
138
There are innodb_file_io_threads helper threads. These threads work
139
on the four arrays mentioned above in Simulated AIO.
140
If a synchronous IO request is made, it is handled by calling
141
os_file_write/os_file_read.
142
If an AIO request is made the calling thread not only queues it in the
143
array but also submits the requests. The helper thread then collects
144
the completed IO request and calls completion routine on it.
146
**********************************************************************/
93
148
/** Flag: enable debug printout for asynchronous i/o */
94
149
UNIV_INTERN ibool os_aio_print_debug = FALSE;
152
/* Keys to register InnoDB I/O with performance schema */
153
UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
154
UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
155
UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
156
#endif /* UNIV_PFS_IO */
96
158
/** The asynchronous i/o array slot structure */
97
159
typedef struct os_aio_slot_struct os_aio_slot_t;
2993
3108
return((array->slots) + index);
2996
/************************************************************************//**
2997
Creates an aio wait array.
2998
@return own: aio array */
3111
#if defined(LINUX_NATIVE_AIO)
3112
/******************************************************************//**
3113
Creates an io_context for native linux AIO.
3114
@return TRUE on success. */
3117
os_aio_linux_create_io_ctx(
3118
/*=======================*/
3119
ulint max_events, /*!< in: number of events. */
3120
io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
3126
memset(io_ctx, 0x0, sizeof(*io_ctx));
3128
/* Initialize the io_ctx. Tell it how many pending
3129
IO requests this context will handle. */
3131
ret = io_setup(max_events, io_ctx);
3133
#if defined(UNIV_AIO_DEBUG)
3135
"InnoDB: Linux native AIO:"
3136
" initialized io_ctx for segment\n");
3138
/* Success. Return now. */
3142
/* If we hit EAGAIN we'll make a few attempts before failing. */
3147
/* First time around. */
3148
ut_print_timestamp(stderr);
3150
" InnoDB: Warning: io_setup() failed"
3151
" with EAGAIN. Will make %d attempts"
3152
" before giving up.\n",
3153
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3156
if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3159
"InnoDB: Warning: io_setup() attempt"
3162
os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3166
/* Have tried enough. Better call it a day. */
3167
ut_print_timestamp(stderr);
3169
" InnoDB: Error: io_setup() failed"
3170
" with EAGAIN after %d attempts.\n",
3171
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3175
ut_print_timestamp(stderr);
3177
" InnoDB: Error: Linux Native AIO interface"
3178
" is not supported on this platform. Please"
3179
" check your OS documentation and install"
3180
" appropriate binary of InnoDB.\n");
3185
ut_print_timestamp(stderr);
3187
" InnoDB: Error: Linux Native AIO setup"
3188
" returned following error[%d]\n", -ret);
3193
"InnoDB: You can disable Linux Native AIO by"
3194
" setting innodb_native_aio = off in my.cnf\n");
3197
#endif /* LINUX_NATIVE_AIO */
3199
/******************************************************************//**
3200
Creates an aio wait array. Note that we return NULL in case of failure.
3201
We don't care about freeing memory here because we assume that a
3202
failure will result in server refusing to start up.
3203
@return own: aio array, NULL on failure */
3000
3205
os_aio_array_t*
3001
3206
os_aio_array_create(
3002
3207
/*================*/
3003
ulint n, /*!< in: maximum number of pending aio operations
3004
allowed; n must be divisible by n_segments */
3208
ulint n, /*!< in: maximum number of pending aio
3209
operations allowed; n must be
3210
divisible by n_segments */
3005
3211
ulint n_segments) /*!< in: number of segments in the aio array */
3007
3213
os_aio_array_t* array;
3024
3232
array->n_slots = n;
3025
3233
array->n_segments = n_segments;
3026
3234
array->n_reserved = 0;
3027
3236
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
3029
3238
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
3241
#if defined(LINUX_NATIVE_AIO)
3242
/* If we are not using native aio interface then skip this
3243
part of initialization. */
3244
if (!srv_use_native_aio) {
3245
goto skip_native_aio;
3248
/* Initialize the io_context array. One io_context
3249
per segment in the array. */
3251
array->aio_ctx = ut_malloc(n_segments *
3252
sizeof(*array->aio_ctx));
3253
for (i = 0; i < n_segments; ++i) {
3254
if (!os_aio_linux_create_io_ctx(n/n_segments,
3255
&array->aio_ctx[i])) {
3256
/* If something bad happened during aio setup
3257
we should call it a day and return right away.
3258
We don't care about any leaks because a failure
3259
to initialize the io subsystem means that the
3260
server (or at least the InnoDB storage engine)
3261
is not going to start up. */
3266
/* Initialize the event array. One event per slot. */
3267
io_event = ut_malloc(n * sizeof(*io_event));
3268
memset(io_event, 0x0, sizeof(*io_event) * n);
3269
array->aio_events = io_event;
3272
#endif /* LINUX_NATIVE_AIO */
3031
3273
for (i = 0; i < n; i++) {
3032
3274
slot = os_aio_array_get_nth_slot(array, i);
3552
3892
#endif /* __WIN__ */
3555
/*******************************************************************//**
3895
#if defined(LINUX_NATIVE_AIO)
3896
/*******************************************************************//**
3897
Dispatch an AIO request to the kernel.
3898
@return TRUE on success. */
3901
os_aio_linux_dispatch(
3902
/*==================*/
3903
os_aio_array_t* array, /*!< in: io request array. */
3904
os_aio_slot_t* slot) /*!< in: an already reserved slot. */
3910
ut_ad(slot != NULL);
3913
ut_a(slot->reserved);
3915
/* Find out what we are going to work with.
3916
The iocb struct is directly in the slot.
3917
The io_context is one per segment. */
3919
iocb = &slot->control;
3920
io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
3922
ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
3924
#if defined(UNIV_AIO_DEBUG)
3926
"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
3927
(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
3928
array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
3931
/* io_submit returns number of successfully
3932
queued requests or -errno. */
3933
if (UNIV_UNLIKELY(ret != 1)) {
3940
#endif /* LINUX_NATIVE_AIO */
3943
/*******************************************************************//**
3944
NOTE! Use the corresponding macro os_aio(), not directly this function!
3556
3945
Requests an asynchronous i/o operation.
3557
3946
@return TRUE if request was queued successfully, FALSE if fail */
3562
3951
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3563
3952
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
3564
3953
to OS_AIO_SIMULATED_WAKE_LATER: the
4313
#if defined(LINUX_NATIVE_AIO)
4314
/******************************************************************//**
4315
This function is only used in Linux native asynchronous i/o. This is
4316
called from within the io-thread. If there are no completed IO requests
4317
in the slot array, the thread calls this function to collect more
4318
requests from the kernel.
4319
The io-thread waits on io_getevents(), which is a blocking call, with
4320
a timeout value. Unless the system is very heavily loaded, keeping the
4321
io-thread very busy, the io-thread will spend most of its time waiting
4323
The io-thread also exits in this function. It checks server status at
4324
each wakeup and that is why we use timed wait in io_getevents(). */
4327
os_aio_linux_collect(
4328
/*=================*/
4329
os_aio_array_t* array, /*!< in/out: slot array. */
4330
ulint segment, /*!< in: local segment no. */
4331
ulint seg_size) /*!< in: segment size. */
4337
struct timespec timeout;
4338
struct io_event* events;
4339
struct io_context* io_ctx;
4341
/* sanity checks. */
4342
ut_ad(array != NULL);
4343
ut_ad(seg_size > 0);
4344
ut_ad(segment < array->n_segments);
4346
/* Which part of event array we are going to work on. */
4347
events = &array->aio_events[segment * seg_size];
4349
/* Which io_context we are going to use. */
4350
io_ctx = array->aio_ctx[segment];
4352
/* Starting point of the segment we will be working on. */
4353
start_pos = segment * seg_size;
4356
end_pos = start_pos + seg_size;
4360
/* Go down if we are in shutdown mode.
4361
In case of srv_fast_shutdown == 2, there may be pending
4362
IO requests but that should be OK as we essentially treat
4363
that as a crash of InnoDB. */
4364
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
4365
os_thread_exit(NULL);
4368
/* Initialize the events. The timeout value is arbitrary.
4369
We probably need to experiment with it a little. */
4370
memset(events, 0, sizeof(*events) * seg_size);
4372
timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4374
ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4376
/* This error handling is for any error in collecting the
4377
IO requests. The errors, if any, for any particular IO
4378
request are simply passed on to the calling routine. */
4380
/* Not enough resources! Try again. */
4381
if (ret == -EAGAIN) {
4385
/* Interrupted! I have tested the behaviour in case of an
4386
interrupt. If we have some completed IOs available then
4387
the return code will be the number of IOs. We get EINTR only
4388
if there are no completed IOs and we have been interrupted. */
4389
if (ret == -EINTR) {
4393
/* No pending request! Go back and check again. */
4398
/* All other errors! should cause a trap for now. */
4399
if (UNIV_UNLIKELY(ret < 0)) {
4400
ut_print_timestamp(stderr);
4402
" InnoDB: unexpected ret_code[%d] from"
4403
" io_getevents()!\n", ret);
4409
for (i = 0; i < ret; i++) {
4410
os_aio_slot_t* slot;
4411
struct iocb* control;
4413
control = (struct iocb *)events[i].obj;
4414
ut_a(control != NULL);
4416
slot = (os_aio_slot_t *) control->data;
4418
/* Some sanity checks. */
4420
ut_a(slot->reserved);
4422
#if defined(UNIV_AIO_DEBUG)
4424
"io_getevents[%c]: slot[%p] ctx[%p]"
4426
(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4427
slot, io_ctx, segment);
4430
/* We are not scribbling previous segment. */
4431
ut_a(slot->pos >= start_pos);
4433
/* We have not overstepped to next segment. */
4434
ut_a(slot->pos < end_pos);
4436
/* Mark this request as completed. The error handling
4437
will be done in the calling function. */
4438
os_mutex_enter(array->mutex);
4439
slot->n_bytes = events[i].res;
4440
slot->ret = events[i].res2;
4441
slot->io_already_done = TRUE;
4442
os_mutex_exit(array->mutex);
4448
/**********************************************************************//**
4449
This function is only used in Linux native asynchronous i/o.
4450
Waits for an aio operation to complete. This function is used to wait for
4451
the completed requests. The aio array of pending requests is divided
4452
into segments. The thread specifies which segment or slot it wants to wait
4453
for. NOTE: this function will also take care of freeing the aio slot,
4454
therefore no other thread is allowed to do the freeing!
4455
@return TRUE if the IO was successful */
4458
os_aio_linux_handle(
4459
/*================*/
4460
ulint global_seg, /*!< in: segment number in the aio array
4461
to wait for; segment 0 is the ibuf
4462
i/o thread, segment 1 is log i/o thread,
4463
then follow the non-ibuf read threads,
4464
and the last are the non-ibuf write
4466
fil_node_t**message1, /*!< out: the messages passed with the */
4467
void** message2, /*!< aio request; note that in case the
4468
aio operation failed, these output
4469
parameters are valid and can be used to
4470
restart the operation. */
4471
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4474
os_aio_array_t* array;
4475
os_aio_slot_t* slot;
4480
/* Should never be doing Sync IO here. */
4481
ut_a(global_seg != ULINT_UNDEFINED);
4483
/* Find the array and the local segment. */
4484
segment = os_aio_get_array_and_local_segment(&array, global_seg);
4485
n = array->n_slots / array->n_segments;
4487
/* Loop until we have found a completed request. */
4489
os_mutex_enter(array->mutex);
4490
for (i = 0; i < n; ++i) {
4491
slot = os_aio_array_get_nth_slot(
4492
array, i + segment * n);
4493
if (slot->reserved && slot->io_already_done) {
4494
/* Something for us to work on. */
4499
os_mutex_exit(array->mutex);
4501
/* We don't have any completed request.
4502
Wait for some request. Note that we return
4503
from wait iff we have found a request. */
4505
srv_set_io_thread_op_info(global_seg,
4506
"waiting for completed aio requests");
4507
os_aio_linux_collect(array, segment, n);
4511
/* Note that it may be that there are more than one completed
4512
IO requests. We process them one at a time. We may have a case
4513
here to improve the performance slightly by dealing with all
4514
requests in one sweep. */
4515
srv_set_io_thread_op_info(global_seg,
4516
"processing completed aio requests");
4518
/* Ensure that we are scribbling only our segment. */
4521
ut_ad(slot != NULL);
4522
ut_ad(slot->reserved);
4523
ut_ad(slot->io_already_done);
4525
*message1 = slot->message1;
4526
*message2 = slot->message2;
4530
if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
4533
#ifdef UNIV_DO_FLUSH
4534
if (slot->type == OS_FILE_WRITE
4535
&& !os_do_not_call_flush_at_each_write)
4536
&& !os_file_flush(slot->file) {
4539
#endif /* UNIV_DO_FLUSH */
4543
/* os_file_handle_error does tell us if we should retry
4544
this IO. As it stands now, we don't do this retry when
4545
reaping requests from a different context than
4546
the dispatcher. This non-retry logic is the same for
4547
windows and linux native AIO.
4548
We should probably look into this to transparently
4549
re-submit the IO. */
4550
os_file_handle_error(slot->name, "Linux aio");
4555
os_mutex_exit(array->mutex);
4557
os_aio_array_free_slot(array, slot);
4561
#endif /* LINUX_NATIVE_AIO */
3893
4563
/**********************************************************************//**
3894
4564
Does simulated aio. This function should be called by an i/o-handler