19
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21
13
You should have received a copy of the GNU General Public License along with
22
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
23
St, Fifth Floor, Boston, MA 02110-1301 USA
14
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15
Place, Suite 330, Boston, MA 02111-1307 USA
25
17
*****************************************************************************/
18
/***********************************************************************
20
Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
21
Copyright (c) 2009, Percona Inc.
23
Portions of this file contain modifications contributed and copyrighted
24
by Percona Inc.. Those modifications are
25
gratefully acknowledged and are described briefly in the InnoDB
26
documentation. The contributions by Percona Inc. are incorporated with
27
their permission, and subject to the conditions contained in the file
30
This program is free software; you can redistribute it and/or modify it
31
under the terms of the GNU General Public License as published by the
32
Free Software Foundation; version 2 of the License.
34
This program is distributed in the hope that it will be useful, but
35
WITHOUT ANY WARRANTY; without even the implied warranty of
36
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
37
Public License for more details.
39
You should have received a copy of the GNU General Public License along
40
with this program; if not, write to the Free Software Foundation, Inc.,
41
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
43
***********************************************************************/
27
45
/**************************************************//**
91
100
/* In simulated aio, merge at most this many consecutive i/os */
92
101
#define OS_AIO_MERGE_N_CONSECUTIVE 64
94
/**********************************************************************
96
InnoDB AIO Implementation:
97
=========================
99
We support native AIO for Windows and Linux. For the rest of the platforms
100
we simulate AIO by special io-threads servicing the IO-requests.
105
On platforms where we 'simulate' AIO, the following is a rough explanation
106
of the high level design.
107
There are four io-threads (for ibuf, log, read, write).
108
All synchronous IO requests are serviced by the calling thread using
109
os_file_write/os_file_read. The Asynchronous requests are queued up
110
in an array (there are four such arrays) by the calling thread.
111
Later these requests are picked up by the io-thread and are serviced
117
If srv_use_native_aio is not set then Windows follows the same
118
code as simulated AIO. If the flag is set then native AIO interface
119
is used. On Windows, one of the limitations is that if a file is opened
120
for AIO no synchronous IO can be done on it. Therefore we have an
121
extra fifth array to queue up synchronous IO requests.
122
There are innodb_file_io_threads helper threads. These threads work
123
on the four arrays mentioned above in Simulated AIO. No thread is
124
required for the sync array.
125
If a synchronous IO request is made, it is first queued in the sync
126
array. Then the calling thread itself waits on the request, thus
127
making the call synchronous.
128
If an AIO request is made the calling thread not only queues it in the
129
array but also submits the requests. The helper thread then collects
130
the completed IO request and calls completion routine on it.
135
If we have libaio installed on the system and innodb_use_native_aio
136
is set to TRUE we follow the code path of native AIO, otherwise we
138
There are innodb_file_io_threads helper threads. These threads work
139
on the four arrays mentioned above in Simulated AIO.
140
If a synchronous IO request is made, it is handled by calling
141
os_file_write/os_file_read.
142
If an AIO request is made the calling thread not only queues it in the
143
array but also submits the requests. The helper thread then collects
144
the completed IO request and calls completion routine on it.
146
**********************************************************************/
103
/** If this flag is TRUE, then we will use the native aio of the
104
OS (provided we compiled Innobase with it in), otherwise we will
105
use simulated aio we build below with threads */
107
UNIV_INTERN ibool os_aio_use_native_aio = FALSE;
148
109
/** Flag: enable debug printout for asynchronous i/o */
149
110
UNIV_INTERN ibool os_aio_print_debug = FALSE;
152
/* Keys to register InnoDB I/O with performance schema */
153
UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
154
UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
155
UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
156
#endif /* UNIV_PFS_IO */
158
112
/** The asynchronous i/o array slot structure */
159
113
typedef struct os_aio_slot_struct os_aio_slot_t;
3107
2904
return((array->slots) + index);
3110
#if defined(LINUX_NATIVE_AIO)
3111
/******************************************************************//**
3112
Creates an io_context for native linux AIO.
3113
@return TRUE on success. */
3116
os_aio_linux_create_io_ctx(
3117
/*=======================*/
3118
ulint max_events, /*!< in: number of events. */
3119
io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
3125
memset(io_ctx, 0x0, sizeof(*io_ctx));
3127
/* Initialize the io_ctx. Tell it how many pending
3128
IO requests this context will handle. */
3130
ret = io_setup(max_events, io_ctx);
3132
#if defined(UNIV_AIO_DEBUG)
3134
"InnoDB: Linux native AIO:"
3135
" initialized io_ctx for segment\n");
3137
/* Success. Return now. */
3141
/* If we hit EAGAIN we'll make a few attempts before failing. */
3146
/* First time around. */
3147
ut_print_timestamp(stderr);
3149
" InnoDB: Warning: io_setup() failed"
3150
" with EAGAIN. Will make %d attempts"
3151
" before giving up.\n",
3152
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3155
if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3158
"InnoDB: Warning: io_setup() attempt"
3161
os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3165
/* Have tried enough. Better call it a day. */
3166
ut_print_timestamp(stderr);
3168
" InnoDB: Error: io_setup() failed"
3169
" with EAGAIN after %d attempts.\n",
3170
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3174
ut_print_timestamp(stderr);
3176
" InnoDB: Error: Linux Native AIO interface"
3177
" is not supported on this platform. Please"
3178
" check your OS documentation and install"
3179
" appropriate binary of InnoDB.\n");
3184
ut_print_timestamp(stderr);
3186
" InnoDB: Error: Linux Native AIO setup"
3187
" returned following error[%d]\n", -ret);
3192
"InnoDB: You can disable Linux Native AIO by"
3193
" setting innodb_native_aio = off in my.cnf\n");
3196
#endif /* LINUX_NATIVE_AIO */
3198
/******************************************************************//**
3199
Creates an aio wait array. Note that we return NULL in case of failure.
3200
We don't care about freeing memory here because we assume that a
3201
failure will result in server refusing to start up.
3202
@return own: aio array, NULL on failure */
2907
/************************************************************************//**
2908
Creates an aio wait array.
2909
@return own: aio array */
3204
2911
os_aio_array_t*
3205
2912
os_aio_array_create(
3206
2913
/*================*/
3207
ulint n, /*!< in: maximum number of pending aio
3208
operations allowed; n must be
3209
divisible by n_segments */
2914
ulint n, /*!< in: maximum number of pending aio operations
2915
allowed; n must be divisible by n_segments */
3210
2916
ulint n_segments) /*!< in: number of segments in the aio array */
3212
2918
os_aio_array_t* array;
3231
2935
array->n_slots = n;
3232
2936
array->n_segments = n_segments;
3233
2937
array->n_reserved = 0;
3235
array->slots = static_cast<os_aio_slot_t *>(ut_malloc(n * sizeof(os_aio_slot_t)));
2938
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
3237
array->handles = ut_malloc(n * sizeof(HANDLE));
2940
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
3240
#if defined(LINUX_NATIVE_AIO)
3241
array->aio_ctx = NULL;
3242
array->aio_events = NULL;
3244
/* If we are not using native aio interface then skip this
3245
part of initialization. */
3246
if (!srv_use_native_aio) {
3247
goto skip_native_aio;
3250
/* Initialize the io_context array. One io_context
3251
per segment in the array. */
3253
array->aio_ctx = (io_context**) ut_malloc(n_segments *
3254
sizeof(*array->aio_ctx));
3255
for (i = 0; i < n_segments; ++i) {
3256
if (!os_aio_linux_create_io_ctx(n/n_segments,
3257
&array->aio_ctx[i])) {
3258
/* If something bad happened during aio setup
3259
we should call it a day and return right away.
3260
We don't care about any leaks because a failure
3261
to initialize the io subsystem means that the
3262
server (or at least the InnoDB storage engine)
3263
is not going to startup. */
3268
/* Initialize the event array. One event per slot. */
3269
aio_event = (io_event*) ut_malloc(n * sizeof(io_event));
3270
memset(aio_event, 0x0, sizeof(io_event) * n);
3271
array->aio_events = aio_event;
3274
#endif /* LINUX_NATIVE_AIO */
3275
2942
for (i = 0; i < n; i++) {
3276
2943
slot = os_aio_array_get_nth_slot(array, i);
3279
2946
slot->reserved = FALSE;
3280
2947
#ifdef WIN_ASYNC_IO
3281
slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
2948
slot->event = os_event_create(NULL);
3283
2950
over = &(slot->control);
3285
over->hEvent = slot->handle;
3287
*((array->handles) + i) = over->hEvent;
3289
#elif defined(LINUX_NATIVE_AIO)
3291
memset(&slot->control, 0x0, sizeof(slot->control));
2952
over->hEvent = slot->event->handle;
2954
*((array->native_events) + i) = over->hEvent;
3300
/************************************************************************//**
3301
Frees an aio wait array. */
3306
os_aio_array_t* array) /*!< in, own: array to free */
3311
for (i = 0; i < array->n_slots; i++) {
3312
os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3313
CloseHandle(slot->handle);
3315
#endif /* WIN_ASYNC_IO */
3318
ut_free(array->handles);
3319
#endif /* __WIN__ */
3320
os_mutex_free(array->mutex);
3321
os_event_free(array->not_full);
3322
os_event_free(array->is_empty);
3324
#if defined(LINUX_NATIVE_AIO)
3325
if (srv_use_native_aio) {
3326
ut_free(array->aio_events);
3327
ut_free(array->aio_ctx);
3329
#endif /* LINUX_NATIVE_AIO */
3331
ut_free(array->slots);
3335
2961
/***********************************************************************
3336
2962
Initializes the asynchronous io system. Creates one array each for ibuf
3337
2963
and log i/o. Also creates one array each for read and write where each
3902
3391
os_event_reset(os_aio_segment_wait_events[g]);
3905
#endif /* __WIN__ */
3908
#if defined(LINUX_NATIVE_AIO)
3909
/*******************************************************************//**
3910
Dispatch an AIO request to the kernel.
3911
@return TRUE on success. */
3914
os_aio_linux_dispatch(
3915
/*==================*/
3916
os_aio_array_t* array, /*!< in: io request array. */
3917
os_aio_slot_t* slot) /*!< in: an already reserved slot. */
3923
ut_ad(slot != NULL);
3926
ut_a(slot->reserved);
3928
/* Find out what we are going to work with.
3929
The iocb struct is directly in the slot.
3930
The io_context is one per segment. */
3932
iocb = &slot->control;
3933
io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
3935
ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
3937
#if defined(UNIV_AIO_DEBUG)
3939
"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
3940
(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
3941
array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
3944
/* io_submit returns number of successfully
3945
queued requests or -errno. */
3946
if (UNIV_UNLIKELY(ret != 1)) {
3953
#endif /* LINUX_NATIVE_AIO */
3956
/*******************************************************************//**
3957
NOTE! Use the corresponding macro os_aio(), not directly this function!
3396
/*******************************************************************//**
3958
3397
Requests an asynchronous i/o operation.
3959
3398
@return TRUE if request was queued successfully, FALSE if fail */
3964
3403
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3965
3404
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
3966
3405
to OS_AIO_SIMULATED_WAKE_LATER: the
4264
3675
#ifdef UNIV_DO_FLUSH
4265
3676
if (slot->type == OS_FILE_WRITE
4266
3677
&& !os_do_not_call_flush_at_each_write) {
4267
if (!os_file_flush(slot->file)) {
3678
ut_a(TRUE == os_file_flush(slot->file));
4271
3680
#endif /* UNIV_DO_FLUSH */
4272
} else if (os_file_handle_error(slot->name, "Windows aio")) {
3682
os_file_handle_error(slot->name, "Windows aio");
4277
3684
ret_val = FALSE;
4280
3687
os_mutex_exit(array->mutex);
4283
/* retry failed read/write operation synchronously.
4284
No need to hold array->mutex. */
4287
/* This read/write does not go through os_file_read
4288
and os_file_write APIs, need to register with
4289
performance schema explicitly here. */
4290
struct PSI_file_locker* locker = NULL;
4291
register_pfs_file_io_begin(locker, slot->file, slot->len,
4292
(slot->type == OS_FILE_WRITE)
4295
__FILE__, __LINE__);
4298
ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
4300
switch (slot->type) {
4302
ret = WriteFile(slot->file, slot->buf,
4303
(DWORD) slot->len, &len,
4308
ret = ReadFile(slot->file, slot->buf,
4309
(DWORD) slot->len, &len,
4318
register_pfs_file_io_end(locker, len);
4321
if (!ret && GetLastError() == ERROR_IO_PENDING) {
4322
/* aio was queued successfully!
4323
We want a synchronous i/o operation on a
4324
file where we also use async i/o: in Windows
4325
we must use the same wait mechanism as for
4328
ret = GetOverlappedResult(slot->file,
4333
ret_val = ret && len == slot->len;
4336
3689
os_aio_array_free_slot(array, slot);
4338
3691
return(ret_val);
4342
#if defined(LINUX_NATIVE_AIO)
4343
/******************************************************************//**
4344
This function is only used in Linux native asynchronous i/o. This is
4345
called from within the io-thread. If there are no completed IO requests
4346
in the slot array, the thread calls this function to collect more
4347
requests from the kernel.
4348
The io-thread waits on io_getevents(), which is a blocking call, with
4349
a timeout value. Unless the system is very heavily loaded, keeping the
4350
io-thread very busy, the io-thread will spend most of its time waiting
4352
The io-thread also exits in this function. It checks server status at
4353
each wakeup and that is why we use timed wait in io_getevents(). */
4356
os_aio_linux_collect(
4357
/*=================*/
4358
os_aio_array_t* array, /*!< in/out: slot array. */
4359
ulint segment, /*!< in: local segment no. */
4360
ulint seg_size) /*!< in: segment size. */
4366
struct timespec timeout;
4367
struct io_event* events;
4368
struct io_context* io_ctx;
4370
/* sanity checks. */
4371
ut_ad(array != NULL);
4372
ut_ad(seg_size > 0);
4373
ut_ad(segment < array->n_segments);
4375
/* Which part of event array we are going to work on. */
4376
events = &array->aio_events[segment * seg_size];
4378
/* Which io_context we are going to use. */
4379
io_ctx = array->aio_ctx[segment];
4381
/* Starting point of the segment we will be working on. */
4382
start_pos = segment * seg_size;
4385
end_pos = start_pos + seg_size;
4389
/* Go down if we are in shutdown mode.
4390
In case of srv_fast_shutdown == 2, there may be pending
4391
IO requests but that should be OK as we essentially treat
4392
that as a crash of InnoDB. */
4393
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
4394
os_thread_exit(NULL);
4397
/* Initialize the events. The timeout value is arbitrary.
4398
We probably need to experiment with it a little. */
4399
memset(events, 0, sizeof(*events) * seg_size);
4401
timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4403
ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4405
/* This error handling is for any error in collecting the
4406
IO requests. The errors, if any, for any particular IO
4407
request are simply passed on to the calling routine. */
4409
/* Not enough resources! Try again. */
4410
if (ret == -EAGAIN) {
4414
/* Interrupted! I have tested the behaviour in case of an
4415
interrupt. If we have some completed IOs available then
4416
the return code will be the number of IOs. We get EINTR only
4417
if there are no completed IOs and we have been interrupted. */
4418
if (ret == -EINTR) {
4422
/* No pending request! Go back and check again. */
4427
/* All other errors! should cause a trap for now. */
4428
if (UNIV_UNLIKELY(ret < 0)) {
4429
ut_print_timestamp(stderr);
4431
" InnoDB: unexpected ret_code[%d] from"
4432
" io_getevents()!\n", ret);
4438
for (i = 0; i < ret; i++) {
4439
os_aio_slot_t* slot;
4440
struct iocb* control;
4442
control = (struct iocb *)events[i].obj;
4443
ut_a(control != NULL);
4445
slot = (os_aio_slot_t *) control->data;
4447
/* Some sanity checks. */
4449
ut_a(slot->reserved);
4451
#if defined(UNIV_AIO_DEBUG)
4453
"io_getevents[%c]: slot[%p] ctx[%p]"
4455
(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4456
slot, io_ctx, segment);
4459
/* We are not scribbling previous segment. */
4460
ut_a(slot->pos >= start_pos);
4462
/* We have not overstepped to next segment. */
4463
ut_a(slot->pos < end_pos);
4465
/* Mark this request as completed. The error handling
4466
will be done in the calling function. */
4467
os_mutex_enter(array->mutex);
4468
slot->n_bytes = events[i].res;
4469
slot->ret = events[i].res2;
4470
slot->io_already_done = TRUE;
4471
os_mutex_exit(array->mutex);
4477
/**********************************************************************//**
4478
This function is only used in Linux native asynchronous i/o.
4479
Waits for an aio operation to complete. This function is used to wait for
4480
the completed requests. The aio array of pending requests is divided
4481
into segments. The thread specifies which segment or slot it wants to wait
4482
for. NOTE: this function will also take care of freeing the aio slot,
4483
therefore no other thread is allowed to do the freeing!
4484
@return TRUE if the IO was successful */
4487
os_aio_linux_handle(
4488
/*================*/
4489
ulint global_seg, /*!< in: segment number in the aio array
4490
to wait for; segment 0 is the ibuf
4491
i/o thread, segment 1 is log i/o thread,
4492
then follow the non-ibuf read threads,
4493
and the last are the non-ibuf write
4495
fil_node_t**message1, /*!< out: the messages passed with the */
4496
void** message2, /*!< aio request; note that in case the
4497
aio operation failed, these output
4498
parameters are valid and can be used to
4499
restart the operation. */
4500
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4503
os_aio_array_t* array;
4504
os_aio_slot_t* slot;
4509
/* Should never be doing Sync IO here. */
4510
ut_a(global_seg != ULINT_UNDEFINED);
4512
/* Find the array and the local segment. */
4513
segment = os_aio_get_array_and_local_segment(&array, global_seg);
4514
n = array->n_slots / array->n_segments;
4516
/* Loop until we have found a completed request. */
4518
os_mutex_enter(array->mutex);
4519
for (i = 0; i < n; ++i) {
4520
slot = os_aio_array_get_nth_slot(
4521
array, i + segment * n);
4522
if (slot->reserved && slot->io_already_done) {
4523
/* Something for us to work on. */
4528
os_mutex_exit(array->mutex);
4530
/* We don't have any completed request.
4531
Wait for some request. Note that we return
4532
from wait iff we have found a request. */
4534
srv_set_io_thread_op_info(global_seg,
4535
"waiting for completed aio requests");
4536
os_aio_linux_collect(array, segment, n);
4540
/* Note that there may be more than one completed
4541
IO requests. We process them one at a time. We may have a case
4542
here to improve the performance slightly by dealing with all
4543
requests in one sweep. */
4544
srv_set_io_thread_op_info(global_seg,
4545
"processing completed aio requests");
4547
/* Ensure that we are scribbling only our segment. */
4550
ut_ad(slot != NULL);
4551
ut_ad(slot->reserved);
4552
ut_ad(slot->io_already_done);
4554
*message1 = slot->message1;
4555
*message2 = slot->message2;
4559
if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
4562
#ifdef UNIV_DO_FLUSH
4563
if (slot->type == OS_FILE_WRITE
4564
&& !os_do_not_call_flush_at_each_write)
4565
&& !os_file_flush(slot->file) {
4568
#endif /* UNIV_DO_FLUSH */
4572
/* os_file_handle_error does tell us if we should retry
4573
this IO. As it stands now, we don't do this retry when
4574
reaping requests from a different context than
4575
the dispatcher. This non-retry logic is the same for
4576
windows and linux native AIO.
4577
We should probably look into this to transparently
4578
re-submit the IO. */
4579
os_file_handle_error(slot->name, "Linux aio");
4584
os_mutex_exit(array->mutex);
4586
os_aio_array_free_slot(array, slot);
4590
#endif /* LINUX_NATIVE_AIO */
4592
3695
/**********************************************************************//**
4593
3696
Does simulated aio. This function should be called by an i/o-handler