19
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21
13
You should have received a copy of the GNU General Public License along with
22
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
23
St, Fifth Floor, Boston, MA 02110-1301 USA
14
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15
Place, Suite 330, Boston, MA 02111-1307 USA
25
17
*****************************************************************************/
18
/***********************************************************************
20
Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
21
Copyright (c) 2009, Percona Inc.
23
Portions of this file contain modifications contributed and copyrighted
24
by Percona Inc.. Those modifications are
25
gratefully acknowledged and are described briefly in the InnoDB
26
documentation. The contributions by Percona Inc. are incorporated with
27
their permission, and subject to the conditions contained in the file
30
This program is free software; you can redistribute it and/or modify it
31
under the terms of the GNU General Public License as published by the
32
Free Software Foundation; version 2 of the License.
34
This program is distributed in the hope that it will be useful, but
35
WITHOUT ANY WARRANTY; without even the implied warranty of
36
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
37
Public License for more details.
39
You should have received a copy of the GNU General Public License along
40
with this program; if not, write to the Free Software Foundation, Inc.,
41
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
43
***********************************************************************/
27
45
/**************************************************//**
91
100
/* In simulated aio, merge at most this many consecutive i/os */
92
101
#define OS_AIO_MERGE_N_CONSECUTIVE 64
94
/**********************************************************************
96
InnoDB AIO Implementation:
97
=========================
99
We support native AIO for windows and linux. For the rest of the platforms
100
we simulate AIO by special io-threads servicing the IO-requests.
105
In platforms where we 'simulate' AIO following is a rough explanation
106
of the high level design.
107
There are four io-threads (for ibuf, log, read, write).
108
All synchronous IO requests are serviced by the calling thread using
109
os_file_write/os_file_read. The Asynchronous requests are queued up
110
in an array (there are four such arrays) by the calling thread.
111
Later these requests are picked up by the io-thread and are serviced
117
If srv_use_native_aio is not set then windows follow the same
118
code as simulated AIO. If the flag is set then native AIO interface
119
is used. On windows, one of the limitations is that if a file is opened
120
for AIO no synchronous IO can be done on it. Therefore we have an
121
extra fifth array to queue up synchronous IO requests.
122
There are innodb_file_io_threads helper threads. These threads work
123
on the four arrays mentioned above in Simulated AIO. No thread is
124
required for the sync array.
125
If a synchronous IO request is made, it is first queued in the sync
126
array. Then the calling thread itself waits on the request, thus
127
making the call synchronous.
128
If an AIO request is made the calling thread not only queues it in the
129
array but also submits the requests. The helper thread then collects
130
the completed IO request and calls completion routine on it.
135
If we have libaio installed on the system and innodb_use_native_aio
136
is set to TRUE we follow the code path of native AIO, otherwise we
138
There are innodb_file_io_threads helper threads. These threads work
139
on the four arrays mentioned above in Simulated AIO.
140
If a synchronous IO request is made, it is handled by calling
141
os_file_write/os_file_read.
142
If an AIO request is made the calling thread not only queues it in the
143
array but also submits the requests. The helper thread then collects
144
the completed IO request and calls completion routine on it.
146
**********************************************************************/
103
/** If this flag is TRUE, then we will use the native aio of the
104
OS (provided we compiled Innobase with it in), otherwise we will
105
use simulated aio we build below with threads */
107
UNIV_INTERN ibool os_aio_use_native_aio = FALSE;
148
109
/** Flag: enable debug printout for asynchronous i/o */
149
110
UNIV_INTERN ibool os_aio_print_debug = FALSE;
152
/* Keys to register InnoDB I/O with performance schema */
153
UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
154
UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
155
UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
156
#endif /* UNIV_PFS_IO */
158
112
/** The asynchronous i/o array slot structure */
159
113
typedef struct os_aio_slot_struct os_aio_slot_t;
3098
2904
return((array->slots) + index);
3101
#if defined(LINUX_NATIVE_AIO)
3102
/******************************************************************//**
3103
Creates an io_context for native linux AIO.
3104
@return TRUE on success. */
3107
os_aio_linux_create_io_ctx(
3108
/*=======================*/
3109
ulint max_events, /*!< in: number of events. */
3110
io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
3116
memset(io_ctx, 0x0, sizeof(*io_ctx));
3118
/* Initialize the io_ctx. Tell it how many pending
3119
IO requests this context will handle. */
3121
ret = io_setup(max_events, io_ctx);
3123
#if defined(UNIV_AIO_DEBUG)
3125
"InnoDB: Linux native AIO:"
3126
" initialized io_ctx for segment\n");
3128
/* Success. Return now. */
3132
/* If we hit EAGAIN we'll make a few attempts before failing. */
3137
/* First time around. */
3138
ut_print_timestamp(stderr);
3140
" InnoDB: Warning: io_setup() failed"
3141
" with EAGAIN. Will make %d attempts"
3142
" before giving up.\n",
3143
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3146
if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3149
"InnoDB: Warning: io_setup() attempt"
3152
os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3156
/* Have tried enough. Better call it a day. */
3157
ut_print_timestamp(stderr);
3159
" InnoDB: Error: io_setup() failed"
3160
" with EAGAIN after %d attempts.\n",
3161
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3165
ut_print_timestamp(stderr);
3167
" InnoDB: Error: Linux Native AIO interface"
3168
" is not supported on this platform. Please"
3169
" check your OS documentation and install"
3170
" appropriate binary of InnoDB.\n");
3175
ut_print_timestamp(stderr);
3177
" InnoDB: Error: Linux Native AIO setup"
3178
" returned following error[%d]\n", -ret);
3183
"InnoDB: You can disable Linux Native AIO by"
3184
" setting innodb_native_aio = off in my.cnf\n");
3187
#endif /* LINUX_NATIVE_AIO */
3189
/******************************************************************//**
3190
Creates an aio wait array. Note that we return NULL in case of failure.
3191
We don't care about freeing memory here because we assume that a
3192
failure will result in server refusing to start up.
3193
@return own: aio array, NULL on failure */
2907
/************************************************************************//**
2908
Creates an aio wait array.
2909
@return own: aio array */
3195
2911
os_aio_array_t*
3196
2912
os_aio_array_create(
3197
2913
/*================*/
3198
ulint n, /*!< in: maximum number of pending aio
3199
operations allowed; n must be
3200
divisible by n_segments */
2914
ulint n, /*!< in: maximum number of pending aio operations
2915
allowed; n must be divisible by n_segments */
3201
2916
ulint n_segments) /*!< in: number of segments in the aio array */
3203
2918
os_aio_array_t* array;
3222
2935
array->n_slots = n;
3223
2936
array->n_segments = n_segments;
3224
2937
array->n_reserved = 0;
3226
2938
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
3228
array->handles = ut_malloc(n * sizeof(HANDLE));
2940
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
3231
#if defined(LINUX_NATIVE_AIO)
3232
array->aio_ctx = NULL;
3233
array->aio_events = NULL;
3235
/* If we are not using native aio interface then skip this
3236
part of initialization. */
3237
if (!srv_use_native_aio) {
3238
goto skip_native_aio;
3241
/* Initialize the io_context array. One io_context
3242
per segment in the array. */
3244
array->aio_ctx = ut_malloc(n_segments *
3245
sizeof(*array->aio_ctx));
3246
for (i = 0; i < n_segments; ++i) {
3247
if (!os_aio_linux_create_io_ctx(n/n_segments,
3248
&array->aio_ctx[i])) {
3249
/* If something bad happened during aio setup
3250
we should call it a day and return right away.
3251
We don't care about any leaks because a failure
3252
to initialize the io subsystem means that the
3253
server (or at least the innodb storage engine)
3254
is not going to startup. */
3259
/* Initialize the event array. One event per slot. */
3260
io_event = ut_malloc(n * sizeof(*io_event));
3261
memset(io_event, 0x0, sizeof(*io_event) * n);
3262
array->aio_events = io_event;
3265
#endif /* LINUX_NATIVE_AIO */
3266
2942
for (i = 0; i < n; i++) {
3267
2943
slot = os_aio_array_get_nth_slot(array, i);
3270
2946
slot->reserved = FALSE;
3271
2947
#ifdef WIN_ASYNC_IO
3272
slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
2948
slot->event = os_event_create(NULL);
3274
2950
over = &(slot->control);
3276
over->hEvent = slot->handle;
3278
*((array->handles) + i) = over->hEvent;
3280
#elif defined(LINUX_NATIVE_AIO)
3282
memset(&slot->control, 0x0, sizeof(slot->control));
2952
over->hEvent = slot->event->handle;
2954
*((array->native_events) + i) = over->hEvent;
3291
/************************************************************************//**
3292
Frees an aio wait array. */
3297
os_aio_array_t* array) /*!< in, own: array to free */
3302
for (i = 0; i < array->n_slots; i++) {
3303
os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3304
CloseHandle(slot->handle);
3306
#endif /* WIN_ASYNC_IO */
3309
ut_free(array->handles);
3310
#endif /* __WIN__ */
3311
os_mutex_free(array->mutex);
3312
os_event_free(array->not_full);
3313
os_event_free(array->is_empty);
3315
#if defined(LINUX_NATIVE_AIO)
3316
if (srv_use_native_aio) {
3317
ut_free(array->aio_events);
3318
ut_free(array->aio_ctx);
3320
#endif /* LINUX_NATIVE_AIO */
3322
ut_free(array->slots);
3326
2961
/***********************************************************************
3327
2962
Initializes the asynchronous io system. Creates one array each for ibuf
3328
2963
and log i/o. Also creates one array each for read and write where each
3889
3391
os_event_reset(os_aio_segment_wait_events[g]);
3892
#endif /* __WIN__ */
3895
#if defined(LINUX_NATIVE_AIO)
3896
/*******************************************************************//**
3897
Dispatch an AIO request to the kernel.
3898
@return TRUE on success. */
3901
os_aio_linux_dispatch(
3902
/*==================*/
3903
os_aio_array_t* array, /*!< in: io request array. */
3904
os_aio_slot_t* slot) /*!< in: an already reserved slot. */
3910
ut_ad(slot != NULL);
3913
ut_a(slot->reserved);
3915
/* Find out what we are going to work with.
3916
The iocb struct is directly in the slot.
3917
The io_context is one per segment. */
3919
iocb = &slot->control;
3920
io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
3922
ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
3924
#if defined(UNIV_AIO_DEBUG)
3926
"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
3927
(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
3928
array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
3931
/* io_submit returns number of successfully
3932
queued requests or -errno. */
3933
if (UNIV_UNLIKELY(ret != 1)) {
3940
#endif /* LINUX_NATIVE_AIO */
3943
/*******************************************************************//**
3944
NOTE! Use the corresponding macro os_aio(), not directly this function!
3396
/*******************************************************************//**
3945
3397
Requests an asynchronous i/o operation.
3946
3398
@return TRUE if request was queued successfully, FALSE if fail */
3951
3403
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3952
3404
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
3953
3405
to OS_AIO_SIMULATED_WAKE_LATER: the
4248
3675
#ifdef UNIV_DO_FLUSH
4249
3676
if (slot->type == OS_FILE_WRITE
4250
3677
&& !os_do_not_call_flush_at_each_write) {
4251
if (!os_file_flush(slot->file)) {
3678
ut_a(TRUE == os_file_flush(slot->file));
4255
3680
#endif /* UNIV_DO_FLUSH */
4256
} else if (os_file_handle_error(slot->name, "Windows aio")) {
3682
os_file_handle_error(slot->name, "Windows aio");
4261
3684
ret_val = FALSE;
4264
3687
os_mutex_exit(array->mutex);
4267
/* retry failed read/write operation synchronously.
4268
No need to hold array->mutex. */
4271
/* This read/write does not go through os_file_read
4272
and os_file_write APIs, need to register with
4273
performance schema explicitly here. */
4274
struct PSI_file_locker* locker = NULL;
4275
register_pfs_file_io_begin(locker, slot->file, slot->len,
4276
(slot->type == OS_FILE_WRITE)
4279
__FILE__, __LINE__);
4282
switch (slot->type) {
4284
ret = WriteFile(slot->file, slot->buf,
4290
ret = ReadFile(slot->file, slot->buf,
4300
register_pfs_file_io_end(locker, len);
4303
if (!ret && GetLastError() == ERROR_IO_PENDING) {
4304
/* aio was queued successfully!
4305
We want a synchronous i/o operation on a
4306
file where we also use async i/o: in Windows
4307
we must use the same wait mechanism as for
4310
ret = GetOverlappedResult(slot->file,
4315
ret_val = ret && len == slot->len;
4318
3689
os_aio_array_free_slot(array, slot);
4320
3691
return(ret_val);
4324
#if defined(LINUX_NATIVE_AIO)
4325
/******************************************************************//**
4326
This function is only used in Linux native asynchronous i/o. This is
4327
called from within the io-thread. If there are no completed IO requests
4328
in the slot array, the thread calls this function to collect more
4329
requests from the kernel.
4330
The io-thread waits on io_getevents(), which is a blocking call, with
4331
a timeout value. Unless the system is very heavily loaded, keeping the
4332
io-thread very busy, the io-thread will spend most of its time waiting
4334
The io-thread also exits in this function. It checks server status at
4335
each wakeup and that is why we use timed wait in io_getevents(). */
4338
os_aio_linux_collect(
4339
/*=================*/
4340
os_aio_array_t* array, /*!< in/out: slot array. */
4341
ulint segment, /*!< in: local segment no. */
4342
ulint seg_size) /*!< in: segment size. */
4348
struct timespec timeout;
4349
struct io_event* events;
4350
struct io_context* io_ctx;
4352
/* sanity checks. */
4353
ut_ad(array != NULL);
4354
ut_ad(seg_size > 0);
4355
ut_ad(segment < array->n_segments);
4357
/* Which part of event array we are going to work on. */
4358
events = &array->aio_events[segment * seg_size];
4360
/* Which io_context we are going to use. */
4361
io_ctx = array->aio_ctx[segment];
4363
/* Starting point of the segment we will be working on. */
4364
start_pos = segment * seg_size;
4367
end_pos = start_pos + seg_size;
4371
/* Go down if we are in shutdown mode.
4372
In case of srv_fast_shutdown == 2, there may be pending
4373
IO requests but that should be OK as we essentially treat
4374
that as a crash of InnoDB. */
4375
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
4376
os_thread_exit(NULL);
4379
/* Initialize the events. The timeout value is arbitrary.
4380
We probably need to experiment with it a little. */
4381
memset(events, 0, sizeof(*events) * seg_size);
4383
timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4385
ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4387
/* This error handling is for any error in collecting the
4388
IO requests. The errors, if any, for any particular IO
4389
request are simply passed on to the calling routine. */
4391
/* Not enough resources! Try again. */
4392
if (ret == -EAGAIN) {
4396
/* Interrupted! I have tested the behaviour in case of an
4397
interrupt. If we have some completed IOs available then
4398
the return code will be the number of IOs. We get EINTR only
4399
if there are no completed IOs and we have been interrupted. */
4400
if (ret == -EINTR) {
4404
/* No pending request! Go back and check again. */
4409
/* All other errors! should cause a trap for now. */
4410
if (UNIV_UNLIKELY(ret < 0)) {
4411
ut_print_timestamp(stderr);
4413
" InnoDB: unexpected ret_code[%d] from"
4414
" io_getevents()!\n", ret);
4420
for (i = 0; i < ret; i++) {
4421
os_aio_slot_t* slot;
4422
struct iocb* control;
4424
control = (struct iocb *)events[i].obj;
4425
ut_a(control != NULL);
4427
slot = (os_aio_slot_t *) control->data;
4429
/* Some sanity checks. */
4431
ut_a(slot->reserved);
4433
#if defined(UNIV_AIO_DEBUG)
4435
"io_getevents[%c]: slot[%p] ctx[%p]"
4437
(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4438
slot, io_ctx, segment);
4441
/* We are not scribbling previous segment. */
4442
ut_a(slot->pos >= start_pos);
4444
/* We have not overstepped to next segment. */
4445
ut_a(slot->pos < end_pos);
4447
/* Mark this request as completed. The error handling
4448
will be done in the calling function. */
4449
os_mutex_enter(array->mutex);
4450
slot->n_bytes = events[i].res;
4451
slot->ret = events[i].res2;
4452
slot->io_already_done = TRUE;
4453
os_mutex_exit(array->mutex);
4459
/**********************************************************************//**
4460
This function is only used in Linux native asynchronous i/o.
4461
Waits for an aio operation to complete. This function is used to wait for
4462
the completed requests. The aio array of pending requests is divided
4463
into segments. The thread specifies which segment or slot it wants to wait
4464
for. NOTE: this function will also take care of freeing the aio slot,
4465
therefore no other thread is allowed to do the freeing!
4466
@return TRUE if the IO was successful */
4469
os_aio_linux_handle(
4470
/*================*/
4471
ulint global_seg, /*!< in: segment number in the aio array
4472
to wait for; segment 0 is the ibuf
4473
i/o thread, segment 1 is log i/o thread,
4474
then follow the non-ibuf read threads,
4475
and the last are the non-ibuf write
4477
fil_node_t**message1, /*!< out: the messages passed with the */
4478
void** message2, /*!< aio request; note that in case the
4479
aio operation failed, these output
4480
parameters are valid and can be used to
4481
restart the operation. */
4482
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4485
os_aio_array_t* array;
4486
os_aio_slot_t* slot;
4491
/* Should never be doing Sync IO here. */
4492
ut_a(global_seg != ULINT_UNDEFINED);
4494
/* Find the array and the local segment. */
4495
segment = os_aio_get_array_and_local_segment(&array, global_seg);
4496
n = array->n_slots / array->n_segments;
4498
/* Loop until we have found a completed request. */
4500
os_mutex_enter(array->mutex);
4501
for (i = 0; i < n; ++i) {
4502
slot = os_aio_array_get_nth_slot(
4503
array, i + segment * n);
4504
if (slot->reserved && slot->io_already_done) {
4505
/* Something for us to work on. */
4510
os_mutex_exit(array->mutex);
4512
/* We don't have any completed request.
4513
Wait for some request. Note that we return
4514
from wait iff we have found a request. */
4516
srv_set_io_thread_op_info(global_seg,
4517
"waiting for completed aio requests");
4518
os_aio_linux_collect(array, segment, n);
4522
/* Note that it may be that there are more than one completed
4523
IO requests. We process them one at a time. We may have a case
4524
here to improve the performance slightly by dealing with all
4525
requests in one sweep. */
4526
srv_set_io_thread_op_info(global_seg,
4527
"processing completed aio requests");
4529
/* Ensure that we are scribbling only our segment. */
4532
ut_ad(slot != NULL);
4533
ut_ad(slot->reserved);
4534
ut_ad(slot->io_already_done);
4536
*message1 = slot->message1;
4537
*message2 = slot->message2;
4541
if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
4544
#ifdef UNIV_DO_FLUSH
4545
if (slot->type == OS_FILE_WRITE
4546
&& !os_do_not_call_flush_at_each_write)
4547
&& !os_file_flush(slot->file) {
4550
#endif /* UNIV_DO_FLUSH */
4554
/* os_file_handle_error does tell us if we should retry
4555
this IO. As it stands now, we don't do this retry when
4556
reaping requests from a different context than
4557
the dispatcher. This non-retry logic is the same for
4558
windows and linux native AIO.
4559
We should probably look into this to transparently
4560
re-submit the IO. */
4561
os_file_handle_error(slot->name, "Linux aio");
4566
os_mutex_exit(array->mutex);
4568
os_aio_array_free_slot(array, slot);
4572
#endif /* LINUX_NATIVE_AIO */
4574
3695
/**********************************************************************//**
4575
3696
Does simulated aio. This function should be called by an i/o-handler