91
65
/* In simulated aio, merge at most this many consecutive i/os */
92
66
#define OS_AIO_MERGE_N_CONSECUTIVE 64
94
/**********************************************************************
96
InnoDB AIO Implementation:
97
=========================
99
We support native AIO for Windows and Linux. For the rest of the platforms
100
we simulate AIO by special io-threads servicing the IO-requests.
105
On platforms where we 'simulate' AIO, the following is a rough explanation
106
of the high level design.
107
There are four io-threads (for ibuf, log, read, write).
108
All synchronous IO requests are serviced by the calling thread using
109
os_file_write/os_file_read. The Asynchronous requests are queued up
110
in an array (there are four such arrays) by the calling thread.
111
Later these requests are picked up by the io-thread and are serviced
117
If srv_use_native_aio is not set then Windows follows the same
118
code as simulated AIO. If the flag is set then native AIO interface
119
is used. On Windows, one of the limitations is that if a file is opened
120
for AIO no synchronous IO can be done on it. Therefore we have an
121
extra fifth array to queue up synchronous IO requests.
122
There are innodb_file_io_threads helper threads. These threads work
123
on the four arrays mentioned above in Simulated AIO. No thread is
124
required for the sync array.
125
If a synchronous IO request is made, it is first queued in the sync
126
array. Then the calling thread itself waits on the request, thus
127
making the call synchronous.
128
If an AIO request is made the calling thread not only queues it in the
129
array but also submits the requests. The helper thread then collects
130
the completed IO request and calls completion routine on it.
135
If we have libaio installed on the system and innodb_use_native_aio
136
is set to TRUE we follow the code path of native AIO, otherwise we
138
There are innodb_file_io_threads helper threads. These threads work
139
on the four arrays mentioned above in Simulated AIO.
140
If a synchronous IO request is made, it is handled by calling
141
os_file_write/os_file_read.
142
If an AIO request is made the calling thread not only queues it in the
143
array but also submits the requests. The helper thread then collects
144
the completed IO request and calls completion routine on it.
146
**********************************************************************/
148
/** Flag: enable debug printout for asynchronous i/o */
68
/* If this flag is TRUE, then we will use the native aio of the
69
OS (provided we compiled Innobase with it in), otherwise we will
70
use simulated aio we build below with threads */
72
UNIV_INTERN ibool os_aio_use_native_aio = FALSE;
149
74
UNIV_INTERN ibool os_aio_print_debug = FALSE;
152
/* Keys to register InnoDB I/O with performance schema */
153
UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
154
UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
155
UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
156
#endif /* UNIV_PFS_IO */
158
/** The asynchronous i/o array slot structure */
76
/* The aio array slot structure */
159
77
typedef struct os_aio_slot_struct os_aio_slot_t;
161
/** The asynchronous i/o array slot structure */
162
79
struct os_aio_slot_struct{
163
ibool is_read; /*!< TRUE if a read operation */
164
ulint pos; /*!< index of the slot in the aio
80
ibool is_read; /* TRUE if a read operation */
81
ulint pos; /* index of the slot in the aio
166
ibool reserved; /*!< TRUE if this slot is reserved */
167
time_t reservation_time;/*!< time when reserved */
168
ulint len; /*!< length of the block to read or
83
ibool reserved; /* TRUE if this slot is reserved */
84
time_t reservation_time;/* time when reserved */
85
ulint len; /* length of the block to read or
170
byte* buf; /*!< buffer used in i/o */
171
ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
172
ulint offset; /*!< 32 low bits of file offset in
87
byte* buf; /* buffer used in i/o */
88
ulint type; /* OS_FILE_READ or OS_FILE_WRITE */
89
ulint offset; /* 32 low bits of file offset in
174
ulint offset_high; /*!< 32 high bits of file offset */
175
os_file_t file; /*!< file where to read or write */
176
const char* name; /*!< file name or path */
177
ibool io_already_done;/*!< used only in simulated aio:
91
ulint offset_high; /* 32 high bits of file offset */
92
os_file_t file; /* file where to read or write */
93
const char* name; /* file name or path */
94
ibool io_already_done;/* used only in simulated aio:
178
95
TRUE if the physical i/o already
179
96
made and only the slot message
180
97
needs to be passed to the caller
181
98
of os_aio_simulated_handle */
182
fil_node_t* message1; /*!< message which is given by the */
183
void* message2; /*!< the requester of an aio operation
99
fil_node_t* message1; /* message which is given by the */
100
void* message2; /* the requester of an aio operation
184
101
and which can be used to identify
185
102
which pending aio operation was
187
104
#ifdef WIN_ASYNC_IO
188
HANDLE handle; /*!< handle object we need in the
105
os_event_t event; /* event object we need in the
189
106
OVERLAPPED struct */
190
OVERLAPPED control; /*!< Windows control block for the
107
OVERLAPPED control; /* Windows control block for the
192
#elif defined(LINUX_NATIVE_AIO)
193
struct iocb control; /* Linux control block for aio */
194
int n_bytes; /* bytes written/read. */
195
int ret; /* AIO return code */
199
/** The asynchronous i/o array structure */
112
/* The aio array structure */
200
113
typedef struct os_aio_array_struct os_aio_array_t;
202
/** The asynchronous i/o array structure */
203
115
struct os_aio_array_struct{
204
os_mutex_t mutex; /*!< the mutex protecting the aio array */
206
/*!< The event which is set to the
207
signaled state when there is space in
208
the aio outside the ibuf segment */
210
/*!< The event which is set to the
211
signaled state when there are no
212
pending i/os in this array */
213
ulint n_slots;/*!< Total number of slots in the aio
214
array. This must be divisible by
217
/*!< Number of segments in the aio
218
array of pending aio requests. A
219
thread can wait separately for any one
221
ulint cur_seg;/*!< We reserve IO requests in round
222
robin fashion to different segments.
223
This points to the segment that is to
224
be used to service next IO request. */
226
/*!< Number of reserved slots in the
227
aio array outside the ibuf segment */
228
os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
116
os_mutex_t mutex; /* the mutex protecting the aio array */
117
os_event_t not_full; /* The event which is set to the signaled
118
state when there is space in the aio
119
outside the ibuf segment */
120
os_event_t is_empty; /* The event which is set to the signaled
121
state when there are no pending i/os
123
ulint n_slots; /* Total number of slots in the aio array.
124
This must be divisible by n_threads. */
125
ulint n_segments;/* Number of segments in the aio array of
126
pending aio requests. A thread can wait
127
separately for any one of the segments. */
128
ulint n_reserved;/* Number of reserved slots in the
129
aio array outside the ibuf segment */
130
os_aio_slot_t* slots; /* Pointer to the slots in the array */
231
/*!< Pointer to an array of OS native
232
event handles where we copied the
233
handles from slots, in the same
234
order. This can be used in
235
WaitForMultipleObjects; used only in
239
#if defined(LINUX_NATIVE_AIO)
240
io_context_t* aio_ctx;
241
/* completion queue for IO. There is
242
one such queue per segment. Each thread
243
will work on one ctx exclusively. */
244
struct io_event* aio_events;
245
/* The array to collect completed IOs.
246
There is one such event for each
247
possible pending IO. The size of the
248
array is equal to n_slots. */
132
os_native_event_t* native_events;
133
/* Pointer to an array of OS native event
134
handles where we copied the handles from
135
slots, in the same order. This can be used
136
in WaitForMultipleObjects; used only in
252
#if defined(LINUX_NATIVE_AIO)
253
/** timeout for each io_getevents() call = 500ms. */
254
#define OS_AIO_REAP_TIMEOUT (500000000UL)
256
/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
257
#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
259
/** number of attempts before giving up on io_setup(). */
260
#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
263
/** Array of events used in simulated aio */
141
/* Array of events used in simulated aio */
264
142
static os_event_t* os_aio_segment_wait_events = NULL;
266
/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
267
are NULL when the module has not yet been initialized. @{ */
268
static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */
269
static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */
270
static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */
271
static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */
272
static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */
144
/* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
145
are NULL when the module has not yet been initialized. */
146
static os_aio_array_t* os_aio_read_array = NULL;
147
static os_aio_array_t* os_aio_write_array = NULL;
148
static os_aio_array_t* os_aio_ibuf_array = NULL;
149
static os_aio_array_t* os_aio_log_array = NULL;
150
static os_aio_array_t* os_aio_sync_array = NULL;
275
/** Number of asynchronous I/O segments. Set by os_aio_init(). */
276
152
static ulint os_aio_n_segments = ULINT_UNDEFINED;
278
/** If the following is TRUE, read i/o handler threads try to
154
/* If the following is TRUE, read i/o handler threads try to
279
155
wait until a batch of new read requests have been posted */
280
156
static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
281
#endif /* !UNIV_HOTBACKUP */
283
158
UNIV_INTERN ulint os_n_file_reads = 0;
284
159
UNIV_INTERN ulint os_bytes_read_since_printout = 0;
687
os_file_count_mutex = os_mutex_create();
509
os_file_count_mutex = os_mutex_create(NULL);
689
511
for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
690
os_file_seek_mutexes[i] = os_mutex_create();
512
os_file_seek_mutexes[i] = os_mutex_create(NULL);
694
/***********************************************************************//**
516
/***************************************************************************
695
517
Creates a temporary file. This function is like tmpfile(3), but
696
518
the temporary file is created in the MySQL temporary directory.
697
@return temporary file handle, or NULL on error */
519
On Netware, this function is like tmpfile(3), because the C run-time
520
library of Netware does not expose the delete-on-close flag. */
700
523
os_file_create_tmpfile(void)
701
524
/*========================*/
525
/* out: temporary file handle, or NULL on error */
527
#ifdef UNIV_HOTBACKUP
533
FILE* file = tmpfile();
534
# else /* __NETWARE__ */
703
535
FILE* file = NULL;
704
536
int fd = innobase_mysql_tmpfile();
707
539
file = fdopen(fd, "w+b");
541
# endif /* __NETWARE__ */
711
544
ut_print_timestamp(stderr);
713
546
" InnoDB: Error: unable to create temporary file;"
714
547
" errno: %d\n", errno);
552
# endif /* !__NETWARE__ */
556
#endif /* UNIV_HOTBACKUP */
722
#endif /* !UNIV_HOTBACKUP */
724
/***********************************************************************//**
559
/***************************************************************************
725
560
The os_file_opendir() function opens a directory stream corresponding to the
726
561
directory named by the dirname argument. The directory stream is positioned
727
562
at the first entry. In both Unix and Windows we automatically skip the '.'
728
and '..' items at the start of the directory listing.
729
@return directory stream, NULL if error */
563
and '..' items at the start of the directory listing. */
734
const char* dirname, /*!< in: directory name; it must not
568
/* out: directory stream, NULL if
570
const char* dirname, /* in: directory name; it must not
735
571
contain a trailing '\' or '/' */
736
ibool error_is_fatal) /*!< in: TRUE if we should treat an
572
ibool error_is_fatal) /* in: TRUE if we should treat an
737
573
error as a fatal error; if we try to
738
574
open symlinks then we do not wish a
739
575
fatal error if it happens not to be
3082
2856
return(success);
3085
#ifndef UNIV_HOTBACKUP
3086
/****************************************************************//**
3087
Returns a pointer to the nth slot in the aio array.
3088
@return pointer to slot */
2859
/********************************************************************
2860
Returns a pointer to the nth slot in the aio array. */
3091
2863
os_aio_array_get_nth_slot(
3092
2864
/*======================*/
3093
os_aio_array_t* array, /*!< in: aio array */
3094
ulint index) /*!< in: index of the slot */
2865
/* out: pointer to slot */
2866
os_aio_array_t* array, /* in: aio array */
2867
ulint index) /* in: index of the slot */
3096
2869
ut_a(index < array->n_slots);
3098
2871
return((array->slots) + index);
3101
#if defined(LINUX_NATIVE_AIO)
3102
/******************************************************************//**
3103
Creates an io_context for native linux AIO.
3104
@return TRUE on success. */
3107
os_aio_linux_create_io_ctx(
3108
/*=======================*/
3109
ulint max_events, /*!< in: number of events. */
3110
io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
3116
memset(io_ctx, 0x0, sizeof(*io_ctx));
3118
/* Initialize the io_ctx. Tell it how many pending
3119
IO requests this context will handle. */
3121
ret = io_setup(max_events, io_ctx);
3123
#if defined(UNIV_AIO_DEBUG)
3125
"InnoDB: Linux native AIO:"
3126
" initialized io_ctx for segment\n");
3128
/* Success. Return now. */
3132
/* If we hit EAGAIN we'll make a few attempts before failing. */
3137
/* First time around. */
3138
ut_print_timestamp(stderr);
3140
" InnoDB: Warning: io_setup() failed"
3141
" with EAGAIN. Will make %d attempts"
3142
" before giving up.\n",
3143
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3146
if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3149
"InnoDB: Warning: io_setup() attempt"
3152
os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3156
/* Have tried enough. Better call it a day. */
3157
ut_print_timestamp(stderr);
3159
" InnoDB: Error: io_setup() failed"
3160
" with EAGAIN after %d attempts.\n",
3161
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3165
ut_print_timestamp(stderr);
3167
" InnoDB: Error: Linux Native AIO interface"
3168
" is not supported on this platform. Please"
3169
" check your OS documentation and install"
3170
" appropriate binary of InnoDB.\n");
3175
ut_print_timestamp(stderr);
3177
" InnoDB: Error: Linux Native AIO setup"
3178
" returned following error[%d]\n", -ret);
3183
"InnoDB: You can disable Linux Native AIO by"
3184
" setting innodb_native_aio = off in my.cnf\n");
3187
#endif /* LINUX_NATIVE_AIO */
3189
/******************************************************************//**
3190
Creates an aio wait array. Note that we return NULL in case of failure.
3191
We don't care about freeing memory here because we assume that a
3192
failure will result in server refusing to start up.
3193
@return own: aio array, NULL on failure */
2874
/****************************************************************************
2875
Creates an aio wait array. */
3195
2877
os_aio_array_t*
3196
2878
os_aio_array_create(
3197
2879
/*================*/
3198
ulint n, /*!< in: maximum number of pending aio
3199
operations allowed; n must be
3200
divisible by n_segments */
3201
ulint n_segments) /*!< in: number of segments in the aio array */
2880
/* out, own: aio array */
2881
ulint n, /* in: maximum number of pending aio operations
2882
allowed; n must be divisible by n_segments */
2883
ulint n_segments) /* in: number of segments in the aio array */
3203
2885
os_aio_array_t* array;
3205
2887
os_aio_slot_t* slot;
3206
2888
#ifdef WIN_ASYNC_IO
3207
2889
OVERLAPPED* over;
3208
#elif defined(LINUX_NATIVE_AIO)
3209
struct io_event* io_event = NULL;
3212
2892
ut_a(n_segments > 0);
3214
2894
array = ut_malloc(sizeof(os_aio_array_t));
3216
array->mutex = os_mutex_create();
2896
array->mutex = os_mutex_create(NULL);
3217
2897
array->not_full = os_event_create(NULL);
3218
2898
array->is_empty = os_event_create(NULL);
3222
2902
array->n_slots = n;
3223
2903
array->n_segments = n_segments;
3224
2904
array->n_reserved = 0;
3226
2905
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
3228
array->handles = ut_malloc(n * sizeof(HANDLE));
2907
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
3231
#if defined(LINUX_NATIVE_AIO)
3232
array->aio_ctx = NULL;
3233
array->aio_events = NULL;
3235
/* If we are not using native aio interface then skip this
3236
part of initialization. */
3237
if (!srv_use_native_aio) {
3238
goto skip_native_aio;
3241
/* Initialize the io_context array. One io_context
3242
per segment in the array. */
3244
array->aio_ctx = ut_malloc(n_segments *
3245
sizeof(*array->aio_ctx));
3246
for (i = 0; i < n_segments; ++i) {
3247
if (!os_aio_linux_create_io_ctx(n/n_segments,
3248
&array->aio_ctx[i])) {
3249
/* If something bad happened during aio setup
3250
we should call it a day and return right away.
3251
We don't care about any leaks because a failure
3252
to initialize the io subsystem means that the
3253
server (or at least the InnoDB storage engine)
3254
is not going to start up. */
3259
/* Initialize the event array. One event per slot. */
3260
io_event = ut_malloc(n * sizeof(*io_event));
3261
memset(io_event, 0x0, sizeof(*io_event) * n);
3262
array->aio_events = io_event;
3265
#endif /* LINUX_NATIVE_AIO */
3266
2909
for (i = 0; i < n; i++) {
3267
2910
slot = os_aio_array_get_nth_slot(array, i);
3270
2913
slot->reserved = FALSE;
3271
2914
#ifdef WIN_ASYNC_IO
3272
slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
2915
slot->event = os_event_create(NULL);
3274
2917
over = &(slot->control);
3276
over->hEvent = slot->handle;
3278
*((array->handles) + i) = over->hEvent;
3280
#elif defined(LINUX_NATIVE_AIO)
3282
memset(&slot->control, 0x0, sizeof(slot->control));
2919
over->hEvent = slot->event->handle;
2921
*((array->native_events) + i) = over->hEvent;
3291
/************************************************************************//**
3292
Frees an aio wait array. */
2928
/****************************************************************************
2929
Initializes the asynchronous io system. Calls also os_io_init_simple.
2930
Creates a separate aio array for
2931
non-ibuf read and write, a third aio array for the ibuf i/o, with just one
2932
segment, two aio arrays for log reads and writes with one segment, and a
2933
synchronous aio array of the specified size. The combined number of segments
2934
in the three first aio arrays is the parameter n_segments given to the
2935
function. The caller must create an i/o handler thread for each segment in
2936
the four first arrays, but not for the sync aio array. */
3297
os_aio_array_t* array) /*!< in, own: array to free */
3302
for (i = 0; i < array->n_slots; i++) {
3303
os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3304
CloseHandle(slot->handle);
3306
#endif /* WIN_ASYNC_IO */
3309
ut_free(array->handles);
3310
#endif /* __WIN__ */
3311
os_mutex_free(array->mutex);
3312
os_event_free(array->not_full);
3313
os_event_free(array->is_empty);
3315
#if defined(LINUX_NATIVE_AIO)
3316
if (srv_use_native_aio) {
3317
ut_free(array->aio_events);
3318
ut_free(array->aio_ctx);
3320
#endif /* LINUX_NATIVE_AIO */
3322
ut_free(array->slots);
3326
/***********************************************************************
3327
Initializes the asynchronous io system. Creates one array each for ibuf
3328
and log i/o. Also creates one array each for read and write where each
3329
array is divided logically into n_read_segs and n_write_segs
3330
respectively. The caller must create an i/o handler thread for each
3331
segment in these arrays. This function also creates the sync array.
3332
No i/o handler thread needs to be created for that */
3337
ulint n_per_seg, /*!< in: maximum number of pending aio
3338
operations allowed per segment */
3339
ulint n_read_segs, /*!< in: number of reader threads */
3340
ulint n_write_segs, /*!< in: number of writer threads */
3341
ulint n_slots_sync) /*!< in: number of slots in the sync aio
2941
ulint n, /* in: maximum number of pending aio operations
2942
allowed; n must be divisible by n_segments */
2943
ulint n_segments, /* in: combined number of segments in the four
2944
first aio arrays; must be >= 4 */
2945
ulint n_slots_sync) /* in: number of slots in the sync aio array */
3345
ulint n_segments = 2 + n_read_segs + n_write_segs;
2952
ut_ad(n % n_segments == 0);
3347
2953
ut_ad(n_segments >= 4);
3349
2955
os_io_init_simple();
3585
3130
return(segment);
3588
/*******************************************************************//**
3133
/***********************************************************************
3589
3134
Requests for a slot in the aio array. If no slot is available, waits until
3590
not_full-event becomes signaled.
3591
@return pointer to slot */
3135
not_full-event becomes signaled. */
3594
3138
os_aio_array_reserve_slot(
3595
3139
/*======================*/
3596
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3597
os_aio_array_t* array, /*!< in: aio array */
3598
fil_node_t* message1,/*!< in: message to be passed along with
3599
the aio operation */
3600
void* message2,/*!< in: message to be passed along with
3601
the aio operation */
3602
os_file_t file, /*!< in: file handle */
3603
const char* name, /*!< in: name of the file or path as a
3140
/* out: pointer to slot */
3141
ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
3142
os_aio_array_t* array, /* in: aio array */
3143
fil_node_t* message1,/* in: message to be passed along with
3144
the aio operation */
3145
void* message2,/* in: message to be passed along with
3146
the aio operation */
3147
os_file_t file, /* in: file handle */
3148
const char* name, /* in: name of the file or path as a
3604
3149
null-terminated string */
3605
void* buf, /*!< in: buffer where to read or from which
3150
void* buf, /* in: buffer where to read or from which
3607
ulint offset, /*!< in: least significant 32 bits of file
3609
ulint offset_high, /*!< in: most significant 32 bits of
3611
ulint len) /*!< in: length of the block to read or write */
3152
ulint offset, /* in: least significant 32 bits of file
3154
ulint offset_high, /* in: most significant 32 bits of
3156
ulint len) /* in: length of the block to read or write */
3613
os_aio_slot_t* slot = NULL;
3158
os_aio_slot_t* slot;
3614
3159
#ifdef WIN_ASYNC_IO
3615
3160
OVERLAPPED* control;
3617
#elif defined(LINUX_NATIVE_AIO)
3625
ulint slots_per_seg;
3628
/* No need of a mutex. Only reading constant fields */
3629
slots_per_seg = array->n_slots / array->n_segments;
3631
/* We attempt to keep adjacent blocks in the same local
3632
segment. This can help in merging IO requests when we are
3633
doing simulated AIO */
3634
local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
3635
% array->n_segments;
3638
3164
os_mutex_enter(array->mutex);
3640
3166
if (array->n_reserved == array->n_slots) {
3641
3167
os_mutex_exit(array->mutex);
3643
if (!srv_use_native_aio) {
3169
if (!os_aio_use_native_aio) {
3644
3170
/* If the handler threads are suspended, wake them
3645
3171
so that we get more slots */
3889
3342
os_event_reset(os_aio_segment_wait_events[g]);
3892
#endif /* __WIN__ */
3895
#if defined(LINUX_NATIVE_AIO)
3896
/*******************************************************************//**
3897
Dispatch an AIO request to the kernel.
3898
@return TRUE on success. */
3901
os_aio_linux_dispatch(
3902
/*==================*/
3903
os_aio_array_t* array, /*!< in: io request array. */
3904
os_aio_slot_t* slot) /*!< in: an already reserved slot. */
3910
ut_ad(slot != NULL);
3913
ut_a(slot->reserved);
3915
/* Find out what we are going to work with.
3916
The iocb struct is directly in the slot.
3917
The io_context is one per segment. */
3919
iocb = &slot->control;
3920
io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
3922
ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
3924
#if defined(UNIV_AIO_DEBUG)
3926
"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
3927
(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
3928
array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
3931
/* io_submit returns number of successfully
3932
queued requests or -errno. */
3933
if (UNIV_UNLIKELY(ret != 1)) {
3940
#endif /* LINUX_NATIVE_AIO */
3943
/*******************************************************************//**
3944
NOTE! Use the corresponding macro os_aio(), not directly this function!
3945
Requests an asynchronous i/o operation.
3946
@return TRUE if request was queued successfully, FALSE if fail */
3347
/***********************************************************************
3348
Requests an asynchronous i/o operation. */
3951
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3952
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
3353
/* out: TRUE if request was queued
3354
successfully, FALSE if fail */
3355
ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
3356
ulint mode, /* in: OS_AIO_NORMAL, ..., possibly ORed
3953
3357
to OS_AIO_SIMULATED_WAKE_LATER: the
3954
3358
last flag advises this function not to wake
3955
3359
i/o-handler threads, but the caller will
4248
3624
#ifdef UNIV_DO_FLUSH
4249
3625
if (slot->type == OS_FILE_WRITE
4250
3626
&& !os_do_not_call_flush_at_each_write) {
4251
if (!os_file_flush(slot->file)) {
3627
ut_a(TRUE == os_file_flush(slot->file));
4255
3629
#endif /* UNIV_DO_FLUSH */
4256
} else if (os_file_handle_error(slot->name, "Windows aio")) {
3631
os_file_handle_error(slot->name, "Windows aio");
4261
3633
ret_val = FALSE;
4264
3636
os_mutex_exit(array->mutex);
4267
/* retry failed read/write operation synchronously.
4268
No need to hold array->mutex. */
4271
/* This read/write does not go through os_file_read
4272
and os_file_write APIs, need to register with
4273
performance schema explicitly here. */
4274
struct PSI_file_locker* locker = NULL;
4275
register_pfs_file_io_begin(locker, slot->file, slot->len,
4276
(slot->type == OS_FILE_WRITE)
4279
__FILE__, __LINE__);
4282
switch (slot->type) {
4284
ret = WriteFile(slot->file, slot->buf,
4290
ret = ReadFile(slot->file, slot->buf,
4300
register_pfs_file_io_end(locker, len);
4303
if (!ret && GetLastError() == ERROR_IO_PENDING) {
4304
/* aio was queued successfully!
4305
We want a synchronous i/o operation on a
4306
file where we also use async i/o: in Windows
4307
we must use the same wait mechanism as for
4310
ret = GetOverlappedResult(slot->file,
4315
ret_val = ret && len == slot->len;
4318
3638
os_aio_array_free_slot(array, slot);
4320
3640
return(ret_val);
4324
#if defined(LINUX_NATIVE_AIO)
4325
/******************************************************************//**
4326
This function is only used in Linux native asynchronous i/o. This is
4327
called from within the io-thread. If there are no completed IO requests
4328
in the slot array, the thread calls this function to collect more
4329
requests from the kernel.
4330
The io-thread waits on io_getevents(), which is a blocking call, with
4331
a timeout value. Unless the system is very heavily loaded, keeping the
4332
io-thread very busy, the io-thread will spend most of its time waiting
4334
The io-thread also exits in this function. It checks server status at
4335
each wakeup and that is why we use timed wait in io_getevents(). */
4338
os_aio_linux_collect(
4339
/*=================*/
4340
os_aio_array_t* array, /*!< in/out: slot array. */
4341
ulint segment, /*!< in: local segment no. */
4342
ulint seg_size) /*!< in: segment size. */
4348
struct timespec timeout;
4349
struct io_event* events;
4350
struct io_context* io_ctx;
4352
/* sanity checks. */
4353
ut_ad(array != NULL);
4354
ut_ad(seg_size > 0);
4355
ut_ad(segment < array->n_segments);
4357
/* Which part of event array we are going to work on. */
4358
events = &array->aio_events[segment * seg_size];
4360
/* Which io_context we are going to use. */
4361
io_ctx = array->aio_ctx[segment];
4363
/* Starting point of the segment we will be working on. */
4364
start_pos = segment * seg_size;
4367
end_pos = start_pos + seg_size;
4371
/* Go down if we are in shutdown mode.
4372
In case of srv_fast_shutdown == 2, there may be pending
4373
IO requests but that should be OK as we essentially treat
4374
that as a crash of InnoDB. */
4375
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
4376
os_thread_exit(NULL);
4379
/* Initialize the events. The timeout value is arbitrary.
4380
We probably need to experiment with it a little. */
4381
memset(events, 0, sizeof(*events) * seg_size);
4383
timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4385
ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4387
/* This error handling is for any error in collecting the
4388
IO requests. The errors, if any, for any particular IO
4389
request are simply passed on to the calling routine. */
4391
/* Not enough resources! Try again. */
4392
if (ret == -EAGAIN) {
4396
/* Interrupted! I have tested the behaviour in case of an
4397
interrupt. If we have some completed IOs available then
4398
the return code will be the number of IOs. We get EINTR only
4399
if there are no completed IOs and we have been interrupted. */
4400
if (ret == -EINTR) {
4404
/* No pending request! Go back and check again. */
4409
/* All other errors! should cause a trap for now. */
4410
if (UNIV_UNLIKELY(ret < 0)) {
4411
ut_print_timestamp(stderr);
4413
" InnoDB: unexpected ret_code[%d] from"
4414
" io_getevents()!\n", ret);
4420
for (i = 0; i < ret; i++) {
4421
os_aio_slot_t* slot;
4422
struct iocb* control;
4424
control = (struct iocb *)events[i].obj;
4425
ut_a(control != NULL);
4427
slot = (os_aio_slot_t *) control->data;
4429
/* Some sanity checks. */
4431
ut_a(slot->reserved);
4433
#if defined(UNIV_AIO_DEBUG)
4435
"io_getevents[%c]: slot[%p] ctx[%p]"
4437
(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4438
slot, io_ctx, segment);
4441
/* We are not scribbling previous segment. */
4442
ut_a(slot->pos >= start_pos);
4444
/* We have not overstepped to next segment. */
4445
ut_a(slot->pos < end_pos);
4447
/* Mark this request as completed. The error handling
4448
will be done in the calling function. */
4449
os_mutex_enter(array->mutex);
4450
slot->n_bytes = events[i].res;
4451
slot->ret = events[i].res2;
4452
slot->io_already_done = TRUE;
4453
os_mutex_exit(array->mutex);
4459
/**********************************************************************//**
4460
This function is only used in Linux native asynchronous i/o.
4461
Waits for an aio operation to complete. This function is used to wait for
4462
the completed requests. The aio array of pending requests is divided
4463
into segments. The thread specifies which segment or slot it wants to wait
4464
for. NOTE: this function will also take care of freeing the aio slot,
4465
therefore no other thread is allowed to do the freeing!
4466
@return TRUE if the IO was successful */
4469
os_aio_linux_handle(
4470
/*================*/
4471
ulint global_seg, /*!< in: segment number in the aio array
4472
to wait for; segment 0 is the ibuf
4473
i/o thread, segment 1 is log i/o thread,
4474
then follow the non-ibuf read threads,
4475
and the last are the non-ibuf write
4477
fil_node_t**message1, /*!< out: the messages passed with the */
4478
void** message2, /*!< aio request; note that in case the
4479
aio operation failed, these output
4480
parameters are valid and can be used to
4481
restart the operation. */
4482
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4485
os_aio_array_t* array;
4486
os_aio_slot_t* slot;
4491
/* Should never be doing Sync IO here. */
4492
ut_a(global_seg != ULINT_UNDEFINED);
4494
/* Find the array and the local segment. */
4495
segment = os_aio_get_array_and_local_segment(&array, global_seg);
4496
n = array->n_slots / array->n_segments;
4498
/* Loop until we have found a completed request. */
4500
os_mutex_enter(array->mutex);
4501
for (i = 0; i < n; ++i) {
4502
slot = os_aio_array_get_nth_slot(
4503
array, i + segment * n);
4504
if (slot->reserved && slot->io_already_done) {
4505
/* Something for us to work on. */
4510
os_mutex_exit(array->mutex);
4512
/* We don't have any completed request.
4513
Wait for some request. Note that we return
4514
from wait iff we have found a request. */
4516
srv_set_io_thread_op_info(global_seg,
4517
"waiting for completed aio requests");
4518
os_aio_linux_collect(array, segment, n);
4522
/* Note that it may be that there is more than one completed
4523
IO requests. We process them one at a time. We may have a case
4524
here to improve the performance slightly by dealing with all
4525
requests in one sweep. */
4526
srv_set_io_thread_op_info(global_seg,
4527
"processing completed aio requests");
4529
/* Ensure that we are scribbling only our segment. */
4532
ut_ad(slot != NULL);
4533
ut_ad(slot->reserved);
4534
ut_ad(slot->io_already_done);
4536
*message1 = slot->message1;
4537
*message2 = slot->message2;
4541
if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
4544
#ifdef UNIV_DO_FLUSH
4545
if (slot->type == OS_FILE_WRITE
4546
&& !os_do_not_call_flush_at_each_write)
4547
&& !os_file_flush(slot->file) {
4550
#endif /* UNIV_DO_FLUSH */
4554
/* os_file_handle_error does tell us if we should retry
4555
this IO. As it stands now, we don't do this retry when
4556
reaping requests from a different context than
4557
the dispatcher. This non-retry logic is the same for
4558
windows and linux native AIO.
4559
We should probably look into this to transparently
4560
re-submit the IO. */
4561
os_file_handle_error(slot->name, "Linux aio");
4566
os_mutex_exit(array->mutex);
4568
os_aio_array_free_slot(array, slot);
4572
#endif /* LINUX_NATIVE_AIO */
4574
/**********************************************************************//**
3644
/**************************************************************************
4575
3645
Does simulated aio. This function should be called by an i/o-handler
4577
@return TRUE if the aio operation succeeded */
4580
3649
os_aio_simulated_handle(
4581
3650
/*====================*/
4582
ulint global_segment, /*!< in: the number of the segment in the aio
3651
/* out: TRUE if the aio operation succeeded */
3652
ulint global_segment, /* in: the number of the segment in the aio
4583
3653
arrays to wait for; segment 0 is the ibuf
4584
3654
i/o thread, segment 1 the log i/o thread,
4585
3655
then follow the non-ibuf read threads, and as
4586
3656
the last are the non-ibuf write threads */
4587
fil_node_t**message1, /*!< out: the messages passed with the aio
3657
fil_node_t**message1, /* out: the messages passed with the aio
4588
3658
request; note that also in the case where
4589
3659
the aio operation failed, these output
4590
3660
parameters are valid and can be used to
4591
3661
restart the operation, for example */
4592
3662
void** message2,
4593
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
3663
ulint* type) /* out: OS_FILE_WRITE or ..._READ */
4595
3665
os_aio_array_t* array;