/* In simulated aio, merge at most this many consecutive i/os */
#define OS_AIO_MERGE_N_CONSECUTIVE	64
/**********************************************************************

InnoDB AIO Implementation:
=========================
We support native AIO for Windows and Linux. For the rest of the
platforms we simulate AIO with special io-threads servicing the
IO-requests.

Simulated AIO:
==============
On platforms where we 'simulate' AIO, the following is a rough
explanation of the high level design.
There are four io-threads (for ibuf, log, read, write).
All synchronous IO requests are serviced by the calling thread using
os_file_write/os_file_read. The asynchronous requests are queued up
in an array (there are four such arrays) by the calling thread.
Later these requests are picked up by the io-thread and are serviced
synchronously.

Windows native AIO:
===================
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a
file is opened for AIO, no synchronous IO can be done on it. Therefore
we have an extra fifth array to queue up synchronous IO requests.
There are innodb_file_io_threads helper threads. These threads work
on the four arrays mentioned above in Simulated AIO. No thread is
required for the sync array.
If a synchronous IO request is made, it is first queued in the sync
array. Then the calling thread itself waits on the request, thus
making the call synchronous.
If an AIO request is made, the calling thread not only queues it in
the array but also submits the request. The helper thread then
collects the completed IO request and calls the completion routine
on it.

Linux native AIO:
=================
If we have libaio installed on the system and innodb_use_native_aio
is set to TRUE we follow the code path of native AIO, otherwise we
do simulated AIO.
There are innodb_file_io_threads helper threads. These threads work
on the four arrays mentioned above in Simulated AIO.
If a synchronous IO request is made, it is handled by calling
os_file_write/os_file_read.
If an AIO request is made, the calling thread not only queues it in
the array but also submits the request. The helper thread then
collects the completed IO request and calls the completion routine
on it.

**********************************************************************/
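
/* The Linux native AIO path described above reduces to three libaio
calls: io_setup() to create a completion queue, io_submit() to queue a
request, and io_getevents() to reap completions. Below is a minimal,
self-contained sketch of that lifecycle (illustrative only: the file
name and sizes are made up, error handling is reduced to early
returns, and real InnoDB i/o would use O_DIRECT with aligned buffers).
Compiled out so it cannot affect the build: */
#if 0
#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void
aio_lifecycle_sketch(void)
{
	io_context_t	ctx;
	struct iocb	cb;
	struct iocb*	cbs[1];
	struct io_event	ev;
	char		buf[512];
	int		fd;

	memset(&ctx, 0, sizeof(ctx));
	if (io_setup(8, &ctx) != 0) {		/* create completion queue */
		return;
	}

	fd = open("/tmp/aio_demo", O_RDONLY);	/* hypothetical file */
	if (fd < 0) {
		io_destroy(ctx);
		return;
	}

	io_prep_pread(&cb, fd, buf, sizeof(buf), 0);	/* fill the iocb */
	cbs[0] = &cb;

	if (io_submit(ctx, 1, cbs) == 1) {	/* queue the request */
		/* reap the completion; InnoDB's io-handler threads do
		this with a 500ms timeout (OS_AIO_REAP_TIMEOUT) */
		if (io_getevents(ctx, 1, 1, &ev, NULL) == 1) {
			fprintf(stderr, "read %ld bytes\n", (long) ev.res);
		}
	}

	close(fd);
	io_destroy(ctx);
}
#endif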
/** Flag: enable debug printout for asynchronous i/o */
UNIV_INTERN ibool	os_aio_print_debug	= FALSE;

#ifdef UNIV_PFS_IO
/* Keys to register InnoDB I/O with performance schema */
UNIV_INTERN mysql_pfs_key_t  innodb_file_data_key;
UNIV_INTERN mysql_pfs_key_t  innodb_file_log_key;
UNIV_INTERN mysql_pfs_key_t  innodb_file_temp_key;
#endif /* UNIV_PFS_IO */
/** The asynchronous i/o array slot structure */
typedef struct os_aio_slot_struct	os_aio_slot_t;

/** The asynchronous i/o array slot structure */
struct os_aio_slot_struct{
	ibool		is_read;	/*!< TRUE if a read operation */
	ulint		pos;		/*!< index of the slot in the aio
					array */
	ibool		reserved;	/*!< TRUE if this slot is reserved */
	time_t		reservation_time;/*!< time when reserved */
	ulint		len;		/*!< length of the block to read or
					write */
	byte*		buf;		/*!< buffer used in i/o */
	ulint		type;		/*!< OS_FILE_READ or OS_FILE_WRITE */
	ulint		offset;		/*!< 32 low bits of file offset in
					bytes */
	ulint		offset_high;	/*!< 32 high bits of file offset */
	os_file_t	file;		/*!< file where to read or write */
	const char*	name;		/*!< file name or path */
	ibool		io_already_done;/*!< used only in simulated aio:
					TRUE if the physical i/o is already
					done and only the slot message
					needs to be passed to the caller
					of os_aio_simulated_handle */
	fil_node_t*	message1;	/*!< message which is given by the */
	void*		message2;	/*!< the requester of an aio operation
					and which can be used to identify
					which pending aio operation was
					completed */
#ifdef WIN_ASYNC_IO
	HANDLE		handle;		/*!< handle object we need in the
					OVERLAPPED struct */
	OVERLAPPED	control;	/*!< Windows control block for the
					aio request */
#elif defined(LINUX_NATIVE_AIO)
	struct iocb	control;	/* Linux control block for aio */
	int		n_bytes;	/* bytes written/read. */
	int		ret;		/* AIO return code */
#endif /* WIN_ASYNC_IO */
};
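
/* The 64-bit file offset is carried as two 32-bit halves (offset and
offset_high) because ulint is only 32 bits wide on 32-bit platforms.
A minimal sketch of the split and the recombination (helper and type
names are illustrative, not part of InnoDB), compiled out: */
#if 0
#include <assert.h>

static void
offset_split_sketch(void)
{
	unsigned long long	off = 0x123456789ULL;	/* arbitrary */
	unsigned long		offset = (unsigned long) (off & 0xFFFFFFFFUL);
	unsigned long		offset_high = (unsigned long) (off >> 32);

	/* recombining the halves restores the original offset */
	assert((((unsigned long long) offset_high << 32) | offset) == off);
}
#endif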
/** The asynchronous i/o array structure */
typedef struct os_aio_array_struct	os_aio_array_t;

/** The asynchronous i/o array structure */
struct os_aio_array_struct{
	os_mutex_t	mutex;	/*!< the mutex protecting the aio array */
	os_event_t	not_full;
				/*!< The event which is set to the
				signaled state when there is space in
				the aio outside the ibuf segment */
	os_event_t	is_empty;
				/*!< The event which is set to the
				signaled state when there are no
				pending i/os in this array */
	ulint		n_slots;/*!< Total number of slots in the aio
				array. This must be divisible by
				n_threads. */
	ulint		n_segments;
				/*!< Number of segments in the aio
				array of pending aio requests. A
				thread can wait separately for any one
				of the segments. */
	ulint		cur_seg;/*!< We reserve IO requests in round
				robin fashion to different segments.
				This points to the segment that is to
				be used to service next IO request. */
	ulint		n_reserved;
				/*!< Number of reserved slots in the
				aio array outside the ibuf segment */
	os_aio_slot_t*	slots;	/*!< Pointer to the slots in the array */
#ifdef __WIN__
	HANDLE*		handles;
				/*!< Pointer to an array of OS native
				event handles where we copied the
				handles from slots, in the same
				order. This can be used in
				WaitForMultipleObjects; used only in
				Windows */
#endif /* __WIN__ */
#if defined(LINUX_NATIVE_AIO)
	io_context_t*		aio_ctx;
				/* completion queue for IO. There is
				one such queue per segment. Each thread
				will work on one ctx exclusively. */
	struct io_event*	aio_events;
				/* The array to collect completed IOs.
				There is one such event for each
				possible pending IO. The size of the
				array is equal to n_slots. */
#endif /* LINUX_NATIVE_AIO */
};
#if defined(LINUX_NATIVE_AIO)
/** timeout for each io_getevents() call = 500ms. */
#define OS_AIO_REAP_TIMEOUT	(500000000UL)

/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
#define OS_AIO_IO_SETUP_RETRY_SLEEP	(500000UL)

/** number of attempts before giving up on io_setup(). */
#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS	5
#endif /* LINUX_NATIVE_AIO */
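
/* OS_AIO_REAP_TIMEOUT is expressed in nanoseconds: 500000000 ns is
500 ms. A minimal sketch (illustrative only, compiled out) of how such
a constant plugs into the timespec handed to io_getevents(): */
#if 0
#include <time.h>

static struct timespec
reap_timeout_sketch(void)
{
	struct timespec	t;

	t.tv_sec = 0;			/* whole seconds */
	t.tv_nsec = 500000000UL;	/* 500000000 ns == 500 ms */

	return(t);
}
#endif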
/** Array of events used in simulated aio */
static os_event_t*	os_aio_segment_wait_events = NULL;
/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
are NULL when the module has not yet been initialized. @{ */
static os_aio_array_t*	os_aio_read_array	= NULL;	/*!< Reads */
static os_aio_array_t*	os_aio_write_array	= NULL;	/*!< Writes */
static os_aio_array_t*	os_aio_ibuf_array	= NULL;	/*!< Insert buffer */
static os_aio_array_t*	os_aio_log_array	= NULL;	/*!< Redo log */
static os_aio_array_t*	os_aio_sync_array	= NULL;	/*!< Synchronous I/O */
/* @} */
/** Number of asynchronous I/O segments.  Set by os_aio_init(). */
static ulint	os_aio_n_segments	= ULINT_UNDEFINED;

/** If the following is TRUE, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
#endif /* !UNIV_HOTBACKUP */

UNIV_INTERN ulint	os_n_file_reads		= 0;
UNIV_INTERN ulint	os_bytes_read_since_printout = 0;
	os_file_count_mutex = os_mutex_create();

	for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
		os_file_seek_mutexes[i] = os_mutex_create();
	}
}
#ifndef UNIV_HOTBACKUP
/***********************************************************************//**
Creates a temporary file. This function is like tmpfile(3), but
the temporary file is created in the MySQL temporary directory.
@return	temporary file handle, or NULL on error */
UNIV_INTERN
FILE*
os_file_create_tmpfile(void)
/*========================*/
{
	FILE*	file	= NULL;
	int	fd	= innobase_mysql_tmpfile();

	if (fd >= 0) {
		file = fdopen(fd, "w+b");
	}

	if (!file) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: unable to create temporary file;"
			" errno: %d\n", errno);
		if (fd >= 0) {
			close(fd);
		}
	}

	return(file);
}
#endif /* !UNIV_HOTBACKUP */
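
/* Callers use the returned stream like any stdio temporary file;
closing it deletes it. A minimal usage sketch (illustrative only,
compiled out): */
#if 0
static void
tmpfile_usage_sketch(void)
{
	FILE*	f = os_file_create_tmpfile();

	if (f != NULL) {
		fputs("scratch data\n", f);
		rewind(f);
		/* ... read the data back ... */
		fclose(f);	/* unlinks the temporary file */
	}
}
#endif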
/***********************************************************************//**
The os_file_opendir() function opens a directory stream corresponding to the
directory named by the dirname argument. The directory stream is positioned
at the first entry. In both Unix and Windows we automatically skip the '.'
and '..' items at the start of the directory listing.
@return	directory stream, NULL if error */
UNIV_INTERN
os_file_dir_t
os_file_opendir(
/*============*/
	const char*	dirname,	/*!< in: directory name; it must not
					contain a trailing '\' or '/' */
	ibool		error_is_fatal)	/*!< in: TRUE if we should treat an
					error as a fatal error; if we try to
					open symlinks then we do not wish a
					fatal error if it happens not to be
					a directory at all */
	return(success);
}

#ifndef UNIV_HOTBACKUP
/****************************************************************//**
Returns a pointer to the nth slot in the aio array.
@return	pointer to slot */
static
os_aio_slot_t*
os_aio_array_get_nth_slot(
/*======================*/
	os_aio_array_t*		array,	/*!< in: aio array */
	ulint			index)	/*!< in: index of the slot */
{
	ut_a(index < array->n_slots);

	return((array->slots) + index);
}
#if defined(LINUX_NATIVE_AIO)
/******************************************************************//**
Creates an io_context for native linux AIO.
@return	TRUE on success. */
static
ibool
os_aio_linux_create_io_ctx(
/*=======================*/
	ulint		max_events,	/*!< in: number of events. */
	io_context_t*	io_ctx)		/*!< out: io_ctx to initialize. */
{
	int	ret;
	ulint	retries = 0;

retry:
	memset(io_ctx, 0x0, sizeof(*io_ctx));

	/* Initialize the io_ctx. Tell it how many pending
	IO requests this context will handle. */

	ret = io_setup(max_events, io_ctx);
	if (ret == 0) {
#if defined(UNIV_AIO_DEBUG)
		fprintf(stderr,
			"InnoDB: Linux native AIO:"
			" initialized io_ctx for segment\n");
#endif
		/* Success. Return now. */
		return(TRUE);
	}

	/* If we hit EAGAIN we'll make a few attempts before failing. */

	if (ret == -EAGAIN) {
		if (retries == 0) {
			/* First time around. */
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Warning: io_setup() failed"
				" with EAGAIN. Will make %d attempts"
				" before giving up.\n",
				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
		}

		if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
			++retries;
			fprintf(stderr,
				"InnoDB: Warning: io_setup() attempt"
				" %lu failed.\n",
				(ulong) retries);
			os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
			goto retry;
		}

		/* Have tried enough. Better call it a day. */
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: io_setup() failed"
			" with EAGAIN after %d attempts.\n",
			OS_AIO_IO_SETUP_RETRY_ATTEMPTS);

	} else if (ret == -ENOSYS) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: Linux Native AIO interface"
			" is not supported on this platform. Please"
			" check your OS documentation and install"
			" appropriate binary of InnoDB.\n");

	} else {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: Linux Native AIO setup"
			" returned following error[%d]\n", -ret);
	}

	fprintf(stderr,
		"InnoDB: You can disable Linux Native AIO by"
		" setting innodb_native_aio = off in my.cnf\n");
	return(FALSE);
}
#endif /* LINUX_NATIVE_AIO */
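
/* A common reason for io_setup() returning EAGAIN is that the
kernel-wide limit on in-flight aio events (fs.aio-max-nr, visible in
/proc/sys/fs/aio-max-nr) is exhausted; raising that sysctl usually
resolves the retries above. */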
/******************************************************************//**
Creates an aio wait array. Note that we return NULL in case of failure.
We don't care about freeing memory here because we assume that a
failure will result in the server refusing to start up.
@return	own: aio array, NULL on failure */
static
os_aio_array_t*
os_aio_array_create(
/*================*/
	ulint	n,		/*!< in: maximum number of pending aio
				operations allowed; n must be
				divisible by n_segments */
	ulint	n_segments)	/*!< in: number of segments in the aio array */
{
	os_aio_array_t*	array;
	ulint		i;
	os_aio_slot_t*	slot;
#ifdef WIN_ASYNC_IO
	OVERLAPPED*	over;
#elif defined(LINUX_NATIVE_AIO)
	struct io_event*	aio_event = NULL;
#endif
	ut_a(n > 0);
	ut_a(n_segments > 0);

	array = static_cast<os_aio_array_t *>(ut_malloc(sizeof(os_aio_array_t)));

	array->mutex = os_mutex_create();
	array->not_full = os_event_create(NULL);
	array->is_empty = os_event_create(NULL);

	os_event_set(array->is_empty);

	array->n_slots = n;
	array->n_segments = n_segments;
	array->n_reserved = 0;
	array->slots = static_cast<os_aio_slot_t *>(ut_malloc(n * sizeof(os_aio_slot_t)));
#ifdef __WIN__
	array->handles = ut_malloc(n * sizeof(HANDLE));
#endif /* __WIN__ */

#if defined(LINUX_NATIVE_AIO)
	array->aio_ctx = NULL;
	array->aio_events = NULL;

	/* If we are not using native aio interface then skip this
	part of initialization. */
	if (!srv_use_native_aio) {
		goto skip_native_aio;
	}

	/* Initialize the io_context array. One io_context
	per segment in the array. */

	array->aio_ctx = (io_context**) ut_malloc(n_segments *
						  sizeof(*array->aio_ctx));
	for (i = 0; i < n_segments; ++i) {
		if (!os_aio_linux_create_io_ctx(n/n_segments,
						&array->aio_ctx[i])) {
			/* If something bad happened during aio setup
			we should call it a day and return right away.
			We don't care about any leaks because a failure
			to initialize the io subsystem means that the
			server (or at least the innodb storage engine)
			is not going to start up. */
			return(NULL);
		}
	}

	/* Initialize the event array. One event per slot. */
	aio_event = (io_event*) ut_malloc(n * sizeof(io_event));
	memset(aio_event, 0x0, sizeof(io_event) * n);
	array->aio_events = aio_event;

skip_native_aio:
#endif /* LINUX_NATIVE_AIO */
	for (i = 0; i < n; i++) {
		slot = os_aio_array_get_nth_slot(array, i);

		slot->pos = i;
		slot->reserved = FALSE;
#ifdef WIN_ASYNC_IO
		slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);

		over = &(slot->control);

		over->hEvent = slot->handle;

		*((array->handles) + i) = over->hEvent;
#elif defined(LINUX_NATIVE_AIO)
		memset(&slot->control, 0x0, sizeof(slot->control));
		slot->n_bytes = 0;
		slot->ret = 0;
#endif
	}

	return(array);
}
/************************************************************************//**
Frees an aio wait array. */
static
void
os_aio_array_free(
/*==============*/
	os_aio_array_t*	array)	/*!< in, own: array to free */
{
#ifdef WIN_ASYNC_IO
	ulint	i;

	for (i = 0; i < array->n_slots; i++) {
		os_aio_slot_t*	slot = os_aio_array_get_nth_slot(array, i);
		CloseHandle(slot->handle);
	}
#endif /* WIN_ASYNC_IO */

#ifdef __WIN__
	ut_free(array->handles);
#endif /* __WIN__ */
	os_mutex_free(array->mutex);
	os_event_free(array->not_full);
	os_event_free(array->is_empty);

#if defined(LINUX_NATIVE_AIO)
	if (srv_use_native_aio) {
		ut_free(array->aio_events);
		ut_free(array->aio_ctx);
	}
#endif /* LINUX_NATIVE_AIO */

	ut_free(array->slots);
	ut_free(array);
}
/***********************************************************************
Initializes the asynchronous io system. Creates one array each for ibuf
and log i/o. Also creates one array each for read and write where each
array is divided logically into n_read_segs and n_write_segs
respectively. The caller must create an i/o handler thread for each
segment in these arrays. This function also creates the sync array.
No i/o handler thread needs to be created for that */
UNIV_INTERN
void
os_aio_init(
/*========*/
	ulint	n_per_seg,	/*!< in: maximum number of pending aio
				operations allowed per segment */
	ulint	n_read_segs,	/*!< in: number of reader threads */
	ulint	n_write_segs,	/*!< in: number of writer threads */
	ulint	n_slots_sync)	/*!< in: number of slots in the sync aio
				array */
{
	ulint	i;
	ulint	n_segments = 2 + n_read_segs + n_write_segs;

	ut_ad(n_segments >= 4);

	os_io_init_simple();
	return(segment);
}

/*******************************************************************//**
Requests for a slot in the aio array. If no slot is available, waits until
not_full-event becomes signaled.
@return	pointer to slot */
static
os_aio_slot_t*
os_aio_array_reserve_slot(
/*======================*/
	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
	os_aio_array_t*	array,	/*!< in: aio array */
	fil_node_t*	message1,/*!< in: message to be passed along with
				the aio operation */
	void*		message2,/*!< in: message to be passed along with
				the aio operation */
	os_file_t	file,	/*!< in: file handle */
	const char*	name,	/*!< in: name of the file or path as a
				null-terminated string */
	void*		buf,	/*!< in: buffer where to read or from which
				to write */
	ulint		offset,	/*!< in: least significant 32 bits of file
				offset */
	ulint		offset_high, /*!< in: most significant 32 bits of
				offset */
	ulint		len)	/*!< in: length of the block to read or write */
{
	os_aio_slot_t*	slot = NULL;
#ifdef WIN_ASYNC_IO
	OVERLAPPED*	control;
#elif defined(LINUX_NATIVE_AIO)
	struct iocb*	iocb = NULL;
#endif
	ulint		slots_per_seg;
	ulint		local_seg;

	ut_a((len & 0xFFFFFFFFUL) == len);

	/* No need of a mutex. Only reading constant fields */
	slots_per_seg = array->n_slots / array->n_segments;

	/* We attempt to keep adjacent blocks in the same local
	segment. This can help in merging IO requests when we are
	doing simulated AIO */
	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
		    % array->n_segments;
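
	/* With the default 16KB pages (UNIV_PAGE_SIZE_SHIFT == 14),
	offset >> 20 increments once per 1MB of file, so each aligned
	1MB extent of a file hashes to one local segment; e.g. in a
	2-segment array, offsets 0..0xFFFFF land in segment 0 and
	0x100000..0x1FFFFF in segment 1. (Worked example assuming the
	default page size.) */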
	os_mutex_enter(array->mutex);

	if (array->n_reserved == array->n_slots) {
		os_mutex_exit(array->mutex);

		if (!srv_use_native_aio) {
			/* If the handler threads are suspended, wake them
			so that we get more slots */
		os_event_reset(os_aio_segment_wait_events[g]);
#endif /* __WIN__ */
#if defined(LINUX_NATIVE_AIO)
/*******************************************************************//**
Dispatch an AIO request to the kernel.
@return	TRUE on success. */
static
ibool
os_aio_linux_dispatch(
/*==================*/
	os_aio_array_t*	array,	/*!< in: io request array. */
	os_aio_slot_t*	slot)	/*!< in: an already reserved slot. */
{
	int		ret;
	ulint		io_ctx_index;
	struct iocb*	iocb;

	ut_ad(slot != NULL);
	ut_ad(array);

	ut_a(slot->reserved);

	/* Find out what we are going to work with.
	The iocb struct is directly in the slot.
	The io_context is one per segment. */

	iocb = &slot->control;
	io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;

	ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);

#if defined(UNIV_AIO_DEBUG)
	fprintf(stderr,
		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
		array->aio_ctx[io_ctx_index], (ulong) io_ctx_index);
#endif

	/* io_submit() returns the number of successfully
	queued requests or -errno. */
	if (UNIV_UNLIKELY(ret != 1)) {
		errno = -ret;
		return(FALSE);
	}

	return(TRUE);
}
#endif /* LINUX_NATIVE_AIO */
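
/* A quick sanity sketch of the slot-to-io_ctx mapping used above:
with n_slots = 256 and n_segments = 2 (made-up numbers), slots 0..127
dispatch to ctx 0 and slots 128..255 to ctx 1, matching the
per-segment slot partitioning in os_aio_linux_collect(). Compiled out;
for illustration only: */
#if 0
#include <assert.h>

static void
io_ctx_mapping_sketch(void)
{
	unsigned long	n_slots = 256;
	unsigned long	n_segments = 2;
	unsigned long	pos;

	for (pos = 0; pos < n_slots; pos++) {
		unsigned long	io_ctx_index
			= (pos * n_segments) / n_slots;

		assert(io_ctx_index == pos / (n_slots / n_segments));
	}
}
#endif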
/*******************************************************************//**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.
@return	TRUE if request was queued successfully, FALSE if fail */
UNIV_INTERN
ibool
os_aio_func(
/*========*/
	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
				to OS_AIO_SIMULATED_WAKE_LATER: the
				last flag advises this function not to wake
				i/o-handler threads, but the caller will
#ifdef UNIV_DO_FLUSH
		if (slot->type == OS_FILE_WRITE
		    && !os_do_not_call_flush_at_each_write) {
			if (!os_file_flush(slot->file)) {
				ut_error;
			}
		}
#endif /* UNIV_DO_FLUSH */
	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		retry = TRUE;
	} else {

		ret_val = FALSE;
	}

	os_mutex_exit(array->mutex);

	if (retry) {
		/* retry failed read/write operation synchronously.
		No need to hold array->mutex. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		struct PSI_file_locker* locker = NULL;
		register_pfs_file_io_begin(locker, slot->file, slot->len,
					   (slot->type == OS_FILE_WRITE)
						? PSI_FILE_WRITE
						: PSI_FILE_READ,
					   __FILE__, __LINE__);
#endif

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		switch (slot->type) {
		case OS_FILE_WRITE:
			ret = WriteFile(slot->file, slot->buf,
					(DWORD) slot->len, &len,
					&(slot->control));
			break;
		case OS_FILE_READ:
			ret = ReadFile(slot->file, slot->buf,
				       (DWORD) slot->len, &len,
				       &(slot->control));
			break;
		default:
			ut_error;
		}

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, len);
#endif

		if (!ret && GetLastError() == ERROR_IO_PENDING) {
			/* aio was queued successfully!
			We want a synchronous i/o operation on a
			file where we also use async i/o: in Windows
			we must use the same wait mechanism as for
			async i/o */

			ret = GetOverlappedResult(slot->file,
						  &(slot->control),
						  &len, TRUE);
		}

		ret_val = ret && len == slot->len;
	}

	os_aio_array_free_slot(array, slot);

	return(ret_val);
}
#if defined(LINUX_NATIVE_AIO)
4343
/******************************************************************//**
4344
This function is only used in Linux native asynchronous i/o. This is
4345
called from within the io-thread. If there are no completed IO requests
4346
in the slot array, the thread calls this function to collect more
4347
requests from the kernel.
4348
The io-thread waits on io_getevents(), which is a blocking call, with
4349
a timeout value. Unless the system is very heavy loaded, keeping the
4350
io-thread very busy, the io-thread will spend most of its time waiting
4352
The io-thread also exits in this function. It checks server status at
4353
each wakeup and that is why we use timed wait in io_getevents(). */
4356
os_aio_linux_collect(
4357
/*=================*/
4358
os_aio_array_t* array, /*!< in/out: slot array. */
4359
ulint segment, /*!< in: local segment no. */
4360
ulint seg_size) /*!< in: segment size. */
4366
struct timespec timeout;
4367
struct io_event* events;
4368
struct io_context* io_ctx;
	/* sanity checks. */
	ut_ad(array != NULL);
	ut_ad(seg_size > 0);
	ut_ad(segment < array->n_segments);

	/* Which part of event array we are going to work on. */
	events = &array->aio_events[segment * seg_size];

	/* Which io_context we are going to use. */
	io_ctx = array->aio_ctx[segment];

	/* Starting point of the segment we will be working on. */
	start_pos = segment * seg_size;

	/* End point. */
	end_pos = start_pos + seg_size;

retry:

	/* Go down if we are in shutdown mode.
	In case of srv_fast_shutdown == 2, there may be pending
	IO requests but that should be OK as we essentially treat
	that as a crash of InnoDB. */
	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
		os_thread_exit(NULL);
	}

	/* Initialize the events. The timeout value is arbitrary.
	We probably need to experiment with it a little. */
	memset(events, 0, sizeof(*events) * seg_size);
	timeout.tv_sec = 0;
	timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;

	ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);

	/* This error handling is for any error in collecting the
	IO requests. The errors, if any, for any particular IO
	request are simply passed on to the calling routine. */

	/* Not enough resources! Try again. */
	if (ret == -EAGAIN) {
		goto retry;
	}

	/* Interrupted! I have tested the behaviour in case of an
	interrupt. If we have some completed IOs available then
	the return code will be the number of IOs. We get EINTR only
	if there are no completed IOs and we have been interrupted. */
	if (ret == -EINTR) {
		goto retry;
	}

	/* No pending request! Go back and check again. */
	if (ret == 0) {
		goto retry;
	}

	/* All other errors should cause a trap for now. */
	if (UNIV_UNLIKELY(ret < 0)) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: unexpected ret_code[%d] from"
			" io_getevents()!\n", ret);
		ut_error;
	}

	for (i = 0; i < ret; i++) {
		os_aio_slot_t*	slot;
		struct iocb*	control;

		control = (struct iocb *) events[i].obj;
		ut_a(control != NULL);

		slot = (os_aio_slot_t *) control->data;

		/* Some sanity checks. */
		ut_a(slot != NULL);
		ut_a(slot->reserved);

#if defined(UNIV_AIO_DEBUG)
		fprintf(stderr,
			"io_getevents[%c]: slot[%p] ctx[%p]"
			" seg[%lu]\n",
			(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
			slot, io_ctx, segment);
#endif

		/* We are not scribbling previous segment. */
		ut_a(slot->pos >= start_pos);

		/* We have not overstepped to next segment. */
		ut_a(slot->pos < end_pos);

		/* Mark this request as completed. The error handling
		will be done in the calling function. */
		os_mutex_enter(array->mutex);
		slot->n_bytes = events[i].res;
		slot->ret = events[i].res2;
		slot->io_already_done = TRUE;
		os_mutex_exit(array->mutex);
	}

	return;
}
/**********************************************************************//**
This function is only used in Linux native asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@return	TRUE if the IO was successful */
UNIV_INTERN
ibool
os_aio_linux_handle(
/*================*/
	ulint	global_seg,	/*!< in: segment number in the aio array
				to wait for; segment 0 is the ibuf
				i/o thread, segment 1 is log i/o thread,
				then follow the non-ibuf read threads,
				and the last are the non-ibuf write
				threads. */
	fil_node_t**message1,	/*!< out: the messages passed with the */
	void**	message2,	/*!< aio request; note that in case the
				aio operation failed, these output
				parameters are valid and can be used to
				restart the operation. */
	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
{
	ulint		segment;
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
	ulint		n;
	ulint		i;
	ibool		ret = FALSE;

	/* Should never be doing Sync IO here. */
	ut_a(global_seg != ULINT_UNDEFINED);

	/* Find the array and the local segment. */
	segment = os_aio_get_array_and_local_segment(&array, global_seg);
	n = array->n_slots / array->n_segments;
	/* Loop until we have found a completed request. */
	for (;;) {
		os_mutex_enter(array->mutex);
		for (i = 0; i < n; ++i) {
			slot = os_aio_array_get_nth_slot(
				array, i + segment * n);
			if (slot->reserved && slot->io_already_done) {
				/* Something for us to work on. */
				goto found;
			}
		}

		os_mutex_exit(array->mutex);

		/* We don't have any completed request.
		Wait for some request. Note that we return
		from wait iff we have found a request. */

		srv_set_io_thread_op_info(global_seg,
			"waiting for completed aio requests");
		os_aio_linux_collect(array, segment, n);
	}

found:
	/* Note that there may be more than one completed IO request.
	We process them one at a time. There may be a case for
	improving the performance slightly by dealing with all
	requests in one sweep. */
	srv_set_io_thread_op_info(global_seg,
				"processing completed aio requests");

	/* Ensure that we are scribbling only our segment. */
	ut_a(i < n);

	ut_ad(slot != NULL);
	ut_ad(slot->reserved);
	ut_ad(slot->io_already_done);

	*message1 = slot->message1;
	*message2 = slot->message2;

	*type = slot->type;

	if ((slot->ret == 0) && (slot->n_bytes == (long) slot->len)) {
		ret = TRUE;
#ifdef UNIV_DO_FLUSH
		if (slot->type == OS_FILE_WRITE
		    && !os_do_not_call_flush_at_each_write) {
			if (!os_file_flush(slot->file)) {
				ut_error;
			}
		}
#endif /* UNIV_DO_FLUSH */
	} else {
		errno = -slot->ret;

		/* os_file_handle_error does tell us if we should retry
		this IO. As it stands now, we don't do this retry when
		reaping requests from a different context than
		the dispatcher. This non-retry logic is the same for
		windows and linux native AIO.
		We should probably look into this to transparently
		re-submit the IO. */
		os_file_handle_error(slot->name, "Linux aio");

		ret = FALSE;
	}

	os_mutex_exit(array->mutex);

	os_aio_array_free_slot(array, slot);

	return(ret);
}
#endif /* LINUX_NATIVE_AIO */
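
/* The intended call pattern for os_aio_linux_handle() is one
io-handler thread per segment looping forever, roughly as sketched
below (the function name here is illustrative; the real loop lives in
the server's io_handler_thread). Compiled out; for illustration
only: */
#if 0
static void
io_handler_loop_sketch(ulint global_seg)
{
	fil_node_t*	node;
	void*		message;
	ulint		type;

	for (;;) {
		/* Blocks in os_aio_linux_collect() until at least one
		request belonging to this segment completes. */
		if (os_aio_linux_handle(global_seg, &node,
					&message, &type)) {
			/* hand the completed request back to the
			caller's completion routine, as fil_aio_wait()
			style handling does */
		}
	}
}
#endif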
/**********************************************************************//**
Does simulated aio. This function should be called by an i/o-handler
thread.
@return	TRUE if the aio operation succeeded */
UNIV_INTERN
ibool
os_aio_simulated_handle(
/*====================*/
	ulint	global_segment,	/*!< in: the number of the segment in the aio
				arrays to wait for; segment 0 is the ibuf
				i/o thread, segment 1 the log i/o thread,
				then follow the non-ibuf read threads, and as
				the last are the non-ibuf write threads */
	fil_node_t**message1,	/*!< out: the messages passed with the aio
				request; note that also in the case where
				the aio operation failed, these output
				parameters are valid and can be used to
				restart the operation, for example */
	void**	message2,
	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
{
	os_aio_array_t*	array;