/*****************************************************************************

Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Percona Inc.

Portions of this file contain modifications contributed and copyrighted
by Percona Inc. Those modifications are
gratefully acknowledged and are described briefly in the InnoDB
documentation. The contributions by Percona Inc. are incorporated with
their permission, and subject to the conditions contained in the file
COPYING.Percona.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
St, Fifth Floor, Boston, MA 02110-1301 USA

*****************************************************************************/

/**************************************************//**
@file os/os0file.c
The interface to the operating system file i/o primitives

Created 10/21/1995 Heikki Tuuri
*******************************************************/

#include "os0file.h"
#include "ut0mem.h"
#include "srv0srv.h"
#include "srv0start.h"
#include "fil0fil.h"
#include "buf0buf.h"

#ifndef UNIV_HOTBACKUP
# include "os0thread.h"
#else /* !UNIV_HOTBACKUP */
/* Add includes for the _stat() call to compile on Windows */
# include <sys/types.h>
# include <sys/stat.h>
#endif /* !UNIV_HOTBACKUP */

#if defined(LINUX_NATIVE_AIO)
# include <libaio.h>
#endif /* LINUX_NATIVE_AIO */

/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask */

#ifndef __WIN__
/** Umask for creating files */
UNIV_INTERN ulint	os_innodb_umask
			= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
#else
/** Umask for creating files */
UNIV_INTERN ulint	os_innodb_umask	= 0;
#endif /* __WIN__ */

/* In simulated aio, merge at most this many consecutive i/os */
#define OS_AIO_MERGE_N_CONSECUTIVE	64

/**********************************************************************

InnoDB AIO Implementation:
=========================

We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO by special io-threads servicing the IO-requests.

Simulated AIO:
==============

On platforms where we 'simulate' AIO, the following is a rough explanation
of the high level design.
There are four io-threads (for ibuf, log, read, write).
All synchronous IO requests are serviced by the calling thread using
os_file_write/os_file_read. The asynchronous requests are queued up
in an array (there are four such arrays) by the calling thread.
Later these requests are picked up by the io-thread and are serviced
asynchronously.

Windows native AIO:
==================

If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one limitation is that if a file is opened
for AIO no synchronous IO can be done on it. Therefore we have an
extra fifth array to queue up synchronous IO requests.
There are innodb_file_io_threads helper threads. These threads work
on the four arrays mentioned above in Simulated AIO. No thread is
required for the sync array.
If a synchronous IO request is made, it is first queued in the sync
array. Then the calling thread itself waits on the request, thus
making the call synchronous.
If an AIO request is made the calling thread not only queues it in the
array but also submits the request. The helper thread then collects
the completed IO request and calls the completion routine on it.

Linux native AIO:
=================

If we have libaio installed on the system and innodb_use_native_aio
is set to TRUE we follow the code path of native AIO, otherwise we
do simulated AIO.
There are innodb_file_io_threads helper threads. These threads work
on the four arrays mentioned above in Simulated AIO.
If a synchronous IO request is made, it is handled by calling
os_file_write/os_file_read.
If an AIO request is made the calling thread not only queues it in the
array but also submits the request. The helper thread then collects
the completed IO request and calls the completion routine on it.

**********************************************************************/
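
/* Illustrative sketch (hypothetical, not part of this file's API docs):
a caller posts an asynchronous read with os_aio() and one of the helper
threads later reaps it and calls the completion routine. The names node,
buf, offset and offset_high below are placeholders.

	ibool	posted;

	posted = os_aio(OS_FILE_READ, OS_AIO_NORMAL, node->name,
			node->handle, buf, offset, offset_high,
			UNIV_PAGE_SIZE, node, NULL);
	ut_a(posted);
*/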

/** Flag: enable debug printout for asynchronous i/o */
UNIV_INTERN ibool	os_aio_print_debug	= FALSE;

#ifdef UNIV_PFS_IO
/* Keys to register InnoDB I/O with performance schema */
UNIV_INTERN mysql_pfs_key_t  innodb_file_data_key;
UNIV_INTERN mysql_pfs_key_t  innodb_file_log_key;
UNIV_INTERN mysql_pfs_key_t  innodb_file_temp_key;
#endif /* UNIV_PFS_IO */

/** The asynchronous i/o array slot structure */
typedef struct os_aio_slot_struct	os_aio_slot_t;

/** The asynchronous i/o array slot structure */
struct os_aio_slot_struct{
	ibool		is_read;	/*!< TRUE if a read operation */
	ulint		pos;		/*!< index of the slot in the aio
					array */
	ibool		reserved;	/*!< TRUE if this slot is reserved */
	time_t		reservation_time;/*!< time when reserved */
	ulint		len;		/*!< length of the block to read or
					write */
	byte*		buf;		/*!< buffer used in i/o */
	ulint		type;		/*!< OS_FILE_READ or OS_FILE_WRITE */
	ulint		offset;		/*!< 32 low bits of file offset in
					bytes */
	ulint		offset_high;	/*!< 32 high bits of file offset */
	os_file_t	file;		/*!< file where to read or write */
	const char*	name;		/*!< file name or path */
	ibool		io_already_done;/*!< used only in simulated aio:
					TRUE if the physical i/o already
					made and only the slot message
					needs to be passed to the caller
					of os_aio_simulated_handle */
	fil_node_t*	message1;	/*!< message which is given by the */
	void*		message2;	/*!< the requester of an aio operation
					and which can be used to identify
					which pending aio operation was
					completed */
#ifdef WIN_ASYNC_IO
	HANDLE		handle;		/*!< handle object we need in the
					OVERLAPPED struct */
	OVERLAPPED	control;	/*!< Windows control block for the
					aio request */
#elif defined(LINUX_NATIVE_AIO)
	struct iocb	control;	/* Linux control block for aio */
	int		n_bytes;	/* bytes written/read. */
	int		ret;		/* AIO return code */
#endif /* WIN_ASYNC_IO */
};

/** The asynchronous i/o array structure */
typedef struct os_aio_array_struct	os_aio_array_t;

/** The asynchronous i/o array structure */
struct os_aio_array_struct{
	os_mutex_t	mutex;	/*!< the mutex protecting the aio array */
	os_event_t	not_full;
				/*!< The event which is set to the
				signaled state when there is space in
				the aio outside the ibuf segment */
	os_event_t	is_empty;
				/*!< The event which is set to the
				signaled state when there are no
				pending i/os in this array */
	ulint		n_slots;/*!< Total number of slots in the aio
				array. This must be divisible by
				n_threads. */
	ulint		n_segments;
				/*!< Number of segments in the aio
				array of pending aio requests. A
				thread can wait separately for any one
				of the segments. */
	ulint		cur_seg;/*!< We reserve IO requests in round
				robin fashion to different segments.
				This points to the segment that is to
				be used to service next IO request. */
	ulint		n_reserved;
				/*!< Number of reserved slots in the
				aio array outside the ibuf segment */
	os_aio_slot_t*	slots;	/*!< Pointer to the slots in the array */
#ifdef __WIN__
	HANDLE*		handles;
				/*!< Pointer to an array of OS native
				event handles where we copied the
				handles from slots, in the same
				order. This can be used in
				WaitForMultipleObjects; used only in
				Windows */
#endif /* __WIN__ */

#if defined(LINUX_NATIVE_AIO)
	io_context_t*		aio_ctx;
				/* completion queue for IO. There is
				one such queue per segment. Each thread
				will work on one ctx exclusively. */
	struct io_event*	aio_events;
				/* The array to collect completed IOs.
				There is one such event for each
				possible pending IO. The size of the
				array is equal to n_slots. */
#endif /* LINUX_NATIVE_AIO */
};
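
/* Worked example (illustrative, not from the original comments): with
n_slots = 256 and n_segments = 4, each segment owns a contiguous range of
256 / 4 = 64 slots; slot->pos = 200 then belongs to segment 200 / 64 = 3,
and under Linux native AIO its completion is reaped from aio_ctx[3]. */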

#if defined(LINUX_NATIVE_AIO)
/** timeout for each io_getevents() call = 500ms. */
#define OS_AIO_REAP_TIMEOUT	(500000000UL)
/** time to sleep, in microseconds, if io_setup() returns EAGAIN. */
#define OS_AIO_IO_SETUP_RETRY_SLEEP	(500000UL)
/** number of attempts before giving up on io_setup(). */
#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS	5
#endif /* LINUX_NATIVE_AIO */
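
/* With these defaults, a failing io_setup() is retried every 0.5 s up to
5 times, i.e. InnoDB waits at most about 2.5 s before giving up. */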

/** Array of events used in simulated aio */
static os_event_t*	os_aio_segment_wait_events = NULL;

/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
are NULL when the module has not yet been initialized. @{ */
static os_aio_array_t*	os_aio_read_array	= NULL;	/*!< Reads */
static os_aio_array_t*	os_aio_write_array	= NULL;	/*!< Writes */
static os_aio_array_t*	os_aio_ibuf_array	= NULL;	/*!< Insert buffer */
static os_aio_array_t*	os_aio_log_array	= NULL;	/*!< Redo log */
static os_aio_array_t*	os_aio_sync_array	= NULL;	/*!< Synchronous I/O */
/* @} */

/** Number of asynchronous I/O segments. Set by os_aio_init(). */
static ulint	os_aio_n_segments	= ULINT_UNDEFINED;

/** If the following is TRUE, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
#endif /* !UNIV_HOTBACKUP */

UNIV_INTERN ulint	os_n_file_reads		= 0;
UNIV_INTERN ulint	os_bytes_read_since_printout = 0;

	os_file_count_mutex = os_mutex_create();

	for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
		os_file_seek_mutexes[i] = os_mutex_create();
	}

/***********************************************************************//**
Creates a temporary file. This function is like tmpfile(3), but
the temporary file is created in the MySQL temporary directory.
On Netware, this function is like tmpfile(3), because the C run-time
library of Netware does not expose the delete-on-close flag.
@return temporary file handle, or NULL on error */
UNIV_INTERN
FILE*
os_file_create_tmpfile(void)
/*========================*/
{
#ifdef UNIV_HOTBACKUP
	ut_error;

	return(NULL);
#else
# ifdef __NETWARE__
	FILE*	file	= tmpfile();
# else /* __NETWARE__ */
	FILE*	file	= NULL;
	int	fd	= innobase_mysql_tmpfile();

	if (fd >= 0) {
		file = fdopen(fd, "w+b");
	}
# endif /* __NETWARE__ */

	if (!file) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: unable to create temporary file;"
			" errno: %d\n", errno);
# ifndef __NETWARE__
		if (fd >= 0) {
			close(fd);
		}
# endif /* !__NETWARE__ */
	}

	return(file);
#endif /* UNIV_HOTBACKUP */
}
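
/* Hypothetical usage sketch (illustrative only): the returned stream is
deleted automatically when closed.

	FILE*	file = os_file_create_tmpfile();

	if (file != NULL) {
		... use the stream ...
		fclose(file);
	}
*/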

/***********************************************************************//**
The os_file_opendir() function opens a directory stream corresponding to the
directory named by the dirname argument. The directory stream is positioned
at the first entry. In both Unix and Windows we automatically skip the '.'
and '..' items at the start of the directory listing.
@return directory stream, NULL if error */
UNIV_INTERN
os_file_dir_t
os_file_opendir(
/*============*/
	const char*	dirname,	/*!< in: directory name; it must not
					contain a trailing '\' or '/' */
	ibool		error_is_fatal)	/*!< in: TRUE if we should treat an
					error as a fatal error; if we try to
					open symlinks then we do not wish a
					fatal error if it happens not to be
					a directory */
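
/* Hypothetical usage sketch (illustrative only; srv_data_home stands in
for any directory name):

	os_file_dir_t	dir = os_file_opendir(srv_data_home, FALSE);

	if (dir != NULL) {
		... iterate with os_file_readdir_next_file() ...
		os_file_closedir(dir);
	}
*/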

	return(success);
}

#ifndef UNIV_HOTBACKUP
/****************************************************************//**
Returns a pointer to the nth slot in the aio array.
@return pointer to slot */
static
os_aio_slot_t*
os_aio_array_get_nth_slot(
/*======================*/
	os_aio_array_t*	array,	/*!< in: aio array */
	ulint		index)	/*!< in: index of the slot */
{
	ut_a(index < array->n_slots);

	return((array->slots) + index);
}

#if defined(LINUX_NATIVE_AIO)
/******************************************************************//**
Creates an io_context for native linux AIO.
@return TRUE on success. */
static
ibool
os_aio_linux_create_io_ctx(
/*=======================*/
	ulint		max_events,	/*!< in: number of events. */
	io_context_t*	io_ctx)		/*!< out: io_ctx to initialize. */
{
	int	ret;
	ulint	retries = 0;

retry:
	memset(io_ctx, 0x0, sizeof(*io_ctx));

	/* Initialize the io_ctx. Tell it how many pending
	IO requests this context will handle. */

	ret = io_setup(max_events, io_ctx);
	if (ret == 0) {
#if defined(UNIV_AIO_DEBUG)
		fprintf(stderr,
			"InnoDB: Linux native AIO:"
			" initialized io_ctx for segment\n");
#endif
		/* Success. Return now. */
		return(TRUE);
	}

	/* If we hit EAGAIN we'll make a few attempts before failing. */

	if (ret == -EAGAIN) {

		if (retries == 0) {
			/* First time around. */
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Warning: io_setup() failed"
				" with EAGAIN. Will make %d attempts"
				" before giving up.\n",
				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
		}

		if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
			++retries;
			fprintf(stderr,
				"InnoDB: Warning: io_setup() attempt"
				" %lu failed.\n",
				retries);
			os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
			goto retry;
		}

		/* Have tried enough. Better call it a day. */
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: io_setup() failed"
			" with EAGAIN after %d attempts.\n",
			OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
	}

	if (ret == -ENOSYS) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: Linux Native AIO interface"
			" is not supported on this platform. Please"
			" check your OS documentation and install"
			" appropriate binary of InnoDB.\n");

		return(FALSE);
	}

	ut_print_timestamp(stderr);
	fprintf(stderr,
		"  InnoDB: Error: Linux Native AIO setup"
		" returned following error[%d]\n", -ret);

	fprintf(stderr,
		"InnoDB: You can disable Linux Native AIO by"
		" setting innodb_native_aio = off in my.cnf\n");

	return(FALSE);
}
#endif /* LINUX_NATIVE_AIO */
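
/* Background sketch (illustrative only, not InnoDB code): the libaio
lifecycle used by this module is, in miniature,

	io_context_t	ctx;
	struct iocb	cb;
	struct iocb*	cbs[1] = { &cb };
	struct io_event	ev;

	memset(&ctx, 0, sizeof(ctx));
	io_setup(8, &ctx);			-- create a completion queue
	io_prep_pread(&cb, fd, buf, len, off);	-- describe one read request
	io_submit(ctx, 1, cbs);			-- dispatch it to the kernel
	io_getevents(ctx, 1, 1, &ev, NULL);	-- reap the completion
	io_destroy(ctx);

where fd, buf, len and off are placeholders. os_aio_linux_create_io_ctx()
above performs only the io_setup() step, one context per segment. */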

/******************************************************************//**
Creates an aio wait array. Note that we return NULL in case of failure.
We don't care about freeing memory here because we assume that a
failure will result in the server refusing to start up.
@return own: aio array, NULL on failure */
static
os_aio_array_t*
os_aio_array_create(
/*================*/
	ulint	n,		/*!< in: maximum number of pending aio
				operations allowed; n must be
				divisible by n_segments */
	ulint	n_segments)	/*!< in: number of segments in the aio array */
{
	os_aio_array_t*	array;
	ulint		i;
	os_aio_slot_t*	slot;
#ifdef WIN_ASYNC_IO
	OVERLAPPED*	over;
#elif defined(LINUX_NATIVE_AIO)
	struct io_event*	io_event = NULL;
#endif
	ut_a(n > 0);
	ut_a(n_segments > 0);

	array = ut_malloc(sizeof(os_aio_array_t));

	array->mutex = os_mutex_create();
	array->not_full = os_event_create(NULL);
	array->is_empty = os_event_create(NULL);

	os_event_set(array->is_empty);

	array->n_slots = n;
	array->n_segments = n_segments;
	array->n_reserved = 0;
	array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
#ifdef __WIN__
	array->handles = ut_malloc(n * sizeof(HANDLE));
#endif

#if defined(LINUX_NATIVE_AIO)
	array->aio_ctx = NULL;
	array->aio_events = NULL;

	/* If we are not using native aio interface then skip this
	part of initialization. */
	if (!srv_use_native_aio) {
		goto skip_native_aio;
	}

	/* Initialize the io_context array. One io_context
	per segment in the array. */

	array->aio_ctx = ut_malloc(n_segments *
				   sizeof(*array->aio_ctx));
	for (i = 0; i < n_segments; ++i) {
		if (!os_aio_linux_create_io_ctx(n/n_segments,
						&array->aio_ctx[i])) {
			/* If something bad happened during aio setup
			we should call it a day and return right away.
			We don't care about any leaks because a failure
			to initialize the io subsystem means that the
			server (or at least the innodb storage engine)
			is not going to start up. */
			return(NULL);
		}
	}

	/* Initialize the event array. One event per slot. */
	io_event = ut_malloc(n * sizeof(*io_event));
	memset(io_event, 0x0, sizeof(*io_event) * n);
	array->aio_events = io_event;

skip_native_aio:
#endif /* LINUX_NATIVE_AIO */

	for (i = 0; i < n; i++) {
		slot = os_aio_array_get_nth_slot(array, i);

		slot->pos = i;
		slot->reserved = FALSE;
#ifdef WIN_ASYNC_IO
		slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);

		over = &(slot->control);

		over->hEvent = slot->handle;

		*((array->handles) + i) = over->hEvent;
#elif defined(LINUX_NATIVE_AIO)

		memset(&slot->control, 0x0, sizeof(slot->control));
		slot->n_bytes = 0;
		slot->ret = 0;
#endif
	}

	return(array);
}
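
/* Sizing note (illustrative): os_aio_init() below creates, for example,
the read array as os_aio_array_create(n_read_segs * n_per_seg,
n_read_segs), i.e. one segment per reader thread with n_per_seg slots
each. */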

/************************************************************************//**
Frees an aio wait array. */
static
void
os_aio_array_free(
/*==============*/
	os_aio_array_t*	array)	/*!< in, own: array to free */
{
#ifdef WIN_ASYNC_IO
	ulint	i;

	for (i = 0; i < array->n_slots; i++) {
		os_aio_slot_t*	slot = os_aio_array_get_nth_slot(array, i);
		CloseHandle(slot->handle);
	}
#endif /* WIN_ASYNC_IO */

#ifdef __WIN__
	ut_free(array->handles);
#endif /* __WIN__ */
	os_mutex_free(array->mutex);
	os_event_free(array->not_full);
	os_event_free(array->is_empty);

#if defined(LINUX_NATIVE_AIO)
	if (srv_use_native_aio) {
		ut_free(array->aio_events);
		ut_free(array->aio_ctx);
	}
#endif /* LINUX_NATIVE_AIO */

	ut_free(array->slots);
	ut_free(array);
}

/***********************************************************************
Initializes the asynchronous io system. Creates one array each for ibuf
and log i/o. Also creates one array each for read and write where each
array is divided logically into n_read_segs and n_write_segs
respectively. The caller must create an i/o handler thread for each
segment in these arrays. This function also creates the sync array.
No i/o handler thread needs to be created for that. */
UNIV_INTERN
void
os_aio_init(
/*========*/
	ulint	n_per_seg,	/*<! in: maximum number of pending aio
				operations allowed per segment */
	ulint	n_read_segs,	/*<! in: number of reader threads */
	ulint	n_write_segs,	/*<! in: number of writer threads */
	ulint	n_slots_sync)	/*<! in: number of slots in the sync aio
				array */
{
	ulint	i;
	ulint	n_segments = 2 + n_read_segs + n_write_segs;

	ut_ad(n_segments >= 4);

	os_io_init_simple();
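
	/* Worked example (illustrative): with n_read_segs = 4 and
	n_write_segs = 4, n_segments = 10; segment 0 serves the ibuf
	array, segment 1 the log array, segments 2-5 the read array and
	segments 6-9 the write array. The sync array gets no segment. */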

	return(segment);
}

/*******************************************************************//**
Requests for a slot in the aio array. If no slot is available, waits until
not_full-event becomes signaled.
@return pointer to slot */
static
os_aio_slot_t*
os_aio_array_reserve_slot(
/*======================*/
	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
	os_aio_array_t*	array,	/*!< in: aio array */
	fil_node_t*	message1,/*!< in: message to be passed along with
				the aio operation */
	void*		message2,/*!< in: message to be passed along with
				the aio operation */
	os_file_t	file,	/*!< in: file handle */
	const char*	name,	/*!< in: name of the file or path as a
				null-terminated string */
	void*		buf,	/*!< in: buffer where to read or from which
				to write */
	ulint		offset,	/*!< in: least significant 32 bits of file
				offset */
	ulint		offset_high, /*!< in: most significant 32 bits of
				the file offset */
	ulint		len)	/*!< in: length of the block to read or write */
{
	os_aio_slot_t*	slot = NULL;
#ifdef WIN_ASYNC_IO
	OVERLAPPED*	control;

#elif defined(LINUX_NATIVE_AIO)

	struct iocb*	iocb = NULL;
#endif
	ulint		i;
	ulint		counter;
	ulint		slots_per_seg;
	ulint		local_seg;

	/* No need of a mutex. Only reading constant fields */
	slots_per_seg = array->n_slots / array->n_segments;

	/* We attempt to keep adjacent blocks in the same local
	segment. This can help in merging IO requests when we are
	doing simulated AIO */
	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
		    % array->n_segments;
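
	/* Worked example (illustrative): with 16 KB pages
	(UNIV_PAGE_SIZE_SHIFT = 14), offset >> 20 changes only once per
	1 MB of file, so each run of 64 consecutive pages hashes to the
	same local segment and their requests can be merged. */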

loop:
	os_mutex_enter(array->mutex);

	if (array->n_reserved == array->n_slots) {
		os_mutex_exit(array->mutex);

		if (!srv_use_native_aio) {
			/* If the handler threads are suspended, wake them
			so that we get more slots */

	os_event_reset(os_aio_segment_wait_events[g]);
#endif /* __WIN__ */

#if defined(LINUX_NATIVE_AIO)
/*******************************************************************//**
Dispatch an AIO request to the kernel.
@return TRUE on success. */
static
ibool
os_aio_linux_dispatch(
/*==================*/
	os_aio_array_t*	array,	/*!< in: io request array. */
	os_aio_slot_t*	slot)	/*!< in: an already reserved slot. */
{
	int		ret;
	ulint		io_ctx_index;
	struct iocb*	iocb;

	ut_ad(slot != NULL);
	ut_ad(array != NULL);

	ut_a(slot->reserved);

	/* Find out what we are going to work with.
	The iocb struct is directly in the slot.
	The io_context is one per segment. */

	iocb = &slot->control;
	io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
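
	/* E.g. (illustrative): with n_slots = 256 and n_segments = 2,
	slot->pos = 200 gives io_ctx_index = 200 * 2 / 256 = 1. */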

	ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);

#if defined(UNIV_AIO_DEBUG)
	fprintf(stderr,
		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
		array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
#endif

	/* io_submit returns number of successfully
	queued requests or -errno. */
	if (UNIV_UNLIKELY(ret != 1)) {
		errno = -ret;
		return(FALSE);
	}

	return(TRUE);
}
#endif /* LINUX_NATIVE_AIO */

/*******************************************************************//**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.
@return TRUE if request was queued successfully, FALSE if fail */
UNIV_INTERN
ibool
os_aio_func(
/*========*/
	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
				to OS_AIO_SIMULATED_WAKE_LATER: the
				last flag advises this function not to wake
				i/o-handler threads, but the caller will
				do the waking explicitly later */

#ifdef UNIV_DO_FLUSH
		if (slot->type == OS_FILE_WRITE
		    && !os_do_not_call_flush_at_each_write) {
			if (!os_file_flush(slot->file)) {
				ut_error;
			}
		}
#endif /* UNIV_DO_FLUSH */
	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		retry = TRUE;
	} else {

		ret_val = FALSE;
	}

	os_mutex_exit(array->mutex);
4267
/* retry failed read/write operation synchronously.
4268
No need to hold array->mutex. */
4271
/* This read/write does not go through os_file_read
4272
and os_file_write APIs, need to register with
4273
performance schema explicitly here. */
4274
struct PSI_file_locker* locker = NULL;
4275
register_pfs_file_io_begin(locker, slot->file, slot->len,
4276
(slot->type == OS_FILE_WRITE)
4279
__FILE__, __LINE__);

		switch (slot->type) {
		case OS_FILE_WRITE:
			ret = WriteFile(slot->file, slot->buf,
					(DWORD) slot->len, &len,
					&(slot->control));

			break;
		case OS_FILE_READ:
			ret = ReadFile(slot->file, slot->buf,
				       (DWORD) slot->len, &len,
				       &(slot->control));

			break;
		default:
			ut_error;
		}

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, len);
#endif

		if (!ret && GetLastError() == ERROR_IO_PENDING) {
			/* aio was queued successfully!
			We want a synchronous i/o operation on a
			file where we also use async i/o: in Windows
			we must use the same wait mechanism as for
			async i/o */

			ret = GetOverlappedResult(slot->file,
						  &(slot->control),
						  &len, TRUE);
		}

		ret_val = ret && len == slot->len;
	}

	os_aio_array_free_slot(array, slot);

	return(ret_val);
}

#if defined(LINUX_NATIVE_AIO)
/******************************************************************//**
This function is only used in Linux native asynchronous i/o. This is
called from within the io-thread. If there are no completed IO requests
in the slot array, the thread calls this function to collect more
requests from the kernel.
The io-thread waits on io_getevents(), which is a blocking call, with
a timeout value. Unless the system is very heavily loaded, keeping the
io-thread very busy, the io-thread will spend most of its time waiting
in this function.
The io-thread also exits in this function. It checks server status at
each wakeup and that is why we use timed wait in io_getevents(). */
static
void
os_aio_linux_collect(
/*=================*/
	os_aio_array_t*	array,		/*!< in/out: slot array. */
	ulint		segment,	/*!< in: local segment no. */
	ulint		seg_size)	/*!< in: segment size. */
{
	int			i;
	int			ret;
	ulint			start_pos;
	ulint			end_pos;
	struct timespec		timeout;
	struct io_event*	events;
	struct io_context*	io_ctx;

	/* sanity checks. */
	ut_ad(array != NULL);
	ut_ad(seg_size > 0);
	ut_ad(segment < array->n_segments);

	/* Which part of event array we are going to work on. */
	events = &array->aio_events[segment * seg_size];

	/* Which io_context we are going to use. */
	io_ctx = array->aio_ctx[segment];

	/* Starting point of the segment we will be working on. */
	start_pos = segment * seg_size;

	/* End point. */
	end_pos = start_pos + seg_size;

retry:

	/* Go down if we are in shutdown mode.
	In case of srv_fast_shutdown == 2, there may be pending
	IO requests but that should be OK as we essentially treat
	that as a crash of InnoDB. */
	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
		os_thread_exit(NULL);
	}

	/* Initialize the events. The timeout value is arbitrary.
	We probably need to experiment with it a little. */
	memset(events, 0, sizeof(*events) * seg_size);
	timeout.tv_sec = 0;
	timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;

	ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);

	/* This error handling is for any error in collecting the
	IO requests. The errors, if any, for any particular IO
	request are simply passed on to the calling routine. */

	/* Not enough resources! Try again. */
	if (ret == -EAGAIN) {
		goto retry;
	}

	/* Interrupted! I have tested the behaviour in case of an
	interrupt. If we have some completed IOs available then
	the return code will be the number of IOs. We get EINTR only
	if there are no completed IOs and we have been interrupted. */
	if (ret == -EINTR) {
		goto retry;
	}

	/* No pending request! Go back and check again. */
	if (ret == 0) {
		goto retry;
	}

	/* All other errors! should cause a trap for now. */
	if (UNIV_UNLIKELY(ret < 0)) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: unexpected ret_code[%d] from"
			" io_getevents()!\n", ret);
		ut_error;
	}

	for (i = 0; i < ret; i++) {
		os_aio_slot_t*	slot;
		struct iocb*	control;

		control = (struct iocb *)events[i].obj;
		ut_a(control != NULL);

		slot = (os_aio_slot_t *) control->data;

		/* Some sanity checks. */
		ut_a(slot != NULL);
		ut_a(slot->reserved);

#if defined(UNIV_AIO_DEBUG)
		fprintf(stderr,
			"io_getevents[%c]: slot[%p] ctx[%p]"
			" seg[%lu]\n",
			(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
			slot, io_ctx, segment);
#endif

		/* We are not scribbling previous segment. */
		ut_a(slot->pos >= start_pos);

		/* We have not overstepped to next segment. */
		ut_a(slot->pos < end_pos);

		/* Mark this request as completed. The error handling
		will be done in the calling function. */
		os_mutex_enter(array->mutex);
		slot->n_bytes = events[i].res;
		slot->ret = events[i].res2;
		slot->io_already_done = TRUE;
		os_mutex_exit(array->mutex);
	}

	return;
}

/**********************************************************************//**
This function is only used in Linux native asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@return TRUE if the IO was successful */
UNIV_INTERN
ibool
os_aio_linux_handle(
/*================*/
	ulint	global_seg,	/*!< in: segment number in the aio array
				to wait for; segment 0 is the ibuf
				i/o thread, segment 1 is log i/o thread,
				then follow the non-ibuf read threads,
				and the last are the non-ibuf write
				threads. */
	fil_node_t**message1,	/*!< out: the messages passed with the */
	void**	message2,	/*!< aio request; note that in case the
				aio operation failed, these output
				parameters are valid and can be used to
				restart the operation. */
	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
{
	ulint		segment;
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
	ulint		n;
	ulint		i;
	ibool		ret = FALSE;

	/* Should never be doing Sync IO here. */
	ut_a(global_seg != ULINT_UNDEFINED);

	/* Find the array and the local segment. */
	segment = os_aio_get_array_and_local_segment(&array, global_seg);
	n = array->n_slots / array->n_segments;

	/* Loop until we have found a completed request. */
	for (;;) {
		os_mutex_enter(array->mutex);
		for (i = 0; i < n; ++i) {
			slot = os_aio_array_get_nth_slot(
				array, i + segment * n);
			if (slot->reserved && slot->io_already_done) {
				/* Something for us to work on. */
				goto found;
			}
		}

		os_mutex_exit(array->mutex);

		/* We don't have any completed request.
		Wait for some request. Note that we return
		from wait iff we have found a request. */

		srv_set_io_thread_op_info(global_seg,
			"waiting for completed aio requests");
		os_aio_linux_collect(array, segment, n);
	}

found:

	/* Note that it may be that there are more than one completed
	IO requests. We process them one at a time. We may have a case
	here to improve the performance slightly by dealing with all
	requests in one sweep. */
	srv_set_io_thread_op_info(global_seg,
				"processing completed aio requests");

	/* Ensure that we are scribbling only our segment. */
	ut_a(i < n);

	ut_ad(slot != NULL);
	ut_ad(slot->reserved);
	ut_ad(slot->io_already_done);

	*message1 = slot->message1;
	*message2 = slot->message2;

	*type = slot->type;

	if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
		ret = TRUE;

#ifdef UNIV_DO_FLUSH
		if (slot->type == OS_FILE_WRITE
		    && !os_do_not_call_flush_at_each_write
		    && !os_file_flush(slot->file)) {
			ut_error;
		}
#endif /* UNIV_DO_FLUSH */
	} else {
		errno = -slot->ret;

		/* os_file_handle_error does tell us if we should retry
		this IO. As it stands now, we don't do this retry when
		reaping requests from a different context than
		the dispatcher. This non-retry logic is the same for
		windows and linux native AIO.
		We should probably look into this to transparently
		re-submit the IO. */
		os_file_handle_error(slot->name, "Linux aio");

		ret = FALSE;
	}

	os_mutex_exit(array->mutex);

	os_aio_array_free_slot(array, slot);

	return(ret);
}
#endif /* LINUX_NATIVE_AIO */
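
/* Hypothetical usage sketch (illustrative only): an i/o handler thread
dedicated to global segment global_seg could reap completions roughly as

	fil_node_t*	message1;
	void*		message2;
	ulint		type;

	for (;;) {
		ibool	ok = os_aio_linux_handle(global_seg, &message1,
						 &message2, &type);
		ut_a(ok);
		... hand message1/message2 to the completion routine ...
	}
*/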

/**********************************************************************//**
Does simulated aio. This function should be called by an i/o-handler
thread.
@return TRUE if the aio operation succeeded */
UNIV_INTERN
ibool
os_aio_simulated_handle(
/*====================*/
	ulint	global_segment,	/*!< in: the number of the segment in the aio
				arrays to wait for; segment 0 is the ibuf
				i/o thread, segment 1 the log i/o thread,
				then follow the non-ibuf read threads, and as
				the last are the non-ibuf write threads */
	fil_node_t**message1,	/*!< out: the messages passed with the aio
				request; note that also in the case where
				the aio operation failed, these output
				parameters are valid and can be used to
				restart the operation, for example */
	void**	message2,
	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
{
	os_aio_array_t*	array;