1
/*****************************************************************************
3
Copyright (C) 1995, 2010, Innobase Oy. All Rights Reserved.
4
Copyright (C) 2009, Percona Inc.
6
Portions of this file contain modifications contributed and copyrighted
7
by Percona Inc.. Those modifications are
8
gratefully acknowledged and are described briefly in the InnoDB
9
documentation. The contributions by Percona Inc. are incorporated with
10
their permission, and subject to the conditions contained in the file
13
This program is free software; you can redistribute it and/or modify it under
14
the terms of the GNU General Public License as published by the Free Software
15
Foundation; version 2 of the License.
17
This program is distributed in the hope that it will be useful, but WITHOUT
18
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21
You should have received a copy of the GNU General Public License along with
22
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
23
St, Fifth Floor, Boston, MA 02110-1301 USA
25
*****************************************************************************/
27
/**************************************************//**
29
The interface to the operating system file i/o primitives
31
Created 10/21/1995 Heikki Tuuri
32
*******************************************************/
42
#include "srv0start.h"
49
#ifndef UNIV_HOTBACKUP
51
# include "os0thread.h"
52
#else /* !UNIV_HOTBACKUP */
54
/* Add includes for the _stat() call to compile on Windows */
55
# include <sys/types.h>
56
# include <sys/stat.h>
58
#endif /* !UNIV_HOTBACKUP */
60
#if defined(LINUX_NATIVE_AIO)
64
/* This specifies the file permissions InnoDB uses when it creates files in
65
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
69
/** Umask for creating files */
70
UNIV_INTERN ulint os_innodb_umask
71
= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
73
/** Umask for creating files */
74
UNIV_INTERN ulint os_innodb_umask = 0;
78
/* If the following is set to TRUE, we do not call os_file_flush in every
79
os_file_write. We can set this TRUE when the doublewrite buffer is used. */
80
UNIV_INTERN ibool os_do_not_call_flush_at_each_write = FALSE;
82
/* We do not call os_file_flush in every os_file_write. */
83
#endif /* UNIV_DO_FLUSH */
85
#ifndef UNIV_HOTBACKUP
86
/* We use these mutexes to protect lseek + file i/o operation, if the
87
OS does not provide an atomic pread or pwrite, or similar */
88
#define OS_FILE_N_SEEK_MUTEXES 16
89
UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
91
/* In simulated aio, merge at most this many consecutive i/os */
92
#define OS_AIO_MERGE_N_CONSECUTIVE 64
94
/**********************************************************************
96
InnoDB AIO Implementation:
97
=========================
99
We support native AIO for Windows and Linux. For the rest of the platforms
100
we simulate AIO by special io-threads servicing the IO-requests.
105
On platforms where we 'simulate' AIO, the following is a rough explanation
106
of the high level design.
107
There are four io-threads (for ibuf, log, read, write).
108
All synchronous IO requests are serviced by the calling thread using
109
os_file_write/os_file_read. The Asynchronous requests are queued up
110
in an array (there are four such arrays) by the calling thread.
111
Later these requests are picked up by the io-thread and are serviced
117
If srv_use_native_aio is not set then Windows follows the same
118
code as simulated AIO. If the flag is set then native AIO interface
119
is used. On Windows, one of the limitations is that if a file is opened
120
for AIO no synchronous IO can be done on it. Therefore we have an
121
extra fifth array to queue up synchronous IO requests.
122
There are innodb_file_io_threads helper threads. These threads work
123
on the four arrays mentioned above in Simulated AIO. No thread is
124
required for the sync array.
125
If a synchronous IO request is made, it is first queued in the sync
126
array. Then the calling thread itself waits on the request, thus
127
making the call synchronous.
128
If an AIO request is made the calling thread not only queues it in the
129
array but also submits the requests. The helper thread then collects
130
the completed IO request and calls completion routine on it.
135
If we have libaio installed on the system and innodb_use_native_aio
136
is set to TRUE we follow the code path of native AIO, otherwise we
138
There are innodb_file_io_threads helper threads. These threads work
139
on the four arrays mentioned above in Simulated AIO.
140
If a synchronous IO request is made, it is handled by calling
141
os_file_write/os_file_read.
142
If an AIO request is made the calling thread not only queues it in the
143
array but also submits the requests. The helper thread then collects
144
the completed IO request and calls completion routine on it.
146
**********************************************************************/
148
/** Flag: enable debug printout for asynchronous i/o */
149
UNIV_INTERN ibool os_aio_print_debug = FALSE;
152
/* Keys to register InnoDB I/O with performance schema */
153
UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
154
UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
155
UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
156
#endif /* UNIV_PFS_IO */
158
/** The asynchronous i/o array slot structure */
159
typedef struct os_aio_slot_struct os_aio_slot_t;
161
/** The asynchronous i/o array slot structure */
162
struct os_aio_slot_struct{
163
ibool is_read; /*!< TRUE if a read operation */
164
ulint pos; /*!< index of the slot in the aio
166
ibool reserved; /*!< TRUE if this slot is reserved */
167
time_t reservation_time;/*!< time when reserved */
168
ulint len; /*!< length of the block to read or
170
byte* buf; /*!< buffer used in i/o */
171
ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
172
ulint offset; /*!< 32 low bits of file offset in
174
ulint offset_high; /*!< 32 high bits of file offset */
175
os_file_t file; /*!< file where to read or write */
176
const char* name; /*!< file name or path */
177
ibool io_already_done;/*!< used only in simulated aio:
178
TRUE if the physical i/o already
179
made and only the slot message
180
needs to be passed to the caller
181
of os_aio_simulated_handle */
182
fil_node_t* message1; /*!< message which is given by the */
183
void* message2; /*!< the requester of an aio operation
184
and which can be used to identify
185
which pending aio operation was
188
HANDLE handle; /*!< handle object we need in the
190
OVERLAPPED control; /*!< Windows control block for the
192
#elif defined(LINUX_NATIVE_AIO)
193
struct iocb control; /* Linux control block for aio */
194
int n_bytes; /* bytes written/read. */
195
int ret; /* AIO return code */
199
/** The asynchronous i/o array structure */
200
typedef struct os_aio_array_struct os_aio_array_t;
202
/** The asynchronous i/o array structure */
203
struct os_aio_array_struct{
204
os_mutex_t mutex; /*!< the mutex protecting the aio array */
206
/*!< The event which is set to the
207
signaled state when there is space in
208
the aio outside the ibuf segment */
210
/*!< The event which is set to the
211
signaled state when there are no
212
pending i/os in this array */
213
ulint n_slots;/*!< Total number of slots in the aio
214
array. This must be divisible by
217
/*!< Number of segments in the aio
218
array of pending aio requests. A
219
thread can wait separately for any one
221
ulint cur_seg;/*!< We reserve IO requests in round
222
robin fashion to different segments.
223
This points to the segment that is to
224
be used to service next IO request. */
226
/*!< Number of reserved slots in the
227
aio array outside the ibuf segment */
228
os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
231
/*!< Pointer to an array of OS native
232
event handles where we copied the
233
handles from slots, in the same
234
order. This can be used in
235
WaitForMultipleObjects; used only in
239
#if defined(LINUX_NATIVE_AIO)
240
io_context_t* aio_ctx;
241
/* completion queue for IO. There is
242
one such queue per segment. Each thread
243
will work on one ctx exclusively. */
244
struct io_event* aio_events;
245
/* The array to collect completed IOs.
246
There is one such event for each
247
possible pending IO. The size of the
248
array is equal to n_slots. */
252
#if defined(LINUX_NATIVE_AIO)
253
/** timeout for each io_getevents() call, in nanoseconds (= 500ms). */
254
#define OS_AIO_REAP_TIMEOUT (500000000UL)
256
/** time to sleep, in microseconds (= 500ms), if io_setup() returns EAGAIN. */
257
#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
259
/** number of attempts before giving up on io_setup(). */
260
#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
263
/** Array of events used in simulated aio */
264
static os_event_t* os_aio_segment_wait_events = NULL;
266
/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
267
are NULL when the module has not yet been initialized. @{ */
268
static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */
269
static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */
270
static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */
271
static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */
272
static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */
275
/** Number of asynchronous I/O segments. Set by os_aio_init(). */
276
static ulint os_aio_n_segments = ULINT_UNDEFINED;
278
/** If the following is TRUE, read i/o handler threads try to
279
wait until a batch of new read requests has been posted */
280
static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
281
#endif /* !UNIV_HOTBACKUP */
283
UNIV_INTERN ulint os_n_file_reads = 0;
284
UNIV_INTERN ulint os_bytes_read_since_printout = 0;
285
UNIV_INTERN ulint os_n_file_writes = 0;
286
UNIV_INTERN ulint os_n_fsyncs = 0;
287
UNIV_INTERN ulint os_n_file_reads_old = 0;
288
UNIV_INTERN ulint os_n_file_writes_old = 0;
289
UNIV_INTERN ulint os_n_fsyncs_old = 0;
290
UNIV_INTERN time_t os_last_printout;
292
UNIV_INTERN ibool os_has_said_disk_full = FALSE;
294
#ifndef UNIV_HOTBACKUP
295
/** The mutex protecting the following counts of pending I/O operations */
296
static os_mutex_t os_file_count_mutex;
297
#endif /* !UNIV_HOTBACKUP */
298
/** Number of pending os_file_pread() operations */
299
UNIV_INTERN ulint os_file_n_pending_preads = 0;
300
/** Number of pending os_file_pwrite() operations */
301
UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
302
/** Number of pending write operations */
303
UNIV_INTERN ulint os_n_pending_writes = 0;
304
/** Number of pending read operations */
305
UNIV_INTERN ulint os_n_pending_reads = 0;
307
/***********************************************************************//**
308
Gets the operating system version. Currently works only on Windows.
309
@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
313
os_get_os_version(void)
314
/*===================*/
317
OSVERSIONINFO os_info;
319
os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
321
ut_a(GetVersionEx(&os_info));
323
if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
325
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
327
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
328
switch (os_info.dwMajorVersion) {
333
return (os_info.dwMinorVersion == 0) ? OS_WIN2000
336
return (os_info.dwMinorVersion == 0) ? OS_WINVISTA
352
/***********************************************************************//**
353
Retrieves the last error number if an error occurs in a file io function.
354
The number should be retrieved before any other OS calls (because they may
355
overwrite the error number). If the number is not known to this program,
356
the OS error number + 100 is returned.
357
@return error number, or OS error number + 100 */
360
os_file_get_last_error(
361
/*===================*/
362
ibool report_all_errors) /*!< in: TRUE if we want an error message
363
printed of all errors */
369
err = (ulint) GetLastError();
371
if (report_all_errors
372
|| (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
374
ut_print_timestamp(stderr);
376
" InnoDB: Operating system error number %lu"
377
" in a file operation.\n", (ulong) err);
379
if (err == ERROR_PATH_NOT_FOUND) {
381
"InnoDB: The error means the system"
382
" cannot find the path specified.\n");
384
if (srv_is_being_started) {
386
"InnoDB: If you are installing InnoDB,"
387
" remember that you must create\n"
388
"InnoDB: directories yourself, InnoDB"
389
" does not create them.\n");
391
} else if (err == ERROR_ACCESS_DENIED) {
393
"InnoDB: The error means mysqld does not have"
394
" the access rights to\n"
395
"InnoDB: the directory. It may also be"
396
" you have created a subdirectory\n"
397
"InnoDB: of the same name as a data file.\n");
398
} else if (err == ERROR_SHARING_VIOLATION
399
|| err == ERROR_LOCK_VIOLATION) {
401
"InnoDB: The error means that another program"
402
" is using InnoDB's files.\n"
403
"InnoDB: This might be a backup or antivirus"
404
" software or another instance\n"
406
" Please close it to get rid of this error.\n");
407
} else if (err == ERROR_WORKING_SET_QUOTA
408
|| err == ERROR_NO_SYSTEM_RESOURCES) {
410
"InnoDB: The error means that there are no"
411
" sufficient system resources or quota to"
412
" complete the operation.\n");
413
} else if (err == ERROR_OPERATION_ABORTED) {
415
"InnoDB: The error means that the I/O"
416
" operation has been aborted\n"
417
"InnoDB: because of either a thread exit"
418
" or an application request.\n"
419
"InnoDB: Retry attempt is made.\n");
422
"InnoDB: Some operating system error numbers"
423
" are described at\n"
426
"operating-system-error-codes.html\n");
432
if (err == ERROR_FILE_NOT_FOUND) {
433
return(OS_FILE_NOT_FOUND);
434
} else if (err == ERROR_DISK_FULL) {
435
return(OS_FILE_DISK_FULL);
436
} else if (err == ERROR_FILE_EXISTS) {
437
return(OS_FILE_ALREADY_EXISTS);
438
} else if (err == ERROR_SHARING_VIOLATION
439
|| err == ERROR_LOCK_VIOLATION) {
440
return(OS_FILE_SHARING_VIOLATION);
441
} else if (err == ERROR_WORKING_SET_QUOTA
442
|| err == ERROR_NO_SYSTEM_RESOURCES) {
443
return(OS_FILE_INSUFFICIENT_RESOURCE);
444
} else if (err == ERROR_OPERATION_ABORTED) {
445
return(OS_FILE_OPERATION_ABORTED);
452
if (report_all_errors
453
|| (err != ENOSPC && err != EEXIST)) {
455
ut_print_timestamp(stderr);
457
" InnoDB: Operating system error number %lu"
458
" in a file operation.\n", (ulong) err);
462
"InnoDB: The error means the system"
463
" cannot find the path specified.\n");
465
if (srv_is_being_started) {
467
"InnoDB: If you are installing InnoDB,"
468
" remember that you must create\n"
469
"InnoDB: directories yourself, InnoDB"
470
" does not create them.\n");
472
} else if (err == EACCES) {
474
"InnoDB: The error means mysqld does not have"
475
" the access rights to\n"
476
"InnoDB: the directory.\n");
478
if (strerror((int)err) != NULL) {
480
"InnoDB: Error number %lu"
482
err, strerror((int)err));
486
"InnoDB: Some operating system"
487
" error numbers are described at\n"
490
"operating-system-error-codes.html\n");
498
return(OS_FILE_DISK_FULL);
500
return(OS_FILE_NOT_FOUND);
502
return(OS_FILE_ALREADY_EXISTS);
506
return(OS_FILE_PATH_ERROR);
508
if (srv_use_native_aio) {
509
return(OS_FILE_AIO_RESOURCES_RESERVED);
513
if (srv_use_native_aio) {
514
return(OS_FILE_AIO_INTERRUPTED);
522
/****************************************************************//**
523
Does error handling when a file operation fails.
524
Conditionally exits (calling exit(3)) based on should_exit value and the
526
@return TRUE if we should retry the operation */
529
os_file_handle_error_cond_exit(
530
/*===========================*/
531
const char* name, /*!< in: name of a file or NULL */
532
const char* operation, /*!< in: operation */
533
ibool should_exit) /*!< in: call exit(3) if unknown error
534
and this parameter is TRUE */
538
err = os_file_get_last_error(FALSE);
540
if (err == OS_FILE_DISK_FULL) {
541
/* We only print a warning about disk full once */
543
if (os_has_said_disk_full) {
549
ut_print_timestamp(stderr);
551
" InnoDB: Encountered a problem with"
555
ut_print_timestamp(stderr);
557
" InnoDB: Disk is full. Try to clean the disk"
558
" to free space.\n");
560
os_has_said_disk_full = TRUE;
565
} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
568
} else if (err == OS_FILE_AIO_INTERRUPTED) {
571
} else if (err == OS_FILE_ALREADY_EXISTS
572
|| err == OS_FILE_PATH_ERROR) {
575
} else if (err == OS_FILE_SHARING_VIOLATION) {
577
os_thread_sleep(10000000); /* 10 sec */
579
} else if (err == OS_FILE_INSUFFICIENT_RESOURCE) {
581
os_thread_sleep(100000); /* 100 ms */
583
} else if (err == OS_FILE_OPERATION_ABORTED) {
585
os_thread_sleep(100000); /* 100 ms */
589
fprintf(stderr, "InnoDB: File name %s\n", name);
592
fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
596
fprintf(stderr, "InnoDB: Cannot continue operation.\n");
607
/****************************************************************//**
608
Does error handling when a file operation fails.
Forwards to os_file_handle_error_cond_exit() with should_exit set to
TRUE.
609
@return TRUE if we should retry the operation */
612
os_file_handle_error(
613
/*=================*/
614
const char* name, /*!< in: name of a file or NULL */
615
const char* operation)/*!< in: operation */
617
/* exit in case of unknown error: should_exit is passed as TRUE */
618
return(os_file_handle_error_cond_exit(name, operation, TRUE));
621
/****************************************************************//**
622
Does error handling when a file operation fails.
Forwards to os_file_handle_error_cond_exit() with should_exit set to
FALSE.
623
@return TRUE if we should retry the operation */
626
os_file_handle_error_no_exit(
627
/*=========================*/
628
const char* name, /*!< in: name of a file or NULL */
629
const char* operation)/*!< in: operation */
631
/* don't exit in case of unknown error: should_exit is passed as FALSE */
632
return(os_file_handle_error_cond_exit(name, operation, FALSE));
636
#define USE_FILE_LOCK
637
#if defined(UNIV_HOTBACKUP) || defined(__WIN__)
638
/* InnoDB Hot Backup does not lock the data files.
639
* On Windows, mandatory locking is used.
641
# undef USE_FILE_LOCK
644
/****************************************************************//**
645
Obtain an exclusive lock on a file.
The lock is requested with fcntl(F_SETLK), which returns an error
immediately instead of waiting when the lock cannot be granted.
646
@return 0 on success */
651
int fd, /*!< in: file descriptor */
652
const char* name) /*!< in: file name */
656
/* l_whence = SEEK_SET with l_start = l_len = 0: the locked range starts
at offset 0 and, per POSIX, a zero l_len extends it to the end of the
file, so the whole file is covered */
lk.l_whence = SEEK_SET;
657
lk.l_start = lk.l_len = 0;
658
if (fcntl(fd, F_SETLK, &lk) == -1) {
660
"InnoDB: Unable to lock %s, error: %d\n", name, errno);
662
/* EAGAIN or EACCES: a conflicting lock is held by another process.
NOTE(review): errno is read again here after the message above has been
printed; presumably it still holds the fcntl() value - confirm */
if (errno == EAGAIN || errno == EACCES) {
664
"InnoDB: Check that you do not already have"
665
" another mysqld process\n"
666
"InnoDB: using the same InnoDB data"
675
#endif /* USE_FILE_LOCK */
677
#ifndef UNIV_HOTBACKUP
678
/****************************************************************//**
679
Creates os_file_count_mutex (the mutex protecting the counts of pending
I/O operations) and the seek mutexes used in positioned reads and writes. */
682
os_io_init_simple(void)
683
/*===================*/
687
os_file_count_mutex = os_mutex_create();
689
/* create one mutex for every element of os_file_seek_mutexes[] */
for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
690
os_file_seek_mutexes[i] = os_mutex_create();
694
/***********************************************************************//**
695
Creates a temporary file. This function is like tmpfile(3), but
696
the temporary file is created in the MySQL temporary directory.
The file descriptor is obtained from innobase_mysql_tmpfile() and is
then wrapped in a stdio stream with fdopen().
697
@return temporary file handle, or NULL on error */
700
os_file_create_tmpfile(void)
701
/*========================*/
704
int fd = innobase_mysql_tmpfile();
707
/* "w+b": binary stream opened for both reading and writing */
file = fdopen(fd, "w+b");
711
ut_print_timestamp(stderr);
713
" InnoDB: Error: unable to create temporary file;"
714
" errno: %d\n", errno);
722
#endif /* !UNIV_HOTBACKUP */
724
/***********************************************************************//**
725
The os_file_opendir() function opens a directory stream corresponding to the
726
directory named by the dirname argument. The directory stream is positioned
727
at the first entry. In both Unix and Windows we automatically skip the '.'
728
and '..' items at the start of the directory listing.
729
@return directory stream, NULL if error */
734
const char* dirname, /*!< in: directory name; it must not
735
contain a trailing '\' or '/' */
736
ibool error_is_fatal) /*!< in: TRUE if we should treat an
737
error as a fatal error; if we try to
738
open symlinks then we do not wish a
739
fatal error if it happens not to be
744
LPWIN32_FIND_DATA lpFindFileData;
745
char path[OS_FILE_MAX_PATH + 3];
747
ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
749
strcpy(path, dirname);
750
strcpy(path + strlen(path), "\\*");
752
/* Note that in Windows opening the 'directory stream' also retrieves
753
the first entry in the directory. Since it is '.', that is no problem,
754
as we will skip over the '.' and '..' entries anyway. */
756
lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
758
dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
760
ut_free(lpFindFileData);
762
if (dir == INVALID_HANDLE_VALUE) {
764
if (error_is_fatal) {
765
os_file_handle_error(dirname, "opendir");
773
dir = opendir(dirname);
775
if (dir == NULL && error_is_fatal) {
776
os_file_handle_error(dirname, "opendir");
783
/***********************************************************************//**
784
Closes a directory stream.
785
@return 0 if success, -1 if failure */
790
os_file_dir_t dir) /*!< in: directory stream */
795
ret = FindClose(dir);
798
os_file_handle_error_no_exit(NULL, "closedir");
810
os_file_handle_error_no_exit(NULL, "closedir");
817
/***********************************************************************//**
818
This function returns information about the next file in the directory. We jump
819
over the '.' and '..' entries in the directory.
820
@return 0 if ok, -1 if error, 1 if at the end of the directory */
823
os_file_readdir_next_file(
824
/*======================*/
825
const char* dirname,/*!< in: directory name or path */
826
os_file_dir_t dir, /*!< in: directory stream */
827
os_file_stat_t* info) /*!< in/out: buffer where the info is returned */
830
LPWIN32_FIND_DATA lpFindFileData;
833
lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
835
ret = FindNextFile(dir, lpFindFileData);
838
ut_a(strlen((char *) lpFindFileData->cFileName)
841
if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
842
|| strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
847
strcpy(info->name, (char *) lpFindFileData->cFileName);
849
info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
850
+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
853
if (lpFindFileData->dwFileAttributes
854
& FILE_ATTRIBUTE_REPARSE_POINT) {
855
/* TODO: test Windows symlinks */
856
/* TODO: MySQL has apparently its own symlink
857
implementation in Windows, dbname.sym can
858
redirect a database directory:
859
REFMAN "windows-symbolic-links.html" */
860
info->type = OS_FILE_TYPE_LINK;
861
} else if (lpFindFileData->dwFileAttributes
862
& FILE_ATTRIBUTE_DIRECTORY) {
863
info->type = OS_FILE_TYPE_DIR;
865
/* It is probably safest to assume that all other
866
file types are normal. Better to check them rather
867
than blindly skip them. */
869
info->type = OS_FILE_TYPE_FILE;
873
ut_free(lpFindFileData);
877
} else if (GetLastError() == ERROR_NO_MORE_FILES) {
881
os_file_handle_error_no_exit(dirname,
882
"readdir_next_file");
889
struct stat statinfo;
890
#ifdef HAVE_READDIR_R
891
char dirent_buf[sizeof(struct dirent)
892
+ _POSIX_PATH_MAX + 100];
893
/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
894
the max file name len; but in most standards, the
895
length is NAME_MAX; we add 100 to be even safer */
900
#ifdef HAVE_READDIR_R
901
ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
905
/* On AIX, only if we got non-NULL 'ent' (result) value and
906
a non-zero 'ret' (return) value, it indicates a failed
907
readdir_r() call. A NULL 'ent' with a non-zero 'ret'
908
would indicate the "end of the directory" is reached. */
913
"InnoDB: cannot read directory %s, error %lu\n",
914
dirname, (ulong)ret);
920
/* End of directory */
925
ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
934
ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
936
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
941
strcpy(info->name, ent->d_name);
943
full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
945
sprintf(full_path, "%s/%s", dirname, ent->d_name);
947
ret = stat(full_path, &statinfo);
951
if (errno == ENOENT) {
952
/* readdir() returned a file that does not exist,
953
it must have been deleted in the meantime. Do what
954
would have happened if the file was deleted before
955
readdir() - ignore and go to the next entry.
956
If this is the last entry then info->name will still
957
contain the name of the deleted file when this
958
function returns, but this is not an issue since the
959
caller shouldn't be looking at info when end of
960
directory is returned. */
967
os_file_handle_error_no_exit(full_path, "stat");
974
info->size = (ib_int64_t)statinfo.st_size;
976
if (S_ISDIR(statinfo.st_mode)) {
977
info->type = OS_FILE_TYPE_DIR;
978
} else if (S_ISLNK(statinfo.st_mode)) {
979
info->type = OS_FILE_TYPE_LINK;
980
} else if (S_ISREG(statinfo.st_mode)) {
981
info->type = OS_FILE_TYPE_FILE;
983
info->type = OS_FILE_TYPE_UNKNOWN;
992
/*****************************************************************//**
993
This function attempts to create a directory named pathname. The new directory
994
gets default permissions. On Unix the permissions are (0770 & ~umask). If the
995
directory exists already, nothing is done and the call succeeds, unless the
996
fail_if_exists argument is true.
Any other failure is passed to os_file_handle_error().
997
@return TRUE if call succeeds, FALSE on error */
1000
os_file_create_directory(
1001
/*=====================*/
1002
const char* pathname, /*!< in: directory name as
1003
null-terminated string */
1004
ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory
1005
is treated as an error. */
1010
/* Windows: NULL means no explicit security attributes are passed */
rcode = CreateDirectory((LPCTSTR) pathname, NULL);
1012
/* an already existing directory is tolerated unless fail_if_exists */
|| (GetLastError() == ERROR_ALREADY_EXISTS
1013
&& !fail_if_exists))) {
1015
os_file_handle_error(pathname, "CreateDirectory");
1024
/* Unix: mode 0770 requests rwx for owner and group, nothing for others */
rcode = mkdir(pathname, 0770);
1026
/* failure, unless mkdir() succeeded or the directory already existed
(EEXIST) and the caller did not ask for that to be an error */
if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
1028
os_file_handle_error(pathname, "mkdir");
1037
/****************************************************************//**
1038
NOTE! Use the corresponding macro os_file_create_simple(), not directly
1040
A simple function to open or create a file.
1041
@return own: handle to the file, not defined if error, error number
1042
can be retrieved with os_file_get_last_error */
1045
os_file_create_simple_func(
1046
/*=======================*/
1047
const char* name, /*!< in: name of the file or path as a
1048
null-terminated string */
1049
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is
1050
opened (if does not exist, error), or
1051
OS_FILE_CREATE if a new file is created
1052
(if exists, error), or
1053
OS_FILE_CREATE_PATH if new file
1054
(if exists, error) and subdirectories along
1055
its path are created (if needed)*/
1056
ulint access_type,/*!< in: OS_FILE_READ_ONLY or
1057
OS_FILE_READ_WRITE */
1058
ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1064
DWORD attributes = 0;
1070
if (create_mode == OS_FILE_OPEN) {
1071
create_flag = OPEN_EXISTING;
1072
} else if (create_mode == OS_FILE_CREATE) {
1073
create_flag = CREATE_NEW;
1074
} else if (create_mode == OS_FILE_CREATE_PATH) {
1075
/* create subdirs along the path if needed */
1076
*success = os_file_create_subdirs_if_needed(name);
1080
create_flag = CREATE_NEW;
1081
create_mode = OS_FILE_CREATE;
1087
if (access_type == OS_FILE_READ_ONLY) {
1088
access = GENERIC_READ;
1089
} else if (access_type == OS_FILE_READ_WRITE) {
1090
access = GENERIC_READ | GENERIC_WRITE;
1096
file = CreateFile((LPCTSTR) name,
1098
FILE_SHARE_READ | FILE_SHARE_WRITE,
1099
/* file can be read and written also
1100
by other processes */
1101
NULL, /* default security attributes */
1104
NULL); /*!< no template file */
1106
if (file == INVALID_HANDLE_VALUE) {
1109
retry = os_file_handle_error(name,
1110
create_mode == OS_FILE_OPEN ?
1128
if (create_mode == OS_FILE_OPEN) {
1129
if (access_type == OS_FILE_READ_ONLY) {
1130
create_flag = O_RDONLY;
1132
create_flag = O_RDWR;
1134
} else if (create_mode == OS_FILE_CREATE) {
1135
create_flag = O_RDWR | O_CREAT | O_EXCL;
1136
} else if (create_mode == OS_FILE_CREATE_PATH) {
1137
/* create subdirs along the path if needed */
1138
*success = os_file_create_subdirs_if_needed(name);
1142
create_flag = O_RDWR | O_CREAT | O_EXCL;
1143
create_mode = OS_FILE_CREATE;
1149
if (create_mode == OS_FILE_CREATE) {
1150
file = open(name, create_flag, S_IRUSR | S_IWUSR
1151
| S_IRGRP | S_IWGRP);
1153
file = open(name, create_flag);
1159
retry = os_file_handle_error(name,
1160
create_mode == OS_FILE_OPEN ?
1165
#ifdef USE_FILE_LOCK
1166
} else if (access_type == OS_FILE_READ_WRITE
1167
&& os_file_lock(file, name)) {
1177
#endif /* __WIN__ */
1180
/****************************************************************//**
1181
NOTE! Use the corresponding macro
1182
os_file_create_simple_no_error_handling(), not directly this function!
1183
A simple function to open or create a file.
1184
@return own: handle to the file, not defined if error, error number
1185
can be retrieved with os_file_get_last_error */
1188
os_file_create_simple_no_error_handling_func(
1189
/*=========================================*/
1190
const char* name, /*!< in: name of the file or path as a
1191
null-terminated string */
1192
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
1193
is opened (if does not exist, error), or
1194
OS_FILE_CREATE if a new file is created
1195
(if exists, error) */
1196
ulint access_type,/*!< in: OS_FILE_READ_ONLY,
1197
OS_FILE_READ_WRITE, or
1198
OS_FILE_READ_ALLOW_DELETE; the last option is
1199
used by a backup program reading the file */
1200
ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1206
DWORD attributes = 0;
1207
DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
1211
if (create_mode == OS_FILE_OPEN) {
1212
create_flag = OPEN_EXISTING;
1213
} else if (create_mode == OS_FILE_CREATE) {
1214
create_flag = CREATE_NEW;
1220
if (access_type == OS_FILE_READ_ONLY) {
1221
access = GENERIC_READ;
1222
} else if (access_type == OS_FILE_READ_WRITE) {
1223
access = GENERIC_READ | GENERIC_WRITE;
1224
} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1225
access = GENERIC_READ;
1226
share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
1227
| FILE_SHARE_WRITE; /*!< A backup program has to give
1228
mysqld the maximum freedom to
1229
do what it likes with the
1236
file = CreateFile((LPCTSTR) name,
1239
NULL, /* default security attributes */
1242
NULL); /*!< no template file */
1244
if (file == INVALID_HANDLE_VALUE) {
1257
if (create_mode == OS_FILE_OPEN) {
1258
if (access_type == OS_FILE_READ_ONLY) {
1259
create_flag = O_RDONLY;
1261
create_flag = O_RDWR;
1263
} else if (create_mode == OS_FILE_CREATE) {
1264
create_flag = O_RDWR | O_CREAT | O_EXCL;
1270
if (create_mode == OS_FILE_CREATE) {
1271
file = open(name, create_flag, S_IRUSR | S_IWUSR
1272
| S_IRGRP | S_IWGRP);
1274
file = open(name, create_flag);
1279
#ifdef USE_FILE_LOCK
1280
} else if (access_type == OS_FILE_READ_WRITE
1281
&& os_file_lock(file, name)) {
1291
#endif /* __WIN__ */
1294
/****************************************************************//**
1295
Tries to disable OS caching on an opened file descriptor.
Uses directio(DIRECTIO_ON) when UNIV_SOLARIS and DIRECTIO_ON are defined,
otherwise fcntl(F_SETFL, O_DIRECT) when O_DIRECT is defined. A failure
is only reported on stderr ("continuing anyway"). */
1298
os_file_set_nocache(
1299
/*================*/
1300
int fd, /*!< in: file descriptor to alter */
1301
const char* file_name, /*!< in: file name, used in the
1302
diagnostic message */
1303
const char* operation_name) /*!< in: "open" or "create"; used in the
1304
diagnostic message */
1306
/* some versions of Solaris may not have DIRECTIO_ON */
1307
#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1308
if (directio(fd, DIRECTIO_ON) == -1) {
1310
/* save errno before the printing calls below can overwrite it */
errno_save = (int)errno;
1311
ut_print_timestamp(stderr);
1313
" InnoDB: Failed to set DIRECTIO_ON "
1314
"on file %s: %s: %s, continuing anyway\n",
1315
file_name, operation_name, strerror(errno_save));
1317
#elif defined(O_DIRECT)
/* NOTE(review): F_SETFL is given O_DIRECT alone, not the current
status flags OR'ed with O_DIRECT - confirm that no other status flags
of fd need to be preserved */
1318
if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1320
/* save errno before the printing calls below can overwrite it */
errno_save = (int)errno;
1321
ut_print_timestamp(stderr);
1323
" InnoDB: Failed to set O_DIRECT "
1324
"on file %s: %s: %s, continuing anyway\n",
1325
file_name, operation_name, strerror(errno_save));
1326
/* EINVAL gets an extra hint: see the message text below */
if (errno_save == EINVAL) {
1327
ut_print_timestamp(stderr);
1329
" InnoDB: O_DIRECT is known to result in "
1330
"'Invalid argument' on Linux on tmpfs, "
1331
"see MySQL Bug#26662\n");
1334
#else /* Required for OSX */
1337
/* presumably silences an unused-parameter warning - confirm */
(void)operation_name;
1341
/****************************************************************//**
1342
NOTE! Use the corresponding macro os_file_create(), not directly
1344
Opens an existing file or creates a new one.
1345
@return own: handle to the file, not defined if error, error number
1346
can be retrieved with os_file_get_last_error */
1349
os_file_create_func(
1350
/*================*/
1351
const char* name, /*!< in: name of the file or path as a
1352
null-terminated string */
1353
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
1354
is opened (if does not exist, error), or
1355
OS_FILE_CREATE if a new file is created
1357
OS_FILE_OVERWRITE if a new file is created
1358
or an old overwritten;
1359
OS_FILE_OPEN_RAW, if a raw device or disk
1360
partition should be opened */
1361
ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
1362
non-buffered i/o is desired,
1363
OS_FILE_NORMAL, if any normal file;
1364
NOTE that it also depends on type, os_aio_..
1365
and srv_.. variables whether we really use
1366
async i/o or unbuffered i/o: look in the
1367
function source code for the exact rules */
1368
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
1369
ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1373
DWORD share_mode = FILE_SHARE_READ;
1380
if (create_mode == OS_FILE_OPEN_RAW) {
1381
create_flag = OPEN_EXISTING;
1382
share_mode = FILE_SHARE_WRITE;
1383
} else if (create_mode == OS_FILE_OPEN
1384
|| create_mode == OS_FILE_OPEN_RETRY) {
1385
create_flag = OPEN_EXISTING;
1386
} else if (create_mode == OS_FILE_CREATE) {
1387
create_flag = CREATE_NEW;
1388
} else if (create_mode == OS_FILE_OVERWRITE) {
1389
create_flag = CREATE_ALWAYS;
1395
if (purpose == OS_FILE_AIO) {
1396
/* If specified, use asynchronous (overlapped) io and no
1397
buffering of writes in the OS */
1400
if (srv_use_native_aio) {
1401
attributes = attributes | FILE_FLAG_OVERLAPPED;
1404
#ifdef UNIV_NON_BUFFERED_IO
1405
# ifndef UNIV_HOTBACKUP
1406
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1407
/* Do not use unbuffered i/o to log files because
1408
value 2 denotes that we do not flush the log at every
1409
commit, but only once per second */
1410
} else if (srv_win_file_flush_method
1411
== SRV_WIN_IO_UNBUFFERED) {
1412
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1414
# else /* !UNIV_HOTBACKUP */
1415
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1416
# endif /* !UNIV_HOTBACKUP */
1417
#endif /* UNIV_NON_BUFFERED_IO */
1418
} else if (purpose == OS_FILE_NORMAL) {
1420
#ifdef UNIV_NON_BUFFERED_IO
1421
# ifndef UNIV_HOTBACKUP
1422
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1423
/* Do not use unbuffered i/o to log files because
1424
value 2 denotes that we do not flush the log at every
1425
commit, but only once per second */
1426
} else if (srv_win_file_flush_method
1427
== SRV_WIN_IO_UNBUFFERED) {
1428
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1430
# else /* !UNIV_HOTBACKUP */
1431
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1432
# endif /* !UNIV_HOTBACKUP */
1433
#endif /* UNIV_NON_BUFFERED_IO */
1439
file = CreateFile((LPCTSTR) name,
1440
GENERIC_READ | GENERIC_WRITE, /* read and write
1442
share_mode, /* File can be read also by other
1443
processes; we must give the read
1444
permission because of ibbackup. We do
1445
not give the write permission to
1446
others because if one would succeed to
1447
start 2 instances of mysqld on the
1448
SAME files, that could cause severe
1449
database corruption! When opening
1450
raw disk partitions, Microsoft manuals
1451
say that we must give also the write
1453
NULL, /* default security attributes */
1456
NULL); /*!< no template file */
1458
if (file == INVALID_HANDLE_VALUE) {
1461
/* When srv_file_per_table is on, file creation failure may not
1462
be critical to the whole instance. Do not crash the server in
1463
case of unknown errors.
1464
Please note "srv_file_per_table" is a global variable with
1465
no explicit synchronization protection. It could be
1466
changed during this execution path. It might not have the
1467
same value as the one when building the table definition */
1468
if (srv_file_per_table) {
1469
retry = os_file_handle_error_no_exit(name,
1470
create_mode == OS_FILE_CREATE ?
1473
retry = os_file_handle_error(name,
1474
create_mode == OS_FILE_CREATE ?
1490
const char* mode_str = NULL;
1495
if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
1496
|| create_mode == OS_FILE_OPEN_RETRY) {
1498
create_flag = O_RDWR;
1499
} else if (create_mode == OS_FILE_CREATE) {
1500
mode_str = "CREATE";
1501
create_flag = O_RDWR | O_CREAT | O_EXCL;
1502
} else if (create_mode == OS_FILE_OVERWRITE) {
1503
mode_str = "OVERWRITE";
1504
create_flag = O_RDWR | O_CREAT | O_TRUNC;
1510
ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
1511
ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
1514
/* We let O_SYNC only affect log files; note that we map O_DSYNC to
1515
O_SYNC because the datasync options seemed to corrupt files in 2001
1516
in both Linux and Solaris */
1517
if (type == OS_LOG_FILE
1518
&& srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1521
fprintf(stderr, "Using O_SYNC for file %s\n", name);
1524
create_flag = create_flag | O_SYNC;
1528
file = open(name, create_flag, os_innodb_umask);
1533
/* When srv_file_per_table is on, file creation failure may not
1534
be critical to the whole instance. Do not crash the server in
1535
case of unknown errors.
1536
Please note "srv_file_per_table" is a global variable with
1537
no explicit synchronization protection. It could be
1538
changed during this execution path. It might not have the
1539
same value as the one when building the table definition */
1540
if (srv_file_per_table) {
1541
retry = os_file_handle_error_no_exit(name,
1542
create_mode == OS_FILE_CREATE ?
1545
retry = os_file_handle_error(name,
1546
create_mode == OS_FILE_CREATE ?
1553
return(file /* -1 */);
1560
/* We disable OS caching (O_DIRECT) only on data files */
1561
if (type != OS_LOG_FILE
1562
&& srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
1564
os_file_set_nocache(file, name, mode_str);
1567
#ifdef USE_FILE_LOCK
1568
if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
1570
if (create_mode == OS_FILE_OPEN_RETRY) {
1572
ut_print_timestamp(stderr);
1573
fputs(" InnoDB: Retrying to lock"
1574
" the first data file\n",
1576
for (i = 0; i < 100; i++) {
1577
os_thread_sleep(1000000);
1578
if (!os_file_lock(file, name)) {
1583
ut_print_timestamp(stderr);
1584
fputs(" InnoDB: Unable to open the first data file\n",
1592
#endif /* USE_FILE_LOCK */
1595
#endif /* __WIN__ */
1598
/***********************************************************************//**
1599
Deletes a file if it exists. The file has to be closed before calling this.
1600
@return TRUE if success */
1603
os_file_delete_if_exists(
1604
/*=====================*/
1605
const char* name) /*!< in: file path as a null-terminated string */
1611
/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1614
ret = DeleteFile((LPCTSTR)name);
1620
if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1621
/* the file does not exist, this is not an error */
1628
if (count > 100 && 0 == (count % 10)) {
1630
"InnoDB: Warning: cannot delete file %s\n"
1631
"InnoDB: Are you running ibbackup"
1632
" to back up the file?\n", name);
1634
os_file_get_last_error(TRUE); /* print error information */
1637
os_thread_sleep(1000000); /* sleep for a second */
1650
if (ret != 0 && errno != ENOENT) {
1651
os_file_handle_error_no_exit(name, "delete");
1660
/***********************************************************************//**
1661
Deletes a file. The file has to be closed before calling this.
1662
@return TRUE if success */
1667
const char* name) /*!< in: file path as a null-terminated string */
1673
/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1676
ret = DeleteFile((LPCTSTR)name);
1682
if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1683
/* If the file does not exist, we classify this as a 'mild'
1691
if (count > 100 && 0 == (count % 10)) {
1693
"InnoDB: Warning: cannot delete file %s\n"
1694
"InnoDB: Are you running ibbackup"
1695
" to back up the file?\n", name);
1697
os_file_get_last_error(TRUE); /* print error information */
1700
os_thread_sleep(1000000); /* sleep for a second */
1714
os_file_handle_error_no_exit(name, "delete");
1723
/***********************************************************************//**
1724
NOTE! Use the corresponding macro os_file_rename(), not directly this function!
1725
Renames a file (can also move it to another directory). It is safest that the
1726
file is closed before calling this function.
The Windows path calls MoveFile(); the other path calls rename(). Both
paths report through os_file_handle_error_no_exit() with the operation name
rename and with the old path only; the new path is not passed to the
handler.
1727
@return TRUE if success */
1730
os_file_rename_func(
1731
/*================*/
1732
const char* oldpath,/*!< in: old file path as a null-terminated
1734
const char* newpath)/*!< in: new file path */
1739
/* Windows path: rename or move through the Win32 MoveFile() call */
ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
1745
/* the report identifies the operation by oldpath only */
os_file_handle_error_no_exit(oldpath, "rename");
1751
/* non-Windows path: rename() from the C library */
ret = rename(oldpath, newpath);
1754
/* the report identifies the operation by oldpath only */
os_file_handle_error_no_exit(oldpath, "rename");
1763
/***********************************************************************//**
1764
NOTE! Use the corresponding macro os_file_close(), not directly this function!
1765
Closes a file handle. In case of error, error number can be retrieved with
1766
os_file_get_last_error.
1767
@return TRUE if success */
1772
os_file_t file) /*!< in, own: handle to a file */
1779
ret = CloseHandle(file);
1785
os_file_handle_error(NULL, "close");
1794
os_file_handle_error(NULL, "close");
1803
#ifdef UNIV_HOTBACKUP
1804
/***********************************************************************//**
1805
Closes a file handle.
This definition is only compiled when UNIV_HOTBACKUP is defined. The
Windows path closes the handle with CloseHandle().
1806
@return TRUE if success */
1809
os_file_close_no_error_handling(
1810
/*============================*/
1811
os_file_t file) /*!< in, own: handle to a file */
1818
/* Win32 handle close; ret carries the result of CloseHandle() */
ret = CloseHandle(file);
1838
#endif /* UNIV_HOTBACKUP */
1840
/***********************************************************************//**
1842
Reports the size of an open file as two 32-bit halves.
The Windows path queries GetFileSize(). The other path seeks to the end of
the file with lseek(file, 0, SEEK_END) and uses the resulting offset, which
also leaves the file offset positioned at the end of the file.
@return TRUE if success */
1847
os_file_t file, /*!< in: handle to a file */
1848
ulint* size, /*!< out: least significant 32 bits of file
1850
ulint* size_high)/*!< out: most significant 32 bits of size */
1856
/* Windows path: low word is returned, high word is stored through &high */
low = GetFileSize(file, &high);
1858
/* 0xFFFFFFFF is only treated as a failure when GetLastError() also reports
an error; otherwise it is a legitimate low word */
if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
1869
/* non-Windows path: the offset of the end of the file is its size */
offs = lseek(file, 0, SEEK_END);
1871
/* lseek() signals failure with (off_t)-1 */
if (offs == ((off_t)-1)) {
1876
/* a wide off_t is split into low and high 32-bit words; a 4-byte off_t
fits in *size alone */
if (sizeof(off_t) > 4) {
1877
*size = (ulint)(offs & 0xFFFFFFFFUL);
1878
*size_high = (ulint)(offs >> 32);
1880
*size = (ulint) offs;
1888
/***********************************************************************//**
1889
Gets file size as a 64-bit integer ib_int64_t.
The size is obtained from os_file_get_size() as a low word and a high word,
which are then combined as (size_high << 32) + size.
1890
@return size in bytes, -1 if error */
1893
os_file_get_size_as_iblonglong(
1894
/*===========================*/
1895
os_file_t file) /*!< in: handle to a file */
1901
/* fetch the size as separate low and high words */
success = os_file_get_size(file, &size, &size_high);
1908
/* both words are widened to ib_int64_t first, so the shift places the high
word in the upper 32 bits without truncation */
return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size);
1911
/***********************************************************************//**
1912
Write the specified number of zeros to a newly created file.
1913
@return TRUE if success */
1918
const char* name, /*!< in: name of the file or path as a
1919
null-terminated string */
1920
os_file_t file, /*!< in: handle to a file */
1921
ulint size, /*!< in: least significant 32 bits of file
1923
ulint size_high)/*!< in: most significant 32 bits of size */
1925
ib_int64_t current_size;
1926
ib_int64_t desired_size;
1932
ut_a(size == (size & 0xFFFFFFFF));
1935
desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
1937
/* Write up to 1 megabyte at a time. */
1938
buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
1940
buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
1942
/* Align the buffer for possible raw i/o */
1943
buf = ut_align(buf2, UNIV_PAGE_SIZE);
1945
/* Write buffer full of zeros */
1946
memset(buf, 0, buf_size);
1948
if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1950
fprintf(stderr, "InnoDB: Progress in MB:");
1953
while (current_size < desired_size) {
1956
if (desired_size - current_size < (ib_int64_t) buf_size) {
1957
n_bytes = (ulint) (desired_size - current_size);
1962
ret = os_file_write(name, file, buf,
1963
(ulint)(current_size & 0xFFFFFFFF),
1964
(ulint)(current_size >> 32),
1968
goto error_handling;
1971
/* Print about progress for each 100 MB written */
1972
if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
1973
!= current_size / (ib_int64_t)(100 * 1024 * 1024)) {
1975
fprintf(stderr, " %lu00",
1976
(ulong) ((current_size + n_bytes)
1977
/ (ib_int64_t)(100 * 1024 * 1024)));
1980
current_size += n_bytes;
1983
if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1985
fprintf(stderr, "\n");
1990
ret = os_file_flush(file);
2000
/***********************************************************************//**
2001
Truncates a file at its current position.
The Windows path maps the stdio stream to an OS handle with
_get_osfhandle(fileno(file)) and calls SetEndOfFile() on it. The other path
calls ftruncate() with the offset reported by ftell() and negates the
result, so that the zero returned by ftruncate() on success becomes a true
value.
2002
@return TRUE if success */
2007
FILE* file) /*!< in: file to be truncated */
2010
/* Windows path: SetEndOfFile() works on the OS handle behind the stream */
HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
2011
return(SetEndOfFile(h));
2013
/* NOTE(review): the ftell() result is passed on unchecked; ftell() can
report failure with -1 -- confirm callers cannot reach that case. */
return(!ftruncate(fileno(file), ftell(file)));
2014
#endif /* __WIN__ */
2018
/***********************************************************************//**
2019
Wrapper to fsync(2) that retries the call on some errors.
2020
Returns the value 0 if successful; otherwise the value -1 is returned and
2021
the global variable errno is set to indicate the error.
When the call fails with ENOLCK the wrapper sleeps for 0.2 seconds, and a
retry message is emitted whenever the failure counter is a multiple of 100.
This wrapper is compiled for non-Windows builds only.
2022
@return 0 if success, -1 otherwise */
2028
os_file_t file) /*!< in: handle to a file */
2041
/* ENOLCK (no locks available) is the failure that triggers a retry here */
if (ret == -1 && errno == ENOLCK) {
2043
/* rate-limit the diagnostic: only when the failure counter is a multiple
of 100 */
if (failures % 100 == 0) {
2045
ut_print_timestamp(stderr);
2047
" InnoDB: fsync(): "
2048
"No locks available; retrying\n");
2051
/* back off before the next attempt; the argument is in microseconds
according to the inline annotation */
os_thread_sleep(200000 /* 0.2 sec */);
2064
#endif /* !__WIN__ */
2066
/***********************************************************************//**
2067
NOTE! Use the corresponding macro os_file_flush(), not directly this function!
2068
Flushes the write buffers of a given file to the disk.
2069
@return TRUE if success */
2074
os_file_t file) /*!< in, own: handle to a file */
2083
ret = FlushFileBuffers(file);
2089
/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
2090
actually a raw device, we choose to ignore that error if we are using
2093
if (srv_start_raw_disk_in_use && GetLastError()
2094
== ERROR_INVALID_FUNCTION) {
2098
os_file_handle_error(NULL, "flush");
2100
/* It is a fatal error if a file flush does not succeed, because then
2101
the database can get corrupt on disk */
2108
#if defined(HAVE_DARWIN_THREADS)
2109
# ifndef F_FULLFSYNC
2110
/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
2111
# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
2112
# elif F_FULLFSYNC != 51
2113
# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
2115
/* Apple has disabled fsync() for internal disk drives in OS X. That
2116
caused corruption for a user when he tested a power outage. Let us in
2117
OS X use a nonstandard flush method recommended by an Apple
2120
if (!srv_have_fullfsync) {
2121
/* If we are not on an operating system that supports this,
2122
then fall back to a plain fsync. */
2124
ret = os_file_fsync(file);
2126
ret = fcntl(file, F_FULLFSYNC, NULL);
2129
/* If we are not on a file system that supports this,
2130
then fall back to a plain fsync. */
2131
ret = os_file_fsync(file);
2135
ret = os_file_fsync(file);
2142
/* Since Linux returns EINVAL if the 'file' is actually a raw device,
2143
we choose to ignore that error if we are using raw disks */
2145
if (srv_start_raw_disk_in_use && errno == EINVAL) {
2150
ut_print_timestamp(stderr);
2153
" InnoDB: Error: the OS said file flush did not succeed\n");
2155
os_file_handle_error(NULL, "flush");
2157
/* It is a fatal error if a file flush does not succeed, because then
2158
the database can get corrupt on disk */
2166
/*******************************************************************//**
2167
Does a synchronous read operation in Posix.
2168
@return number of bytes read, -1 if error */
2173
os_file_t file, /*!< in: handle to a file */
2174
void* buf, /*!< in: buffer where to read */
2175
ulint n, /*!< in: number of bytes to read */
2176
ulint offset, /*!< in: least significant 32 bits of file
2177
offset from where to read */
2178
ulint offset_high) /*!< in: most significant 32 bits of
2182
#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2184
#endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
2186
ut_a((offset & 0xFFFFFFFFUL) == offset);
2188
/* If off_t is > 4 bytes in size, then we assume we can pass a
2191
if (sizeof(off_t) > 4) {
2192
offs = (off_t)offset + (((off_t)offset_high) << 32);
2195
offs = (off_t)offset;
2197
if (offset_high > 0) {
2199
"InnoDB: Error: file read at offset > 4 GB\n");
2205
#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2206
os_mutex_enter(os_file_count_mutex);
2207
os_file_n_pending_preads++;
2208
os_n_pending_reads++;
2209
os_mutex_exit(os_file_count_mutex);
2211
n_bytes = pread(file, buf, (ssize_t)n, offs);
2213
os_mutex_enter(os_file_count_mutex);
2214
os_file_n_pending_preads--;
2215
os_n_pending_reads--;
2216
os_mutex_exit(os_file_count_mutex);
2223
#ifndef UNIV_HOTBACKUP
2225
#endif /* !UNIV_HOTBACKUP */
2227
os_mutex_enter(os_file_count_mutex);
2228
os_n_pending_reads++;
2229
os_mutex_exit(os_file_count_mutex);
2231
#ifndef UNIV_HOTBACKUP
2232
/* Protect the seek / read operation with a mutex */
2233
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2235
os_mutex_enter(os_file_seek_mutexes[i]);
2236
#endif /* !UNIV_HOTBACKUP */
2238
ret_offset = lseek(file, offs, SEEK_SET);
2240
if (ret_offset < 0) {
2243
ret = read(file, buf, (ssize_t)n);
2246
#ifndef UNIV_HOTBACKUP
2247
os_mutex_exit(os_file_seek_mutexes[i]);
2248
#endif /* !UNIV_HOTBACKUP */
2250
os_mutex_enter(os_file_count_mutex);
2251
os_n_pending_reads--;
2252
os_mutex_exit(os_file_count_mutex);
2259
/*******************************************************************//**
2260
Does a synchronous write operation in Posix.
2261
@return number of bytes written, -1 if error */
2266
os_file_t file, /*!< in: handle to a file */
2267
const void* buf, /*!< in: buffer from where to write */
2268
ulint n, /*!< in: number of bytes to write */
2269
ulint offset, /*!< in: least significant 32 bits of file
2270
offset where to write */
2271
ulint offset_high) /*!< in: most significant 32 bits of
2277
ut_a((offset & 0xFFFFFFFFUL) == offset);
2279
/* If off_t is > 4 bytes in size, then we assume we can pass a
2282
if (sizeof(off_t) > 4) {
2283
offs = (off_t)offset + (((off_t)offset_high) << 32);
2285
offs = (off_t)offset;
2287
if (offset_high > 0) {
2289
"InnoDB: Error: file write"
2290
" at offset > 4 GB\n");
2296
#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2297
os_mutex_enter(os_file_count_mutex);
2298
os_file_n_pending_pwrites++;
2299
os_n_pending_writes++;
2300
os_mutex_exit(os_file_count_mutex);
2302
ret = pwrite(file, buf, (ssize_t)n, offs);
2304
os_mutex_enter(os_file_count_mutex);
2305
os_file_n_pending_pwrites--;
2306
os_n_pending_writes--;
2307
os_mutex_exit(os_file_count_mutex);
2309
# ifdef UNIV_DO_FLUSH
2310
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2311
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2312
&& !os_do_not_call_flush_at_each_write) {
2314
/* Always do fsync to reduce the probability that when
2315
the OS crashes, a database page is only partially
2316
physically written to disk. */
2318
ut_a(TRUE == os_file_flush(file));
2320
# endif /* UNIV_DO_FLUSH */
2326
# ifndef UNIV_HOTBACKUP
2328
# endif /* !UNIV_HOTBACKUP */
2330
os_mutex_enter(os_file_count_mutex);
2331
os_n_pending_writes++;
2332
os_mutex_exit(os_file_count_mutex);
2334
# ifndef UNIV_HOTBACKUP
2335
/* Protect the seek / write operation with a mutex */
2336
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2338
os_mutex_enter(os_file_seek_mutexes[i]);
2339
# endif /* UNIV_HOTBACKUP */
2341
ret_offset = lseek(file, offs, SEEK_SET);
2343
if (ret_offset < 0) {
2349
ret = write(file, buf, (ssize_t)n);
2351
# ifdef UNIV_DO_FLUSH
2352
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2353
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2354
&& !os_do_not_call_flush_at_each_write) {
2356
/* Always do fsync to reduce the probability that when
2357
the OS crashes, a database page is only partially
2358
physically written to disk. */
2360
ut_a(TRUE == os_file_flush(file));
2362
# endif /* UNIV_DO_FLUSH */
2365
# ifndef UNIV_HOTBACKUP
2366
os_mutex_exit(os_file_seek_mutexes[i]);
2367
# endif /* !UNIV_HOTBACKUP */
2369
os_mutex_enter(os_file_count_mutex);
2370
os_n_pending_writes--;
2371
os_mutex_exit(os_file_count_mutex);
2379
/*******************************************************************//**
2380
NOTE! Use the corresponding macro os_file_read(), not directly this
2382
Requests a synchronous positioned read operation.
2383
@return TRUE if request was successful, FALSE if fail */
2388
os_file_t file, /*!< in: handle to a file */
2389
void* buf, /*!< in: buffer where to read */
2390
ulint offset, /*!< in: least significant 32 bits of file
2391
offset where to read */
2392
ulint offset_high, /*!< in: most significant 32 bits of
2394
ulint n) /*!< in: number of bytes to read */
2403
#ifndef UNIV_HOTBACKUP
2405
#endif /* !UNIV_HOTBACKUP */
2407
ut_a((offset & 0xFFFFFFFFUL) == offset);
2410
os_bytes_read_since_printout += n;
2417
low = (DWORD) offset;
2418
high = (DWORD) offset_high;
2420
os_mutex_enter(os_file_count_mutex);
2421
os_n_pending_reads++;
2422
os_mutex_exit(os_file_count_mutex);
2424
#ifndef UNIV_HOTBACKUP
2425
/* Protect the seek / read operation with a mutex */
2426
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2428
os_mutex_enter(os_file_seek_mutexes[i]);
2429
#endif /* !UNIV_HOTBACKUP */
2431
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2433
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2435
#ifndef UNIV_HOTBACKUP
2436
os_mutex_exit(os_file_seek_mutexes[i]);
2437
#endif /* !UNIV_HOTBACKUP */
2439
os_mutex_enter(os_file_count_mutex);
2440
os_n_pending_reads--;
2441
os_mutex_exit(os_file_count_mutex);
2443
goto error_handling;
2446
ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2448
#ifndef UNIV_HOTBACKUP
2449
os_mutex_exit(os_file_seek_mutexes[i]);
2450
#endif /* !UNIV_HOTBACKUP */
2452
os_mutex_enter(os_file_count_mutex);
2453
os_n_pending_reads--;
2454
os_mutex_exit(os_file_count_mutex);
2456
if (ret && len == n) {
2463
os_bytes_read_since_printout += n;
2466
ret = os_file_pread(file, buf, n, offset, offset_high);
2468
if ((ulint)ret == n) {
2474
"InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
2475
"InnoDB: Was only able to read %ld.\n",
2476
(ulong)n, (ulong)offset_high,
2477
(ulong)offset, (long)ret);
2478
#endif /* __WIN__ */
2482
retry = os_file_handle_error(NULL, "read");
2489
"InnoDB: Fatal error: cannot read from file."
2490
" OS error number %lu.\n",
2492
(ulong) GetLastError()
2504
/*******************************************************************//**
2505
NOTE! Use the corresponding macro os_file_read_no_error_handling(),
2506
not directly this function!
2507
Requests a synchronous positioned read operation. This function does not do
2508
any error handling. In case of error it returns FALSE.
2509
@return TRUE if request was successful, FALSE if fail */
2512
os_file_read_no_error_handling_func(
2513
/*================================*/
2514
os_file_t file, /*!< in: handle to a file */
2515
void* buf, /*!< in: buffer where to read */
2516
ulint offset, /*!< in: least significant 32 bits of file
2517
offset where to read */
2518
ulint offset_high, /*!< in: most significant 32 bits of
2520
ulint n) /*!< in: number of bytes to read */
2529
#ifndef UNIV_HOTBACKUP
2531
#endif /* !UNIV_HOTBACKUP */
2533
ut_a((offset & 0xFFFFFFFFUL) == offset);
2536
os_bytes_read_since_printout += n;
2543
low = (DWORD) offset;
2544
high = (DWORD) offset_high;
2546
os_mutex_enter(os_file_count_mutex);
2547
os_n_pending_reads++;
2548
os_mutex_exit(os_file_count_mutex);
2550
#ifndef UNIV_HOTBACKUP
2551
/* Protect the seek / read operation with a mutex */
2552
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2554
os_mutex_enter(os_file_seek_mutexes[i]);
2555
#endif /* !UNIV_HOTBACKUP */
2557
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2559
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2561
#ifndef UNIV_HOTBACKUP
2562
os_mutex_exit(os_file_seek_mutexes[i]);
2563
#endif /* !UNIV_HOTBACKUP */
2565
os_mutex_enter(os_file_count_mutex);
2566
os_n_pending_reads--;
2567
os_mutex_exit(os_file_count_mutex);
2569
goto error_handling;
2572
ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2574
#ifndef UNIV_HOTBACKUP
2575
os_mutex_exit(os_file_seek_mutexes[i]);
2576
#endif /* !UNIV_HOTBACKUP */
2578
os_mutex_enter(os_file_count_mutex);
2579
os_n_pending_reads--;
2580
os_mutex_exit(os_file_count_mutex);
2582
if (ret && len == n) {
2589
os_bytes_read_since_printout += n;
2592
ret = os_file_pread(file, buf, n, offset, offset_high);
2594
if ((ulint)ret == n) {
2598
#endif /* __WIN__ */
2602
retry = os_file_handle_error_no_exit(NULL, "read");
2611
/*******************************************************************//**
2612
Rewind file to its start, read at most size - 1 bytes from it to str, and
2613
NUL-terminate str. All errors are silently ignored. This function is
2614
mostly meant to be used with temporary files.
The read is done with a single fread() call whose element size is 1, so the
value it returns is the number of bytes stored in str. */
2617
os_file_read_string(
2618
/*================*/
2619
FILE* file, /*!< in: file to read from */
2620
char* str, /*!< in: buffer where to read */
2621
ulint size) /*!< in: size of buffer */
2630
/* size - 1 leaves one byte of the buffer free for the terminating NUL.
NOTE(review): size - 1 wraps around if size is 0 (ulint is presumably
unsigned) -- confirm a guard precedes this call. */
flen = fread(str, 1, size - 1, file);
2634
/*******************************************************************//**
2635
NOTE! Use the corresponding macro os_file_write(), not directly
2637
Requests a synchronous write operation.
2638
@return TRUE if request was successful, FALSE if fail */
2643
const char* name, /*!< in: name of the file or path as a
2644
null-terminated string */
2645
os_file_t file, /*!< in: handle to a file */
2646
const void* buf, /*!< in: buffer from which to write */
2647
ulint offset, /*!< in: least significant 32 bits of file
2648
offset where to write */
2649
ulint offset_high, /*!< in: most significant 32 bits of
2651
ulint n) /*!< in: number of bytes to write */
2659
ulint n_retries = 0;
2661
#ifndef UNIV_HOTBACKUP
2663
#endif /* !UNIV_HOTBACKUP */
2665
ut_a((offset & 0xFFFFFFFF) == offset);
2673
low = (DWORD) offset;
2674
high = (DWORD) offset_high;
2676
os_mutex_enter(os_file_count_mutex);
2677
os_n_pending_writes++;
2678
os_mutex_exit(os_file_count_mutex);
2680
#ifndef UNIV_HOTBACKUP
2681
/* Protect the seek / write operation with a mutex */
2682
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2684
os_mutex_enter(os_file_seek_mutexes[i]);
2685
#endif /* !UNIV_HOTBACKUP */
2687
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2689
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2691
#ifndef UNIV_HOTBACKUP
2692
os_mutex_exit(os_file_seek_mutexes[i]);
2693
#endif /* !UNIV_HOTBACKUP */
2695
os_mutex_enter(os_file_count_mutex);
2696
os_n_pending_writes--;
2697
os_mutex_exit(os_file_count_mutex);
2699
ut_print_timestamp(stderr);
2702
" InnoDB: Error: File pointer positioning to"
2703
" file %s failed at\n"
2704
"InnoDB: offset %lu %lu. Operating system"
2705
" error number %lu.\n"
2706
"InnoDB: Some operating system error numbers"
2707
" are described at\n"
2709
REFMAN "operating-system-error-codes.html\n",
2710
name, (ulong) offset_high, (ulong) offset,
2711
(ulong) GetLastError());
2716
ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2718
/* Always do fsync to reduce the probability that when the OS crashes,
2719
a database page is only partially physically written to disk. */
2721
# ifdef UNIV_DO_FLUSH
2722
if (!os_do_not_call_flush_at_each_write) {
2723
ut_a(TRUE == os_file_flush(file));
2725
# endif /* UNIV_DO_FLUSH */
2727
#ifndef UNIV_HOTBACKUP
2728
os_mutex_exit(os_file_seek_mutexes[i]);
2729
#endif /* !UNIV_HOTBACKUP */
2731
os_mutex_enter(os_file_count_mutex);
2732
os_n_pending_writes--;
2733
os_mutex_exit(os_file_count_mutex);
2735
if (ret && len == n) {
2740
/* If some background file system backup tool is running, then, at
2741
least in Windows 2000, we may get here a specific error. Let us
2742
retry the operation 100 times, with 1 second waits. */
2744
if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2746
os_thread_sleep(1000000);
2753
if (!os_has_said_disk_full) {
2755
err = (ulint)GetLastError();
2757
ut_print_timestamp(stderr);
2760
" InnoDB: Error: Write to file %s failed"
2761
" at offset %lu %lu.\n"
2762
"InnoDB: %lu bytes should have been written,"
2763
" only %lu were written.\n"
2764
"InnoDB: Operating system error number %lu.\n"
2765
"InnoDB: Check that your OS and file system"
2766
" support files of this size.\n"
2767
"InnoDB: Check also that the disk is not full"
2768
" or a disk quota exceeded.\n",
2769
name, (ulong) offset_high, (ulong) offset,
2770
(ulong) n, (ulong) len, (ulong) err);
2772
if (strerror((int)err) != NULL) {
2774
"InnoDB: Error number %lu means '%s'.\n",
2775
(ulong) err, strerror((int)err));
2779
"InnoDB: Some operating system error numbers"
2780
" are described at\n"
2782
REFMAN "operating-system-error-codes.html\n");
2784
os_has_said_disk_full = TRUE;
2791
ret = os_file_pwrite(file, buf, n, offset, offset_high);
2793
if ((ulint)ret == n) {
2798
if (!os_has_said_disk_full) {
2800
ut_print_timestamp(stderr);
2803
" InnoDB: Error: Write to file %s failed"
2804
" at offset %lu %lu.\n"
2805
"InnoDB: %lu bytes should have been written,"
2806
" only %ld were written.\n"
2807
"InnoDB: Operating system error number %lu.\n"
2808
"InnoDB: Check that your OS and file system"
2809
" support files of this size.\n"
2810
"InnoDB: Check also that the disk is not full"
2811
" or a disk quota exceeded.\n",
2812
name, offset_high, offset, n, (long int)ret,
2814
if (strerror(errno) != NULL) {
2816
"InnoDB: Error number %lu means '%s'.\n",
2817
(ulint)errno, strerror(errno));
2821
"InnoDB: Some operating system error numbers"
2822
" are described at\n"
2824
REFMAN "operating-system-error-codes.html\n");
2826
os_has_said_disk_full = TRUE;
2833
/*******************************************************************//**
2834
Check the existence and type of the given file.
The Windows path uses _stat() and tests st_mode against the _S_IFDIR and
_S_IFREG bit masks; the other path uses stat() and the S_ISDIR, S_ISLNK and
S_ISREG macros. In both paths a failure with errno ENOENT or ENOTDIR is
treated as the file not existing, while any other failure is reported
through os_file_handle_error_no_exit() with the operation name stat.
2835
@return TRUE if call succeeded */
2840
const char* path, /*!< in: pathname of the file */
2841
ibool* exists, /*!< out: TRUE if file exists */
2842
os_file_type_t* type) /*!< out: type of the file (if it exists) */
2846
struct _stat statinfo;
2848
ret = _stat(path, &statinfo);
2849
/* a missing path component is not an error for this query */
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2850
/* file does not exist */
2854
/* file exists, but stat call failed */
2856
os_file_handle_error_no_exit(path, "stat");
2861
/* classify by mode bits: directory first, then regular file, else unknown */
if (_S_IFDIR & statinfo.st_mode) {
2862
*type = OS_FILE_TYPE_DIR;
2863
} else if (_S_IFREG & statinfo.st_mode) {
2864
*type = OS_FILE_TYPE_FILE;
2866
*type = OS_FILE_TYPE_UNKNOWN;
2874
struct stat statinfo;
2876
ret = stat(path, &statinfo);
2877
/* a missing path component is not an error for this query */
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2878
/* file does not exist */
2882
/* file exists, but stat call failed */
2884
os_file_handle_error_no_exit(path, "stat");
2889
/* classify by file type: directory, link, regular file, else unknown */
if (S_ISDIR(statinfo.st_mode)) {
2890
*type = OS_FILE_TYPE_DIR;
2891
/* NOTE(review): stat() follows symbolic links, so the mode it returns
should never describe a link; this branch looks reachable only with
lstat() -- confirm. */
} else if (S_ISLNK(statinfo.st_mode)) {
2892
*type = OS_FILE_TYPE_LINK;
2893
} else if (S_ISREG(statinfo.st_mode)) {
2894
*type = OS_FILE_TYPE_FILE;
2896
*type = OS_FILE_TYPE_UNKNOWN;
2905
/*******************************************************************//**
2906
This function returns information about the specified file
2907
@return TRUE if stat information found */
2912
const char* path, /*!< in: pathname of the file */
2913
os_file_stat_t* stat_info) /*!< information of a file in a
2918
struct _stat statinfo;
2920
ret = _stat(path, &statinfo);
2921
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2922
/* file does not exist */
2926
/* file exists, but stat call failed */
2928
os_file_handle_error_no_exit(path, "stat");
2932
if (_S_IFDIR & statinfo.st_mode) {
2933
stat_info->type = OS_FILE_TYPE_DIR;
2934
} else if (_S_IFREG & statinfo.st_mode) {
2935
stat_info->type = OS_FILE_TYPE_FILE;
2937
stat_info->type = OS_FILE_TYPE_UNKNOWN;
2940
stat_info->ctime = statinfo.st_ctime;
2941
stat_info->atime = statinfo.st_atime;
2942
stat_info->mtime = statinfo.st_mtime;
2943
stat_info->size = statinfo.st_size;
2948
struct stat statinfo;
2950
ret = stat(path, &statinfo);
2952
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2953
/* file does not exist */
2957
/* file exists, but stat call failed */
2959
os_file_handle_error_no_exit(path, "stat");
2964
if (S_ISDIR(statinfo.st_mode)) {
2965
stat_info->type = OS_FILE_TYPE_DIR;
2966
} else if (S_ISLNK(statinfo.st_mode)) {
2967
stat_info->type = OS_FILE_TYPE_LINK;
2968
} else if (S_ISREG(statinfo.st_mode)) {
2969
stat_info->type = OS_FILE_TYPE_FILE;
2971
stat_info->type = OS_FILE_TYPE_UNKNOWN;
2974
stat_info->ctime = statinfo.st_ctime;
2975
stat_info->atime = statinfo.st_atime;
2976
stat_info->mtime = statinfo.st_mtime;
2977
stat_info->size = statinfo.st_size;
2983
/* path name separator character */
2985
# define OS_FILE_PATH_SEPARATOR '\\'
2987
# define OS_FILE_PATH_SEPARATOR '/'
2990
/****************************************************************//**
2991
The function os_file_dirname returns a directory component of a
2992
null-terminated pathname string. In the usual case, dirname returns
2993
the string up to, but not including, the final '/', and basename
2994
is the component following the final '/'. Trailing '/' characters
2995
are not counted as part of the pathname.
2997
If path does not contain a slash, dirname returns the string ".".
2999
Concatenating the string returned by dirname, a "/", and the basename
3000
yields a complete pathname.
3002
The return value is a copy of the directory component of the pathname.
3003
The copy is allocated from heap. It is the caller's responsibility
3004
to free it after it is no longer needed.
3006
The following list of examples (taken from SUSv2) shows the strings
3007
returned by dirname and basename for different paths:
3009
path dirname basename
3010
"/usr/lib" "/usr" "lib"
3017
@return own: directory component of the pathname */
3022
const char* path) /*!< in: pathname */
3024
/* Find the offset of the last slash */
3025
const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
3027
/* No slash in the path, return "." */
3029
return(mem_strdup("."));
3032
/* Ok, there is a slash */
3034
if (last_slash == path) {
3035
/* last slash is the first char of the path */
3037
return(mem_strdup("/"));
3040
/* Non-trivial directory component */
3042
return(mem_strdupl(path, last_slash - path));
3045
/****************************************************************//**
3046
Creates all missing subdirectories along the given path.
3047
@return TRUE if call succeeded, FALSE otherwise */
3050
os_file_create_subdirs_if_needed(
3051
/*=============================*/
3052
const char* path) /*!< in: path name */
3055
ibool success, subdir_exists;
3056
os_file_type_t type;
3058
subdir = os_file_dirname(path);
3059
if (strlen(subdir) == 1
3060
&& (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
3061
/* subdir is root or cwd, nothing to do */
3067
/* Test if subdir exists */
3068
success = os_file_status(subdir, &subdir_exists, &type);
3069
if (success && !subdir_exists) {
3070
/* subdir does not exist, create it */
3071
success = os_file_create_subdirs_if_needed(subdir);
3077
success = os_file_create_directory(subdir, FALSE);
3085
#ifndef UNIV_HOTBACKUP
3086
/****************************************************************//**
3087
Returns a pointer to the nth slot in the aio array.
3088
@return pointer to slot */
3091
os_aio_array_get_nth_slot(
3092
/*======================*/
3093
os_aio_array_t* array, /*!< in: aio array */
3094
ulint index) /*!< in: index of the slot */
3096
ut_a(index < array->n_slots);
3098
return((array->slots) + index);
3101
#if defined(LINUX_NATIVE_AIO)
3102
/******************************************************************//**
3103
Creates an io_context for native linux AIO.
3104
@return TRUE on success. */
3107
os_aio_linux_create_io_ctx(
3108
/*=======================*/
3109
ulint max_events, /*!< in: number of events. */
3110
io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
3116
memset(io_ctx, 0x0, sizeof(*io_ctx));
3118
/* Initialize the io_ctx. Tell it how many pending
3119
IO requests this context will handle. */
3121
ret = io_setup(max_events, io_ctx);
3123
#if defined(UNIV_AIO_DEBUG)
3125
"InnoDB: Linux native AIO:"
3126
" initialized io_ctx for segment\n");
3128
/* Success. Return now. */
3132
/* If we hit EAGAIN we'll make a few attempts before failing. */
3137
/* First time around. */
3138
ut_print_timestamp(stderr);
3140
" InnoDB: Warning: io_setup() failed"
3141
" with EAGAIN. Will make %d attempts"
3142
" before giving up.\n",
3143
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3146
if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3149
"InnoDB: Warning: io_setup() attempt"
3152
os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3156
/* Have tried enough. Better call it a day. */
3157
ut_print_timestamp(stderr);
3159
" InnoDB: Error: io_setup() failed"
3160
" with EAGAIN after %d attempts.\n",
3161
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3165
ut_print_timestamp(stderr);
3167
" InnoDB: Error: Linux Native AIO interface"
3168
" is not supported on this platform. Please"
3169
" check your OS documentation and install"
3170
" appropriate binary of InnoDB.\n");
3175
ut_print_timestamp(stderr);
3177
" InnoDB: Error: Linux Native AIO setup"
3178
" returned following error[%d]\n", -ret);
3183
"InnoDB: You can disable Linux Native AIO by"
3184
" setting innodb_native_aio = off in my.cnf\n");
3187
#endif /* LINUX_NATIVE_AIO */
3189
/******************************************************************//**
3190
Creates an aio wait array. Note that we return NULL in case of failure.
3191
We don't care about freeing memory here because we assume that a
3192
failure will result in server refusing to start up.
3193
@return own: aio array, NULL on failure */
3196
os_aio_array_create(
3197
/*================*/
3198
ulint n, /*!< in: maximum number of pending aio
3199
operations allowed; n must be
3200
divisible by n_segments */
3201
ulint n_segments) /*!< in: number of segments in the aio array */
3203
os_aio_array_t* array;
3205
os_aio_slot_t* slot;
3208
#elif defined(LINUX_NATIVE_AIO)
3209
struct io_event* io_event = NULL;
3212
ut_a(n_segments > 0);
3214
array = ut_malloc(sizeof(os_aio_array_t));
3216
array->mutex = os_mutex_create();
3217
array->not_full = os_event_create(NULL);
3218
array->is_empty = os_event_create(NULL);
3220
os_event_set(array->is_empty);
3223
array->n_segments = n_segments;
3224
array->n_reserved = 0;
3226
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
3228
array->handles = ut_malloc(n * sizeof(HANDLE));
3231
#if defined(LINUX_NATIVE_AIO)
3232
array->aio_ctx = NULL;
3233
array->aio_events = NULL;
3235
/* If we are not using native aio interface then skip this
3236
part of initialization. */
3237
if (!srv_use_native_aio) {
3238
goto skip_native_aio;
3241
/* Initialize the io_context array. One io_context
3242
per segment in the array. */
3244
array->aio_ctx = ut_malloc(n_segments *
3245
sizeof(*array->aio_ctx));
3246
for (i = 0; i < n_segments; ++i) {
3247
if (!os_aio_linux_create_io_ctx(n/n_segments,
3248
&array->aio_ctx[i])) {
3249
/* If something bad happened during aio setup
3250
we should call it a day and return right away.
3251
We don't care about any leaks because a failure
3252
to initialize the io subsystem means that the
3253
server (or at least the innodb storage engine)
3254
is not going to start up. */
3259
/* Initialize the event array. One event per slot. */
3260
io_event = ut_malloc(n * sizeof(*io_event));
3261
memset(io_event, 0x0, sizeof(*io_event) * n);
3262
array->aio_events = io_event;
3265
#endif /* LINUX_NATIVE_AIO */
3266
for (i = 0; i < n; i++) {
3267
slot = os_aio_array_get_nth_slot(array, i);
3270
slot->reserved = FALSE;
3272
slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
3274
over = &(slot->control);
3276
over->hEvent = slot->handle;
3278
*((array->handles) + i) = over->hEvent;
3280
#elif defined(LINUX_NATIVE_AIO)
3282
memset(&slot->control, 0x0, sizeof(slot->control));
3291
/************************************************************************//**
3292
Frees an aio wait array. */
3297
os_aio_array_t* array) /*!< in, own: array to free */
3302
for (i = 0; i < array->n_slots; i++) {
3303
os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3304
CloseHandle(slot->handle);
3306
#endif /* WIN_ASYNC_IO */
3309
ut_free(array->handles);
3310
#endif /* __WIN__ */
3311
os_mutex_free(array->mutex);
3312
os_event_free(array->not_full);
3313
os_event_free(array->is_empty);
3315
#if defined(LINUX_NATIVE_AIO)
3316
if (srv_use_native_aio) {
3317
ut_free(array->aio_events);
3318
ut_free(array->aio_ctx);
3320
#endif /* LINUX_NATIVE_AIO */
3322
ut_free(array->slots);
3326
/***********************************************************************
3327
Initializes the asynchronous io system. Creates one array each for ibuf
3328
and log i/o. Also creates one array each for read and write where each
3329
array is divided logically into n_read_segs and n_write_segs
3330
respectively. The caller must create an i/o handler thread for each
3331
segment in these arrays. This function also creates the sync array.
3332
No i/o handler thread needs to be created for that */
3337
ulint n_per_seg, /*!< in: maximum number of pending aio
3338
operations allowed per segment */
3339
ulint n_read_segs, /*!< in: number of reader threads */
3340
ulint n_write_segs, /*!< in: number of writer threads */
3341
ulint n_slots_sync) /*!< in: number of slots in the sync aio
3345
ulint n_segments = 2 + n_read_segs + n_write_segs;
3347
ut_ad(n_segments >= 4);
3349
os_io_init_simple();
3351
for (i = 0; i < n_segments; i++) {
3352
srv_set_io_thread_op_info(i, "not started yet");
3356
/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
3358
os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3359
if (os_aio_ibuf_array == NULL) {
3363
srv_io_thread_function[0] = "insert buffer thread";
3365
os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3366
if (os_aio_log_array == NULL) {
3370
srv_io_thread_function[1] = "log thread";
3372
os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
3374
if (os_aio_read_array == NULL) {
3378
for (i = 2; i < 2 + n_read_segs; i++) {
3379
ut_a(i < SRV_MAX_N_IO_THREADS);
3380
srv_io_thread_function[i] = "read thread";
3383
os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
3385
if (os_aio_write_array == NULL) {
3389
for (i = 2 + n_read_segs; i < n_segments; i++) {
3390
ut_a(i < SRV_MAX_N_IO_THREADS);
3391
srv_io_thread_function[i] = "write thread";
3394
os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3395
if (os_aio_sync_array == NULL) {
3400
os_aio_n_segments = n_segments;
3404
os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
3406
for (i = 0; i < n_segments; i++) {
3407
os_aio_segment_wait_events[i] = os_event_create(NULL);
3410
os_last_printout = time(NULL);
3419
/***********************************************************************
3420
Frees the asynchronous io system. */
3428
os_aio_array_free(os_aio_ibuf_array);
3429
os_aio_ibuf_array = NULL;
3430
os_aio_array_free(os_aio_log_array);
3431
os_aio_log_array = NULL;
3432
os_aio_array_free(os_aio_read_array);
3433
os_aio_read_array = NULL;
3434
os_aio_array_free(os_aio_write_array);
3435
os_aio_write_array = NULL;
3436
os_aio_array_free(os_aio_sync_array);
3437
os_aio_sync_array = NULL;
3439
for (i = 0; i < os_aio_n_segments; i++) {
3440
os_event_free(os_aio_segment_wait_events[i]);
3443
ut_free(os_aio_segment_wait_events);
3444
os_aio_segment_wait_events = 0;
3445
os_aio_n_segments = 0;
3449
/************************************************************************//**
3450
Wakes up all async i/o threads in the array in Windows async i/o at
3454
os_aio_array_wake_win_aio_at_shutdown(
3455
/*==================================*/
3456
os_aio_array_t* array) /*!< in: aio array */
3460
for (i = 0; i < array->n_slots; i++) {
3462
SetEvent((array->slots + i)->handle);
3467
/************************************************************************//**
3468
Wakes up all async i/o threads so that they know to exit themselves in
3472
os_aio_wake_all_threads_at_shutdown(void)
3473
/*=====================================*/
3478
/* This code wakes up all ai/o threads in Windows native aio */
3479
os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
3480
os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
3481
os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
3482
os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
3484
#elif defined(LINUX_NATIVE_AIO)
3486
/* When using native AIO interface the io helper threads
3487
wait on io_getevents with a timeout value of 500ms. At
3488
each wake up these threads check the server status.
3489
No need to do anything to wake them up. */
3491
if (srv_use_native_aio) {
3494
/* Fall through to simulated AIO handler wakeup if we are
3495
not using native AIO. */
3497
/* This loop wakes up all simulated ai/o threads */
3499
for (i = 0; i < os_aio_n_segments; i++) {
3501
os_event_set(os_aio_segment_wait_events[i]);
3505
/************************************************************************//**
3506
Waits until there are no pending writes in os_aio_write_array. There can
3507
be other, synchronous, pending writes. */
3510
os_aio_wait_until_no_pending_writes(void)
3511
/*=====================================*/
3513
os_event_wait(os_aio_write_array->is_empty);
3516
/**********************************************************************//**
3517
Calculates segment number for a slot.
3518
@return segment number (which is the number used by, for example,
3519
i/o-handler threads) */
3522
os_aio_get_segment_no_from_slot(
3523
/*============================*/
3524
os_aio_array_t* array, /*!< in: aio wait array */
3525
os_aio_slot_t* slot) /*!< in: slot in this array */
3530
if (array == os_aio_ibuf_array) {
3533
} else if (array == os_aio_log_array) {
3536
} else if (array == os_aio_read_array) {
3537
seg_len = os_aio_read_array->n_slots
3538
/ os_aio_read_array->n_segments;
3540
segment = 2 + slot->pos / seg_len;
3542
ut_a(array == os_aio_write_array);
3543
seg_len = os_aio_write_array->n_slots
3544
/ os_aio_write_array->n_segments;
3546
segment = os_aio_read_array->n_segments + 2
3547
+ slot->pos / seg_len;
3553
/**********************************************************************//**
3554
Calculates local segment number and aio array from global segment number.
3555
@return local segment number within the aio array */
3558
os_aio_get_array_and_local_segment(
3559
/*===============================*/
3560
os_aio_array_t** array, /*!< out: aio wait array */
3561
ulint global_segment)/*!< in: global segment number */
3565
ut_a(global_segment < os_aio_n_segments);
3567
if (global_segment == 0) {
3568
*array = os_aio_ibuf_array;
3571
} else if (global_segment == 1) {
3572
*array = os_aio_log_array;
3575
} else if (global_segment < os_aio_read_array->n_segments + 2) {
3576
*array = os_aio_read_array;
3578
segment = global_segment - 2;
3580
*array = os_aio_write_array;
3582
segment = global_segment - (os_aio_read_array->n_segments + 2);
3588
/*******************************************************************//**
3589
Requests a slot in the aio array. If no slot is available, waits until
3590
not_full-event becomes signaled.
3591
@return pointer to slot */
3594
os_aio_array_reserve_slot(
3595
/*======================*/
3596
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3597
os_aio_array_t* array, /*!< in: aio array */
3598
fil_node_t* message1,/*!< in: message to be passed along with
3599
the aio operation */
3600
void* message2,/*!< in: message to be passed along with
3601
the aio operation */
3602
os_file_t file, /*!< in: file handle */
3603
const char* name, /*!< in: name of the file or path as a
3604
null-terminated string */
3605
void* buf, /*!< in: buffer where to read or from which
3607
ulint offset, /*!< in: least significant 32 bits of file
3609
ulint offset_high, /*!< in: most significant 32 bits of
3611
ulint len) /*!< in: length of the block to read or write */
3613
os_aio_slot_t* slot = NULL;
3615
OVERLAPPED* control;
3617
#elif defined(LINUX_NATIVE_AIO)
3625
ulint slots_per_seg;
3628
/* No need of a mutex. Only reading constant fields */
3629
slots_per_seg = array->n_slots / array->n_segments;
3631
/* We attempt to keep adjacent blocks in the same local
3632
segment. This can help in merging IO requests when we are
3633
doing simulated AIO */
3634
local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
3635
% array->n_segments;
3638
os_mutex_enter(array->mutex);
3640
if (array->n_reserved == array->n_slots) {
3641
os_mutex_exit(array->mutex);
3643
if (!srv_use_native_aio) {
3644
/* If the handler threads are suspended, wake them
3645
so that we get more slots */
3647
os_aio_simulated_wake_handler_threads();
3650
os_event_wait(array->not_full);
3655
/* We start our search for an available slot from our preferred
3656
local segment and do a full scan of the array. We are
3657
guaranteed to find a slot in full scan. */
3658
for (i = local_seg * slots_per_seg, counter = 0;
3659
counter < array->n_slots; i++, counter++) {
3661
i %= array->n_slots;
3662
slot = os_aio_array_get_nth_slot(array, i);
3664
if (slot->reserved == FALSE) {
3669
/* We MUST always be able to get hold of a free (unreserved) slot. */
3673
ut_a(slot->reserved == FALSE);
3674
array->n_reserved++;
3676
if (array->n_reserved == 1) {
3677
os_event_reset(array->is_empty);
3680
if (array->n_reserved == array->n_slots) {
3681
os_event_reset(array->not_full);
3684
slot->reserved = TRUE;
3685
slot->reservation_time = time(NULL);
3686
slot->message1 = message1;
3687
slot->message2 = message2;
3693
slot->offset = offset;
3694
slot->offset_high = offset_high;
3695
slot->io_already_done = FALSE;
3698
control = &(slot->control);
3699
control->Offset = (DWORD)offset;
3700
control->OffsetHigh = (DWORD)offset_high;
3701
ResetEvent(slot->handle);
3703
#elif defined(LINUX_NATIVE_AIO)
3705
/* If we are not using native AIO skip this part. */
3706
if (!srv_use_native_aio) {
3707
goto skip_native_aio;
3710
/* Check if we are dealing with 64 bit arch.
3711
If not then make sure that offset fits in 32 bits. */
3712
if (sizeof(aio_offset) == 8) {
3713
aio_offset = offset_high;
3715
aio_offset += offset;
3717
ut_a(offset_high == 0);
3718
aio_offset = offset;
3721
iocb = &slot->control;
3723
if (type == OS_FILE_READ) {
3724
io_prep_pread(iocb, file, buf, len, aio_offset);
3726
ut_a(type == OS_FILE_WRITE);
3727
io_prep_pwrite(iocb, file, buf, len, aio_offset);
3730
iocb->data = (void*)slot;
3733
/*fprintf(stderr, "Filled up Linux native iocb.\n");*/
3737
#endif /* LINUX_NATIVE_AIO */
3738
os_mutex_exit(array->mutex);
3743
/*******************************************************************//**
3744
Frees a slot in the aio array. */
3747
os_aio_array_free_slot(
3748
/*===================*/
3749
os_aio_array_t* array, /*!< in: aio array */
3750
os_aio_slot_t* slot) /*!< in: pointer to slot */
3755
os_mutex_enter(array->mutex);
3757
ut_ad(slot->reserved);
3759
slot->reserved = FALSE;
3761
array->n_reserved--;
3763
if (array->n_reserved == array->n_slots - 1) {
3764
os_event_set(array->not_full);
3767
if (array->n_reserved == 0) {
3768
os_event_set(array->is_empty);
3773
ResetEvent(slot->handle);
3775
#elif defined(LINUX_NATIVE_AIO)
3777
if (srv_use_native_aio) {
3778
memset(&slot->control, 0x0, sizeof(slot->control));
3781
/*fprintf(stderr, "Freed up Linux native slot.\n");*/
3783
/* These fields should not be used if we are not
3784
using native AIO. */
3785
ut_ad(slot->n_bytes == 0);
3786
ut_ad(slot->ret == 0);
3790
os_mutex_exit(array->mutex);
3793
/**********************************************************************//**
3794
Wakes up a simulated aio i/o-handler thread if it has something to do. */
3797
os_aio_simulated_wake_handler_thread(
3798
/*=================================*/
3799
ulint global_segment) /*!< in: the number of the segment in the aio
3802
os_aio_array_t* array;
3803
os_aio_slot_t* slot;
3808
ut_ad(!srv_use_native_aio);
3810
segment = os_aio_get_array_and_local_segment(&array, global_segment);
3812
n = array->n_slots / array->n_segments;
3814
/* Look through n slots after the segment * n'th slot */
3816
os_mutex_enter(array->mutex);
3818
for (i = 0; i < n; i++) {
3819
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3821
if (slot->reserved) {
3822
/* Found an i/o request */
3828
os_mutex_exit(array->mutex);
3831
os_event_set(os_aio_segment_wait_events[global_segment]);
3835
/**********************************************************************//**
3836
Wakes up simulated aio i/o-handler threads if they have something to do. */
3839
os_aio_simulated_wake_handler_threads(void)
3840
/*=======================================*/
3844
if (srv_use_native_aio) {
3845
/* We do not use simulated aio: do nothing */
3850
os_aio_recommend_sleep_for_read_threads = FALSE;
3852
for (i = 0; i < os_aio_n_segments; i++) {
3853
os_aio_simulated_wake_handler_thread(i);
3857
/**********************************************************************//**
3858
This function can be called if one wants to post a batch of reads and
3859
prefers an i/o-handler thread to handle them all at once later. You must
3860
call os_aio_simulated_wake_handler_threads later to ensure the threads
3861
are not left sleeping! */
3864
os_aio_simulated_put_read_threads_to_sleep(void)
3865
/*============================================*/
3868
/* The idea of putting background IO threads to sleep is only for
3869
Windows when using simulated AIO. Windows XP seems to schedule
3870
background threads too eagerly to allow for coalescing during
3871
readahead requests. */
3873
os_aio_array_t* array;
3876
if (srv_use_native_aio) {
3877
/* We do not use simulated aio: do nothing */
3882
os_aio_recommend_sleep_for_read_threads = TRUE;
3884
for (g = 0; g < os_aio_n_segments; g++) {
3885
os_aio_get_array_and_local_segment(&array, g);
3887
if (array == os_aio_read_array) {
3889
os_event_reset(os_aio_segment_wait_events[g]);
3892
#endif /* __WIN__ */
3895
#if defined(LINUX_NATIVE_AIO)
3896
/*******************************************************************//**
3897
Dispatch an AIO request to the kernel.
3898
@return TRUE on success. */
3901
os_aio_linux_dispatch(
3902
/*==================*/
3903
os_aio_array_t* array, /*!< in: io request array. */
3904
os_aio_slot_t* slot) /*!< in: an already reserved slot. */
3910
ut_ad(slot != NULL);
3913
ut_a(slot->reserved);
3915
/* Find out what we are going to work with.
3916
The iocb struct is directly in the slot.
3917
The io_context is one per segment. */
3919
iocb = &slot->control;
3920
io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
3922
ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
3924
#if defined(UNIV_AIO_DEBUG)
3926
"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
3927
(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
3928
array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
3931
/* io_submit returns number of successfully
3932
queued requests or -errno. */
3933
if (UNIV_UNLIKELY(ret != 1)) {
3940
#endif /* LINUX_NATIVE_AIO */
3943
/*******************************************************************//**
3944
NOTE! Use the corresponding macro os_aio(), not directly this function!
3945
Requests an asynchronous i/o operation.
3946
@return TRUE if request was queued successfully, FALSE if fail */
3951
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3952
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
3953
to OS_AIO_SIMULATED_WAKE_LATER: the
3954
last flag advises this function not to wake
3955
i/o-handler threads, but the caller will
3956
do the waking explicitly later, in this
3957
way the caller can post several requests in
3958
a batch; NOTE that the batch must not be
3959
so big that it exhausts the slots in aio
3960
arrays! NOTE that a simulated batch
3961
may introduce hidden chances of deadlocks,
3962
because i/os are not actually handled until
3963
all have been posted: use with great
3965
const char* name, /*!< in: name of the file or path as a
3966
null-terminated string */
3967
os_file_t file, /*!< in: handle to a file */
3968
void* buf, /*!< in: buffer where to read or from which
3970
ulint offset, /*!< in: least significant 32 bits of file
3971
offset where to read or write */
3972
ulint offset_high, /*!< in: most significant 32 bits of
3974
ulint n, /*!< in: number of bytes to read or write */
3975
fil_node_t* message1,/*!< in: message for the aio handler
3976
(can be used to identify a completed
3977
aio operation); ignored if mode is
3979
void* message2)/*!< in: message for the aio handler
3980
(can be used to identify a completed
3981
aio operation); ignored if mode is
3984
os_aio_array_t* array;
3985
os_aio_slot_t* slot;
3989
DWORD len = (DWORD) n;
3990
struct fil_node_struct * dummy_mess1;
3993
#endif /* WIN_ASYNC_IO */
3994
#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4002
ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
4003
ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
4004
ut_ad(os_aio_validate());
4006
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
4007
mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
4009
if (mode == OS_AIO_SYNC
4011
&& !srv_use_native_aio
4012
#endif /* WIN_ASYNC_IO */
4014
/* This is actually an ordinary synchronous read or write:
4015
no need to use an i/o-handler thread. NOTE that if we use
4016
Windows async i/o, Windows does not allow us to use
4017
ordinary synchronous os_file_read etc. on the same file,
4018
therefore we have built a special mechanism for synchronous
4019
wait in the Windows case. */
4021
if (type == OS_FILE_READ) {
4022
return(os_file_read(file, buf, offset,
4026
ut_a(type == OS_FILE_WRITE);
4028
return(os_file_write(name, file, buf, offset, offset_high, n));
4031
#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4034
if (mode == OS_AIO_NORMAL) {
4035
if (type == OS_FILE_READ) {
4036
array = os_aio_read_array;
4038
array = os_aio_write_array;
4040
} else if (mode == OS_AIO_IBUF) {
4041
ut_ad(type == OS_FILE_READ);
4042
/* Reduce probability of deadlock bugs in connection with ibuf:
4043
do not let the ibuf i/o handler sleep */
4047
array = os_aio_ibuf_array;
4048
} else if (mode == OS_AIO_LOG) {
4050
array = os_aio_log_array;
4051
} else if (mode == OS_AIO_SYNC) {
4052
array = os_aio_sync_array;
4054
#if defined(LINUX_NATIVE_AIO)
4055
/* In Linux native AIO we don't use sync IO array. */
4056
ut_a(!srv_use_native_aio);
4057
#endif /* LINUX_NATIVE_AIO */
4059
array = NULL; /* Eliminate compiler warning */
4063
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
4064
name, buf, offset, offset_high, n);
4065
if (type == OS_FILE_READ) {
4066
if (srv_use_native_aio) {
4068
os_bytes_read_since_printout += n;
4070
ret = ReadFile(file, buf, (DWORD)n, &len,
4073
#elif defined(LINUX_NATIVE_AIO)
4074
if (!os_aio_linux_dispatch(array, slot)) {
4080
os_aio_simulated_wake_handler_thread(
4081
os_aio_get_segment_no_from_slot(
4085
} else if (type == OS_FILE_WRITE) {
4086
if (srv_use_native_aio) {
4089
ret = WriteFile(file, buf, (DWORD)n, &len,
4092
#elif defined(LINUX_NATIVE_AIO)
4093
if (!os_aio_linux_dispatch(array, slot)) {
4099
os_aio_simulated_wake_handler_thread(
4100
os_aio_get_segment_no_from_slot(
4109
if (srv_use_native_aio) {
4110
if ((ret && len == n)
4111
|| (!ret && GetLastError() == ERROR_IO_PENDING)) {
4112
/* aio was queued successfully! */
4114
if (mode == OS_AIO_SYNC) {
4115
/* We want a synchronous i/o operation on a
4116
file where we also use async i/o: in Windows
4117
we must use the same wait mechanism as for
4120
retval = os_aio_windows_handle(ULINT_UNDEFINED,
4134
#endif /* WIN_ASYNC_IO */
4135
/* aio was queued successfully! */
4138
#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4140
os_aio_array_free_slot(array, slot);
4142
retry = os_file_handle_error(name,
4143
type == OS_FILE_READ
4144
? "aio read" : "aio write");
4151
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
4155
/**********************************************************************//**
4156
This function is only used in Windows asynchronous i/o.
4157
Waits for an aio operation to complete. This function is used to wait
4158
for the completed requests. The aio array of pending requests is divided
4159
into segments. The thread specifies which segment or slot it wants to wait
4160
for. NOTE: this function will also take care of freeing the aio slot,
4161
therefore no other thread is allowed to do the freeing!
4162
@return TRUE if the aio operation succeeded */
4165
os_aio_windows_handle(
4166
/*==================*/
4167
ulint segment, /*!< in: the number of the segment in the aio
4168
arrays to wait for; segment 0 is the ibuf
4169
i/o thread, segment 1 the log i/o thread,
4170
then follow the non-ibuf read threads, and as
4171
the last are the non-ibuf write threads; if
4172
this is ULINT_UNDEFINED, then it means that
4173
sync aio is used, and this parameter is
4175
ulint pos, /*!< this parameter is used only in sync aio:
4176
wait for the aio slot at this position */
4177
fil_node_t**message1, /*!< out: the messages passed with the aio
4178
request; note that also in the case where
4179
the aio operation failed, these output
4180
parameters are valid and can be used to
4181
restart the operation, for example */
4183
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4185
ulint orig_seg = segment;
4186
os_aio_array_t* array;
4187
os_aio_slot_t* slot;
4195
if (segment == ULINT_UNDEFINED) {
4196
array = os_aio_sync_array;
4199
segment = os_aio_get_array_and_local_segment(&array, segment);
4202
/* NOTE! We only access constant fields in os_aio_array. Therefore
4203
we do not have to acquire the protecting mutex yet */
4205
ut_ad(os_aio_validate());
4206
ut_ad(segment < array->n_segments);
4208
n = array->n_slots / array->n_segments;
4210
if (array == os_aio_sync_array) {
4211
WaitForSingleObject(
4212
os_aio_array_get_nth_slot(array, pos)->handle,
4216
srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
4217
i = WaitForMultipleObjects((DWORD) n,
4218
array->handles + segment * n,
4223
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
4224
os_thread_exit(NULL);
4227
os_mutex_enter(array->mutex);
4229
slot = os_aio_array_get_nth_slot(array, i + segment * n);
4231
ut_a(slot->reserved);
4233
if (orig_seg != ULINT_UNDEFINED) {
4234
srv_set_io_thread_op_info(orig_seg,
4235
"get windows aio return value");
4238
ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
4240
*message1 = slot->message1;
4241
*message2 = slot->message2;
4245
if (ret && len == slot->len) {
4248
#ifdef UNIV_DO_FLUSH
4249
if (slot->type == OS_FILE_WRITE
4250
&& !os_do_not_call_flush_at_each_write) {
4251
if (!os_file_flush(slot->file)) {
4255
#endif /* UNIV_DO_FLUSH */
4256
} else if (os_file_handle_error(slot->name, "Windows aio")) {
4264
os_mutex_exit(array->mutex);
4267
/* retry failed read/write operation synchronously.
4268
No need to hold array->mutex. */
4271
/* This read/write does not go through os_file_read
4272
and os_file_write APIs, need to register with
4273
performance schema explicitly here. */
4274
struct PSI_file_locker* locker = NULL;
4275
register_pfs_file_io_begin(locker, slot->file, slot->len,
4276
(slot->type == OS_FILE_WRITE)
4279
__FILE__, __LINE__);
4282
switch (slot->type) {
4284
ret = WriteFile(slot->file, slot->buf,
4290
ret = ReadFile(slot->file, slot->buf,
4300
register_pfs_file_io_end(locker, len);
4303
if (!ret && GetLastError() == ERROR_IO_PENDING) {
4304
/* aio was queued successfully!
4305
We want a synchronous i/o operation on a
4306
file where we also use async i/o: in Windows
4307
we must use the same wait mechanism as for
4310
ret = GetOverlappedResult(slot->file,
4315
ret_val = ret && len == slot->len;
4318
os_aio_array_free_slot(array, slot);
4324
#if defined(LINUX_NATIVE_AIO)
4325
/******************************************************************//**
4326
This function is only used in Linux native asynchronous i/o. This is
4327
called from within the io-thread. If there are no completed IO requests
4328
in the slot array, the thread calls this function to collect more
4329
requests from the kernel.
4330
The io-thread waits on io_getevents(), which is a blocking call, with
4331
a timeout value. Unless the system is very heavily loaded, keeping the
4332
io-thread very busy, the io-thread will spend most of its time waiting
4334
The io-thread also exits in this function. It checks server status at
4335
each wakeup and that is why we use timed wait in io_getevents().
Every event that is returned is mapped back to its slot through the
iocb's data pointer; the slot then receives the event's res and res2
values and is flagged io_already_done while array->mutex is held. */
4338
os_aio_linux_collect(
4339
/*=================*/
4340
os_aio_array_t* array, /*!< in/out: slot array. */
4341
ulint segment, /*!< in: local segment no. */
4342
ulint seg_size) /*!< in: segment size. */
4348
struct timespec timeout;
4349
struct io_event* events;
4350
struct io_context* io_ctx;
4352
/* sanity checks. */
4353
ut_ad(array != NULL);
4354
ut_ad(seg_size > 0);
4355
ut_ad(segment < array->n_segments);
4357
/* Which part of event array we are going to work on. */
4358
events = &array->aio_events[segment * seg_size];
4360
/* Which io_context we are going to use. */
4361
io_ctx = array->aio_ctx[segment];
4363
/* Starting point of the segment we will be working on. */
4364
start_pos = segment * seg_size;
4367
/* One past the last slot position that belongs to this segment. */
end_pos = start_pos + seg_size;
4371
/* Go down if we are in shutdown mode.
4372
In case of srv_fast_shutdown == 2, there may be pending
4373
IO requests but that should be OK as we essentially treat
4374
that as a crash of InnoDB. */
4375
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
4376
os_thread_exit(NULL);
4379
/* Initialize the events. The timeout value is arbitrary.
4380
We probably need to experiment with it a little. */
4381
memset(events, 0, sizeof(*events) * seg_size);
4383
timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4385
/* Ask for at least 1 and at most seg_size completed events on
io_ctx, bounded by the timeout set up above. */
ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4387
/* This error handling is for any error in collecting the
4388
IO requests. The errors, if any, for any particular IO
4389
request are simply passed on to the calling routine. */
4391
/* Not enough resources! Try again. */
4392
if (ret == -EAGAIN) {
4396
/* Interrupted! I have tested the behaviour in case of an
4397
interrupt. If we have some completed IOs available then
4398
the return code will be the number of IOs. We get EINTR only
4399
if there are no completed IOs and we have been interrupted. */
4400
if (ret == -EINTR) {
4404
/* No pending request! Go back and check again. */
4409
/* All other errors! should cause a trap for now. */
4410
if (UNIV_UNLIKELY(ret < 0)) {
4411
ut_print_timestamp(stderr);
4413
" InnoDB: unexpected ret_code[%d] from"
4414
" io_getevents()!\n", ret);
4420
/* ret is now the number of events that were filled in. */
for (i = 0; i < ret; i++) {
4421
os_aio_slot_t* slot;
4422
struct iocb* control;
4424
/* The event's obj field is the iocb that was submitted. */
control = (struct iocb *)events[i].obj;
4425
ut_a(control != NULL);
4427
/* The iocb's data field carries the owning slot. */
slot = (os_aio_slot_t *) control->data;
4429
/* Some sanity checks. */
4431
ut_a(slot->reserved);
4433
#if defined(UNIV_AIO_DEBUG)
4435
"io_getevents[%c]: slot[%p] ctx[%p]"
4437
(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4438
slot, io_ctx, segment);
4441
/* We are not scribbling previous segment. */
4442
ut_a(slot->pos >= start_pos);
4444
/* We have not overstepped to next segment. */
4445
ut_a(slot->pos < end_pos);
4447
/* Mark this request as completed. The error handling
4448
will be done in the calling function. */
4449
os_mutex_enter(array->mutex);
4450
slot->n_bytes = events[i].res;
4451
slot->ret = events[i].res2;
4452
slot->io_already_done = TRUE;
4453
os_mutex_exit(array->mutex);
4459
/**********************************************************************//**
4460
This function is only used in Linux native asynchronous i/o.
4461
Waits for an aio operation to complete. This function is used to wait for
4462
the completed requests. The aio array of pending requests is divided
4463
into segments. The thread specifies which segment or slot it wants to wait
4464
for. NOTE: this function will also take care of freeing the aio slot,
4465
therefore no other thread is allowed to do the freeing!
4466
The slots of the segment are scanned under array->mutex for one that is
both reserved and io_already_done; when none is found the mutex is
released and os_aio_linux_collect() is called to gather completions.
A request is treated as successful only when slot->ret is 0 and
slot->n_bytes equals slot->len; otherwise os_file_handle_error() is
called for the slot.
@return TRUE if the IO was successful */
4469
os_aio_linux_handle(
4470
/*================*/
4471
ulint global_seg, /*!< in: segment number in the aio array
4472
to wait for; segment 0 is the ibuf
4473
i/o thread, segment 1 is log i/o thread,
4474
then follow the non-ibuf read threads,
4475
and the last are the non-ibuf write
4477
fil_node_t**message1, /*!< out: the messages passed with the */
4478
void** message2, /*!< aio request; note that in case the
4479
aio operation failed, these output
4480
parameters are valid and can be used to
4481
restart the operation. */
4482
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4485
os_aio_array_t* array;
4486
os_aio_slot_t* slot;
4491
/* Should never be doing Sync IO here. */
4492
ut_a(global_seg != ULINT_UNDEFINED);
4494
/* Find the array and the local segment. */
4495
segment = os_aio_get_array_and_local_segment(&array, global_seg);
4496
/* Number of slots in each segment of this array. */
n = array->n_slots / array->n_segments;
4498
/* Loop until we have found a completed request. */
4500
os_mutex_enter(array->mutex);
4501
for (i = 0; i < n; ++i) {
4502
slot = os_aio_array_get_nth_slot(
4503
array, i + segment * n);
4504
if (slot->reserved && slot->io_already_done) {
4505
/* Something for us to work on. */
4510
os_mutex_exit(array->mutex);
4512
/* We don't have any completed request.
4513
Wait for some request. Note that we return
4514
from wait iff we have found a request. */
4516
srv_set_io_thread_op_info(global_seg,
4517
"waiting for completed aio requests");
4518
os_aio_linux_collect(array, segment, n);
4522
/* Note that it may be that there are more than one completed
4523
IO requests. We process them one at a time. We may have a case
4524
here to improve the performance slightly by dealing with all
4525
requests in one sweep. */
4526
srv_set_io_thread_op_info(global_seg,
4527
"processing completed aio requests");
4529
/* Ensure that we are scribbling only our segment. */
4532
ut_ad(slot != NULL);
4533
ut_ad(slot->reserved);
4534
ut_ad(slot->io_already_done);
4536
*message1 = slot->message1;
4537
*message2 = slot->message2;
4541
if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
4544
#ifdef UNIV_DO_FLUSH
4545
/* After a completed write, flush the file unless flushing at
each write has been switched off. All three tests form one
condition, so the parenthesis closes after the last of them. */
if (slot->type == OS_FILE_WRITE
4546
&& !os_do_not_call_flush_at_each_write
4547
&& !os_file_flush(slot->file)) {
4550
#endif /* UNIV_DO_FLUSH */
4554
/* os_file_handle_error does tell us if we should retry
4555
this IO. As it stands now, we don't do this retry when
4556
reaping requests from a different context than
4557
the dispatcher. This non-retry logic is the same for
4558
windows and linux native AIO.
4559
We should probably look into this to transparently
4560
re-submit the IO. */
4561
os_file_handle_error(slot->name, "Linux aio");
4566
os_mutex_exit(array->mutex);
4568
os_aio_array_free_slot(array, slot);
4572
#endif /* LINUX_NATIVE_AIO */
4574
/**********************************************************************//**
4575
Does simulated aio. This function should be called by an i/o-handler
4577
A reserved slot of the segment is chosen (a request that has waited at
least 2 seconds takes priority, otherwise the one at the lowest offset),
adjacent requests of the same type on the same file are gathered into
consecutive_ios, and the whole run is served with one synchronous
os_file_write() or os_file_read() call before the slots are flagged
io_already_done.
@return TRUE if the aio operation succeeded */
4580
os_aio_simulated_handle(
4581
/*====================*/
4582
ulint global_segment, /*!< in: the number of the segment in the aio
4583
arrays to wait for; segment 0 is the ibuf
4584
i/o thread, segment 1 the log i/o thread,
4585
then follow the non-ibuf read threads, and as
4586
the last are the non-ibuf write threads */
4587
fil_node_t**message1, /*!< out: the messages passed with the aio
4588
request; note that also in the case where
4589
the aio operation failed, these output
4590
parameters are valid and can be used to
4591
restart the operation, for example */
4593
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4595
os_aio_array_t* array;
4597
os_aio_slot_t* slot;
4598
os_aio_slot_t* slot2;
4599
os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
4600
ulint n_consecutive;
4603
ulint lowest_offset;
4607
byte* combined_buf2;
4612
/* Fix compiler warning */
4613
*consecutive_ios = NULL;
4615
/* Clear every entry of consecutive_ios, not only the first one. */
memset(consecutive_ios, 0, sizeof(os_aio_slot_t*) * OS_AIO_MERGE_N_CONSECUTIVE);
4616
segment = os_aio_get_array_and_local_segment(&array, global_segment);
4619
/* NOTE! We only access constant fields in os_aio_array. Therefore
4620
we do not have to acquire the protecting mutex yet */
4622
srv_set_io_thread_op_info(global_segment,
4623
"looking for i/o requests (a)");
4624
ut_ad(os_aio_validate());
4625
ut_ad(segment < array->n_segments);
4627
/* Number of slots in each segment of this array. */
n = array->n_slots / array->n_segments;
4629
/* Look through n slots after the segment * n'th slot */
4631
if (array == os_aio_read_array
4632
&& os_aio_recommend_sleep_for_read_threads) {
4634
/* Give other threads chance to add several i/os to the array
4637
goto recommended_sleep;
4640
os_mutex_enter(array->mutex);
4642
srv_set_io_thread_op_info(global_segment,
4643
"looking for i/o requests (b)");
4645
/* Check if there is a slot for which the i/o has already been
4648
for (i = 0; i < n; i++) {
4649
slot = os_aio_array_get_nth_slot(array, i + segment * n);
4651
if (slot->reserved && slot->io_already_done) {
4653
if (os_aio_print_debug) {
4655
"InnoDB: i/o for slot %lu"
4656
" already done, returning\n",
4668
/* If there are at least 2 seconds old requests, then pick the oldest
4669
one to prevent starvation. If several requests have the same age,
4670
then pick the one at the lowest offset. */
4673
lowest_offset = ULINT_MAX;
4675
for (i = 0; i < n; i++) {
4676
slot = os_aio_array_get_nth_slot(array, i + segment * n);
4678
if (slot->reserved) {
4679
/* Whole seconds elapsed since slot->reservation_time. */
age = (ulint)difftime(time(NULL),
4680
slot->reservation_time);
4682
if ((age >= 2 && age > biggest_age)
4683
|| (age >= 2 && age == biggest_age
4684
&& slot->offset < lowest_offset)) {
4686
/* Found an i/o request */
4687
consecutive_ios[0] = slot;
4692
lowest_offset = slot->offset;
4697
if (n_consecutive == 0) {
4698
/* There were no old requests. Look for an i/o request at the
4699
lowest offset in the array (we ignore the high 32 bits of the
4700
offset in these heuristics) */
4702
lowest_offset = ULINT_MAX;
4704
for (i = 0; i < n; i++) {
4705
slot = os_aio_array_get_nth_slot(array,
4708
if (slot->reserved && slot->offset < lowest_offset) {
4710
/* Found an i/o request */
4711
consecutive_ios[0] = slot;
4715
lowest_offset = slot->offset;
4720
if (n_consecutive == 0) {
4722
/* No i/o requested at the moment */
4727
/* if n_consecutive != 0, then we have assigned
4728
something valid to consecutive_ios[0] */
4729
ut_ad(n_consecutive != 0);
4730
ut_ad(consecutive_ios[0] != NULL);
4732
slot = consecutive_ios[0];
4734
/* Check if there are several consecutive blocks to read or write */
4737
for (i = 0; i < n; i++) {
4738
slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
4740
if (slot2->reserved && slot2 != slot
4741
&& slot2->offset == slot->offset + slot->len
4742
/* check that sum does not wrap over */
4743
&& slot->offset + slot->len > slot->offset
4744
&& slot2->offset_high == slot->offset_high
4745
&& slot2->type == slot->type
4746
&& slot2->file == slot->file) {
4748
/* Found a consecutive i/o request */
4750
consecutive_ios[n_consecutive] = slot2;
4755
/* Rescan for a further adjacent request only while
consecutive_ios still has a free entry. */
if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
4757
goto consecutive_loop;
4764
srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
4766
/* We have now collected n_consecutive i/o requests in the array;
4767
allocate a single buffer which can hold all data, and perform the
4771
slot = consecutive_ios[0];
4773
for (i = 0; i < n_consecutive; i++) {
4774
total_len += consecutive_ios[i]->len;
4777
if (n_consecutive == 1) {
4778
/* We can use the buffer of the i/o request */
4779
combined_buf = slot->buf;
4780
combined_buf2 = NULL;
4782
/* One extra page is allocated so that the buffer can be aligned
to UNIV_PAGE_SIZE just below. */
combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
4784
ut_a(combined_buf2);
4786
combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
4789
/* We release the array mutex for the time of the i/o: NOTE that
4790
this assumes that there is just one i/o-handler thread serving
4791
a single segment of slots! */
4793
os_mutex_exit(array->mutex);
4795
if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
4796
/* Copy the buffers to the combined buffer */
4799
for (i = 0; i < n_consecutive; i++) {
4801
ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
4802
consecutive_ios[i]->len);
4803
offs += consecutive_ios[i]->len;
4807
srv_set_io_thread_op_info(global_segment, "doing file i/o");
4809
if (os_aio_print_debug) {
4811
"InnoDB: doing i/o of type %lu at offset %lu %lu,"
4813
(ulong) slot->type, (ulong) slot->offset_high,
4814
(ulong) slot->offset, (ulong) total_len);
4817
/* Do the i/o with ordinary, synchronous i/o functions: */
4818
if (slot->type == OS_FILE_WRITE) {
4819
ret = os_file_write(slot->name, slot->file, combined_buf,
4820
slot->offset, slot->offset_high,
4823
ret = os_file_read(slot->file, combined_buf,
4824
slot->offset, slot->offset_high, total_len);
4828
srv_set_io_thread_op_info(global_segment, "file i/o done");
4832
"aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
4833
n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
4836
if (slot->type == OS_FILE_READ && n_consecutive > 1) {
4837
/* Copy the combined buffer to individual buffers */
4840
for (i = 0; i < n_consecutive; i++) {
4842
ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
4843
consecutive_ios[i]->len);
4844
offs += consecutive_ios[i]->len;
4848
/* combined_buf2 is NULL when the request's own buffer was used. */
if (combined_buf2) {
4849
ut_free(combined_buf2);
4852
os_mutex_enter(array->mutex);
4854
/* Mark the i/os done in slots */
4856
for (i = 0; i < n_consecutive; i++) {
4857
consecutive_ios[i]->io_already_done = TRUE;
4860
/* We return the messages for the first slot now, and if there were
4861
several slots, the messages will be returned with subsequent calls
4866
ut_a(slot->reserved);
4868
*message1 = slot->message1;
4869
*message2 = slot->message2;
4873
os_mutex_exit(array->mutex);
4875
os_aio_array_free_slot(array, slot);
4880
srv_set_io_thread_op_info(global_segment, "resetting wait event");
4882
/* We wait here until there again can be i/os in the segment
4885
os_event_reset(os_aio_segment_wait_events[global_segment]);
4887
os_mutex_exit(array->mutex);
4890
srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
4892
os_event_wait(os_aio_segment_wait_events[global_segment]);
4894
if (os_aio_print_debug) {
4896
"InnoDB: i/o handler thread for i/o"
4897
" segment %lu wakes up\n",
4898
(ulong) global_segment);
4904
/**********************************************************************//**
4905
Validates the consistency of an aio array.
4906
While holding array->mutex, asserts that the array has at least one slot
and one segment, that every reserved slot has a non-zero len, and that
array->n_reserved agrees with the local n_reserved tally.
@return TRUE if ok */
4909
os_aio_array_validate(
4910
/*==================*/
4911
os_aio_array_t* array) /*!< in: aio wait array */
4913
os_aio_slot_t* slot;
4914
ulint n_reserved = 0;
4919
os_mutex_enter(array->mutex);
4921
ut_a(array->n_slots > 0);
4922
ut_a(array->n_segments > 0);
4924
/* Visit every slot of the array, regardless of segment. */
for (i = 0; i < array->n_slots; i++) {
4925
slot = os_aio_array_get_nth_slot(array, i);
4927
if (slot->reserved) {
4929
ut_a(slot->len > 0);
4933
ut_a(array->n_reserved == n_reserved);
4935
os_mutex_exit(array->mutex);
4940
/**********************************************************************//**
4941
Validates the consistency of the aio system.
4942
Runs os_aio_array_validate() on the read, write, ibuf, log and sync
arrays, in that order.
@return TRUE if ok */
4945
os_aio_validate(void)
4946
/*=================*/
4948
os_aio_array_validate(os_aio_read_array);
4949
os_aio_array_validate(os_aio_write_array);
4950
os_aio_array_validate(os_aio_ibuf_array);
4951
os_aio_array_validate(os_aio_log_array);
4952
os_aio_array_validate(os_aio_sync_array);
4957
/**********************************************************************//**
4958
Prints pending IO requests per segment of an aio array.
4959
We probably don't need per segment statistics but they can help us
4960
during development phase to see if the IO requests are being
4961
distributed as expected.
The counts are written to file as a bracketed, comma separated list with
one entry per segment of the array. */
4964
os_aio_print_segment_info(
4965
/*======================*/
4966
FILE* file, /*!< in: file where to print */
4967
ulint* n_seg, /*!< in: pending IO array */
4968
os_aio_array_t* array) /*!< in: array to process */
4974
ut_ad(array->n_segments > 0);
4976
if (array->n_segments == 1) {
4980
fprintf(file, " [");
4981
for (i = 0; i < array->n_segments; i++) {
4983
fprintf(file, ", ");
4986
/* Cast to ulong so that the argument matches the %lu conversion,
as the other %lu printouts in this file do. */
fprintf(file, "%lu", (ulong) n_seg[i]);
4988
fprintf(file, "] ");
4991
/**********************************************************************//**
4992
Prints info of the aio arrays.
The output covers the state of each i/o thread, the number of reserved
slots in the read, write, ibuf, log and sync arrays (with a per-segment
breakdown), the pending flush and file operation counters, and the
per-second rates since the previous printout. The *_old counters and
os_last_printout are then updated as the baseline for the next call. */
4997
FILE* file) /*!< in: file where to print */
4999
os_aio_array_t* array;
5000
os_aio_slot_t* slot;
5002
ulint n_res_seg[SRV_MAX_N_IO_THREADS];
5003
time_t current_time;
5004
double time_elapsed;
5005
double avg_bytes_read;
5008
for (i = 0; i < srv_n_file_io_threads; i++) {
5009
fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
5010
srv_io_thread_op_info[i],
5011
srv_io_thread_function[i]);
5014
if (os_aio_segment_wait_events[i]->is_set) {
5015
fprintf(file, " ev set");
5019
fprintf(file, "\n");
5022
fputs("Pending normal aio reads:", file);
5024
array = os_aio_read_array;
5028
os_mutex_enter(array->mutex);
5030
ut_a(array->n_slots > 0);
5031
ut_a(array->n_segments > 0);
5035
/* Start the per-segment tally from zero for this array. */
memset(n_res_seg, 0x0, sizeof(n_res_seg));
5037
for (i = 0; i < array->n_slots; i++) {
5040
slot = os_aio_array_get_nth_slot(array, i);
5042
/* Index of the segment that slot i falls into. */
seg_no = (i * array->n_segments) / array->n_slots;
5043
if (slot->reserved) {
5045
n_res_seg[seg_no]++;
5047
/* NOTE(review): this line goes to stderr rather than to file;
confirm that it is debug-only output. */
fprintf(stderr, "Reserved slot, messages %p %p\n",
5048
(void*) slot->message1,
5049
(void*) slot->message2);
5051
ut_a(slot->len > 0);
5055
ut_a(array->n_reserved == n_reserved);
5057
fprintf(file, " %lu", (ulong) n_reserved);
5059
os_aio_print_segment_info(file, n_res_seg, array);
5061
os_mutex_exit(array->mutex);
5063
/* Step to the next array: read, write, ibuf, log, then sync. */
if (array == os_aio_read_array) {
5064
fputs(", aio writes:", file);
5066
array = os_aio_write_array;
5071
if (array == os_aio_write_array) {
5072
fputs(",\n ibuf aio reads:", file);
5073
array = os_aio_ibuf_array;
5078
if (array == os_aio_ibuf_array) {
5079
fputs(", log i/o's:", file);
5080
array = os_aio_log_array;
5085
if (array == os_aio_log_array) {
5086
fputs(", sync i/o's:", file);
5087
array = os_aio_sync_array;
5093
current_time = time(NULL);
5094
/* Seconds since the previous printout, offset by 0.001; presumably
the offset keeps the rate calculations away from a zero divisor. */
time_elapsed = 0.001 + difftime(current_time, os_last_printout);
5097
"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
5098
"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
5099
(ulong) fil_n_pending_log_flushes,
5100
(ulong) fil_n_pending_tablespace_flushes,
5101
(ulong) os_n_file_reads, (ulong) os_n_file_writes,
5102
(ulong) os_n_fsyncs);
5104
if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
5106
"%lu pending preads, %lu pending pwrites\n",
5107
(ulong) os_file_n_pending_preads,
5108
(ulong) os_file_n_pending_pwrites);
5111
/* With no reads since the last printout the division below would
have a zero divisor, so the average is reported as 0. */
if (os_n_file_reads == os_n_file_reads_old) {
5112
avg_bytes_read = 0.0;
5114
avg_bytes_read = (double) os_bytes_read_since_printout
5115
/ (os_n_file_reads - os_n_file_reads_old);
5119
"%.2f reads/s, %lu avg bytes/read,"
5120
" %.2f writes/s, %.2f fsyncs/s\n",
5121
(os_n_file_reads - os_n_file_reads_old)
5123
(ulong)avg_bytes_read,
5124
(os_n_file_writes - os_n_file_writes_old)
5126
(os_n_fsyncs - os_n_fsyncs_old)
5129
/* Keep the current counters as the baseline for the next printout. */
os_n_file_reads_old = os_n_file_reads;
5130
os_n_file_writes_old = os_n_file_writes;
5131
os_n_fsyncs_old = os_n_fsyncs;
5132
os_bytes_read_since_printout = 0;
5134
os_last_printout = current_time;
5137
/**********************************************************************//**
5138
Refreshes the statistics used to print per-second averages.
Copies the current read, write and fsync counters into their *_old
counterparts, zeroes os_bytes_read_since_printout and sets
os_last_printout to the current time. */
5141
os_aio_refresh_stats(void)
5142
/*======================*/
5144
os_n_file_reads_old = os_n_file_reads;
5145
os_n_file_writes_old = os_n_file_writes;
5146
os_n_fsyncs_old = os_n_fsyncs;
5147
os_bytes_read_since_printout = 0;
5149
os_last_printout = time(NULL);
5153
/**********************************************************************//**
5154
Checks that all slots in the system have been freed, that is, there are
5155
no pending io operations.
5156
The n_reserved counts of the read, write, ibuf, log and sync arrays are
added up in n_res, each one read while that array's mutex is held.
@return TRUE if all free */
5159
os_aio_all_slots_free(void)
5160
/*=======================*/
5162
os_aio_array_t* array;
5165
array = os_aio_read_array;
5167
os_mutex_enter(array->mutex);
5169
n_res += array->n_reserved;
5171
os_mutex_exit(array->mutex);
5173
array = os_aio_write_array;
5175
os_mutex_enter(array->mutex);
5177
n_res += array->n_reserved;
5179
os_mutex_exit(array->mutex);
5181
array = os_aio_ibuf_array;
5183
os_mutex_enter(array->mutex);
5185
n_res += array->n_reserved;
5187
os_mutex_exit(array->mutex);
5189
array = os_aio_log_array;
5191
os_mutex_enter(array->mutex);
5193
n_res += array->n_reserved;
5195
os_mutex_exit(array->mutex);
5197
array = os_aio_sync_array;
5199
os_mutex_enter(array->mutex);
5201
n_res += array->n_reserved;
5203
os_mutex_exit(array->mutex);
5212
#endif /* UNIV_DEBUG */
5214
#endif /* !UNIV_HOTBACKUP */