1
/*****************************************************************************
3
Copyright (C) 1995, 2010, Innobase Oy. All Rights Reserved.
4
Copyright (C) 2009, Percona Inc.
6
Portions of this file contain modifications contributed and copyrighted
7
by Percona Inc.. Those modifications are
8
gratefully acknowledged and are described briefly in the InnoDB
9
documentation. The contributions by Percona Inc. are incorporated with
10
their permission, and subject to the conditions contained in the file
13
This program is free software; you can redistribute it and/or modify it under
14
the terms of the GNU General Public License as published by the Free Software
15
Foundation; version 2 of the License.
17
This program is distributed in the hope that it will be useful, but WITHOUT
18
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21
You should have received a copy of the GNU General Public License along with
22
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
23
St, Fifth Floor, Boston, MA 02110-1301 USA
25
*****************************************************************************/
27
/**************************************************//**
29
The interface to the operating system file i/o primitives
31
Created 10/21/1995 Heikki Tuuri
32
*******************************************************/
42
#include "srv0start.h"
49
#ifndef UNIV_HOTBACKUP
51
# include "os0thread.h"
52
#else /* !UNIV_HOTBACKUP */
54
/* Add includes for the _stat() call to compile on Windows */
55
# include <sys/types.h>
56
# include <sys/stat.h>
58
#endif /* !UNIV_HOTBACKUP */
60
#if defined(LINUX_NATIVE_AIO)
64
/* This specifies the file permissions InnoDB uses when it creates files in
65
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
69
/** Umask for creating files */
70
UNIV_INTERN ulint os_innodb_umask
71
= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
73
/** Umask for creating files */
74
UNIV_INTERN ulint os_innodb_umask = 0;
78
/* If the following is set to TRUE, we do not call os_file_flush in every
79
os_file_write. We can set this TRUE when the doublewrite buffer is used. */
80
UNIV_INTERN ibool os_do_not_call_flush_at_each_write = FALSE;
82
/* We do not call os_file_flush in every os_file_write. */
83
#endif /* UNIV_DO_FLUSH */
85
#ifndef UNIV_HOTBACKUP
86
/* We use these mutexes to protect lseek + file i/o operation, if the
87
OS does not provide an atomic pread or pwrite, or similar */
88
#define OS_FILE_N_SEEK_MUTEXES 16
89
UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
91
/* In simulated aio, merge at most this many consecutive i/os */
92
#define OS_AIO_MERGE_N_CONSECUTIVE 64
94
/**********************************************************************
96
InnoDB AIO Implementation:
97
=========================
99
We support native AIO for Windows and Linux. For the rest of the platforms
100
we simulate AIO by special io-threads servicing the IO-requests.
105
On platforms where we 'simulate' AIO, the following is a rough explanation
106
of the high level design.
107
There are four io-threads (for ibuf, log, read, write).
108
All synchronous IO requests are serviced by the calling thread using
109
os_file_write/os_file_read. The Asynchronous requests are queued up
110
in an array (there are four such arrays) by the calling thread.
111
Later these requests are picked up by the io-thread and are serviced
117
If srv_use_native_aio is not set then Windows follows the same
118
code as simulated AIO. If the flag is set then native AIO interface
119
is used. On Windows, one of the limitations is that if a file is opened
120
for AIO no synchronous IO can be done on it. Therefore we have an
121
extra fifth array to queue up synchronous IO requests.
122
There are innodb_file_io_threads helper threads. These threads work
123
on the four arrays mentioned above in Simulated AIO. No thread is
124
required for the sync array.
125
If a synchronous IO request is made, it is first queued in the sync
126
array. Then the calling thread itself waits on the request, thus
127
making the call synchronous.
128
If an AIO request is made the calling thread not only queues it in the
129
array but also submits the requests. The helper thread then collects
130
the completed IO request and calls completion routine on it.
135
If we have libaio installed on the system and innodb_use_native_aio
136
is set to TRUE we follow the code path of native AIO, otherwise we
138
There are innodb_file_io_threads helper threads. These threads work
139
on the four arrays mentioned above in Simulated AIO.
140
If a synchronous IO request is made, it is handled by calling
141
os_file_write/os_file_read.
142
If an AIO request is made the calling thread not only queues it in the
143
array but also submits the requests. The helper thread then collects
144
the completed IO request and calls completion routine on it.
146
**********************************************************************/
148
/** Flag: enable debug printout for asynchronous i/o */
149
UNIV_INTERN ibool os_aio_print_debug = FALSE;
152
/* Keys to register InnoDB I/O with performance schema */
153
UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
154
UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
155
UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
156
#endif /* UNIV_PFS_IO */
158
/** The asynchronous i/o array slot structure */
159
typedef struct os_aio_slot_struct os_aio_slot_t;
161
/** The asynchronous i/o array slot structure */
162
struct os_aio_slot_struct{
163
ibool is_read; /*!< TRUE if a read operation */
164
ulint pos; /*!< index of the slot in the aio
166
ibool reserved; /*!< TRUE if this slot is reserved */
167
time_t reservation_time;/*!< time when reserved */
168
ulint len; /*!< length of the block to read or
170
byte* buf; /*!< buffer used in i/o */
171
ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
172
ulint offset; /*!< 32 low bits of file offset in
174
ulint offset_high; /*!< 32 high bits of file offset */
175
os_file_t file; /*!< file where to read or write */
176
const char* name; /*!< file name or path */
177
ibool io_already_done;/*!< used only in simulated aio:
178
TRUE if the physical i/o already
179
made and only the slot message
180
needs to be passed to the caller
181
of os_aio_simulated_handle */
182
fil_node_t* message1; /*!< message which is given by the */
183
void* message2; /*!< the requester of an aio operation
184
and which can be used to identify
185
which pending aio operation was
188
HANDLE handle; /*!< handle object we need in the
190
OVERLAPPED control; /*!< Windows control block for the
192
#elif defined(LINUX_NATIVE_AIO)
193
struct iocb control; /* Linux control block for aio */
194
int n_bytes; /* bytes written/read. */
195
int ret; /* AIO return code */
199
/** The asynchronous i/o array structure */
200
typedef struct os_aio_array_struct os_aio_array_t;
202
/** The asynchronous i/o array structure */
203
struct os_aio_array_struct{
204
os_mutex_t mutex; /*!< the mutex protecting the aio array */
206
/*!< The event which is set to the
207
signaled state when there is space in
208
the aio outside the ibuf segment */
210
/*!< The event which is set to the
211
signaled state when there are no
212
pending i/os in this array */
213
ulint n_slots;/*!< Total number of slots in the aio
214
array. This must be divisible by
217
/*!< Number of segments in the aio
218
array of pending aio requests. A
219
thread can wait separately for any one
221
ulint cur_seg;/*!< We reserve IO requests in round
222
robin fashion to different segments.
223
This points to the segment that is to
224
be used to service next IO request. */
226
/*!< Number of reserved slots in the
227
aio array outside the ibuf segment */
228
os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
231
/*!< Pointer to an array of OS native
232
event handles where we copied the
233
handles from slots, in the same
234
order. This can be used in
235
WaitForMultipleObjects; used only in
239
#if defined(LINUX_NATIVE_AIO)
240
io_context_t* aio_ctx;
241
/* completion queue for IO. There is
242
one such queue per segment. Each thread
243
will work on one ctx exclusively. */
244
struct io_event* aio_events;
245
/* The array to collect completed IOs.
246
There is one such event for each
247
possible pending IO. The size of the
248
array is equal to n_slots. */
252
#if defined(LINUX_NATIVE_AIO)
253
/** timeout for each io_getevents() call = 500ms. */
254
#define OS_AIO_REAP_TIMEOUT (500000000UL)
256
/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
257
#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
259
/** number of attempts before giving up on io_setup(). */
260
#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
263
/** Array of events used in simulated aio */
264
static os_event_t* os_aio_segment_wait_events = NULL;
266
/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
267
are NULL when the module has not yet been initialized. @{ */
268
static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */
269
static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */
270
static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */
271
static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */
272
static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */
275
/** Number of asynchronous I/O segments. Set by os_aio_init(). */
276
static ulint os_aio_n_segments = ULINT_UNDEFINED;
278
/** If the following is TRUE, read i/o handler threads try to
279
wait until a batch of new read requests have been posted */
280
static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
281
#endif /* !UNIV_HOTBACKUP */
283
UNIV_INTERN ulint os_n_file_reads = 0;
284
UNIV_INTERN ulint os_bytes_read_since_printout = 0;
285
UNIV_INTERN ulint os_n_file_writes = 0;
286
UNIV_INTERN ulint os_n_fsyncs = 0;
287
UNIV_INTERN ulint os_n_file_reads_old = 0;
288
UNIV_INTERN ulint os_n_file_writes_old = 0;
289
UNIV_INTERN ulint os_n_fsyncs_old = 0;
290
UNIV_INTERN time_t os_last_printout;
292
UNIV_INTERN ibool os_has_said_disk_full = FALSE;
294
#ifndef UNIV_HOTBACKUP
295
/** The mutex protecting the following counts of pending I/O operations */
296
static os_mutex_t os_file_count_mutex;
297
#endif /* !UNIV_HOTBACKUP */
298
/** Number of pending os_file_pread() operations */
299
UNIV_INTERN ulint os_file_n_pending_preads = 0;
300
/** Number of pending os_file_pwrite() operations */
301
UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
302
/** Number of pending write operations */
303
UNIV_INTERN ulint os_n_pending_writes = 0;
304
/** Number of pending read operations */
305
UNIV_INTERN ulint os_n_pending_reads = 0;
307
/***********************************************************************//**
308
Gets the operating system version. Currently works only on Windows.
309
@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
313
os_get_os_version(void)
314
/*===================*/
317
OSVERSIONINFO os_info;
319
os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
321
ut_a(GetVersionEx(&os_info));
323
if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
325
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
327
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
328
switch (os_info.dwMajorVersion) {
333
return (os_info.dwMinorVersion == 0) ? OS_WIN2000
336
return (os_info.dwMinorVersion == 0) ? OS_WINVISTA
352
/***********************************************************************//**
353
Retrieves the last error number if an error occurs in a file io function.
354
The number should be retrieved before any other OS calls (because they may
355
overwrite the error number). If the number is not known to this program,
356
the OS error number + 100 is returned.
Two variants are visible below: one built on GetLastError() and the
ERROR_* codes, and one built on errno-style codes (ENOSPC, EEXIST, EACCES).
Each variant emits a timestamped diagnostic (ut_print_timestamp(stderr) and
the messages that follow it) unless the error is a disk-full or file-exists
condition and report_all_errors is FALSE, and then maps the OS code to an
OS_FILE_* value.
357
@return error number, or OS error number + 100 */
360
os_file_get_last_error(
361
/*===================*/
362
ibool report_all_errors) /*!< in: TRUE if we want an error message
363
printed of all errors */
369
/* GetLastError() variant: capture the code first. */
err = (ulint) GetLastError();
371
/* Disk-full and file-exists are only reported when the caller asked for
all errors. */
if (report_all_errors
372
|| (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
374
ut_print_timestamp(stderr);
376
" InnoDB: Operating system error number %lu"
377
" in a file operation.\n", (ulong) err);
379
if (err == ERROR_PATH_NOT_FOUND) {
381
"InnoDB: The error means the system"
382
" cannot find the path specified.\n");
384
/* Extra hint shown only while srv_is_being_started is set. */
if (srv_is_being_started) {
386
"InnoDB: If you are installing InnoDB,"
387
" remember that you must create\n"
388
"InnoDB: directories yourself, InnoDB"
389
" does not create them.\n");
391
} else if (err == ERROR_ACCESS_DENIED) {
393
"InnoDB: The error means mysqld does not have"
394
" the access rights to\n"
395
"InnoDB: the directory. It may also be"
396
" you have created a subdirectory\n"
397
"InnoDB: of the same name as a data file.\n");
398
} else if (err == ERROR_SHARING_VIOLATION
399
|| err == ERROR_LOCK_VIOLATION) {
401
"InnoDB: The error means that another program"
402
" is using InnoDB's files.\n"
403
"InnoDB: This might be a backup or antivirus"
404
" software or another instance\n"
406
" Please close it to get rid of this error.\n");
407
} else if (err == ERROR_WORKING_SET_QUOTA
408
|| err == ERROR_NO_SYSTEM_RESOURCES) {
410
"InnoDB: The error means that there are no"
411
" sufficient system resources or quota to"
412
" complete the operation.\n");
413
} else if (err == ERROR_OPERATION_ABORTED) {
415
"InnoDB: The error means that the I/O"
416
" operation has been aborted\n"
417
"InnoDB: because of either a thread exit"
418
" or an application request.\n"
419
"InnoDB: Retry attempt is made.\n");
422
"InnoDB: Some operating system error numbers"
423
" are described at\n"
426
"operating-system-error-codes.html\n");
432
/* Translate the ERROR_* code into an OS_FILE_* code. */
if (err == ERROR_FILE_NOT_FOUND) {
433
return(OS_FILE_NOT_FOUND);
434
} else if (err == ERROR_DISK_FULL) {
435
return(OS_FILE_DISK_FULL);
436
} else if (err == ERROR_FILE_EXISTS) {
437
return(OS_FILE_ALREADY_EXISTS);
438
} else if (err == ERROR_SHARING_VIOLATION
439
|| err == ERROR_LOCK_VIOLATION) {
440
return(OS_FILE_SHARING_VIOLATION);
441
} else if (err == ERROR_WORKING_SET_QUOTA
442
|| err == ERROR_NO_SYSTEM_RESOURCES) {
443
return(OS_FILE_INSUFFICIENT_RESOURCE);
444
} else if (err == ERROR_OPERATION_ABORTED) {
445
return(OS_FILE_OPERATION_ABORTED);
452
/* errno-style variant: same reporting rule, with ENOSPC and EEXIST as the
quiet cases. */
if (report_all_errors
453
|| (err != ENOSPC && err != EEXIST)) {
455
ut_print_timestamp(stderr);
457
" InnoDB: Operating system error number %lu"
458
" in a file operation.\n", (ulong) err);
462
"InnoDB: The error means the system"
463
" cannot find the path specified.\n");
465
if (srv_is_being_started) {
467
"InnoDB: If you are installing InnoDB,"
468
" remember that you must create\n"
469
"InnoDB: directories yourself, InnoDB"
470
" does not create them.\n");
472
} else if (err == EACCES) {
474
"InnoDB: The error means mysqld does not have"
475
" the access rights to\n"
476
"InnoDB: the directory.\n");
478
/* Add the strerror() text for the code when one is available. */
if (strerror((int)err) != NULL) {
480
"InnoDB: Error number %lu"
482
err, strerror((int)err));
486
"InnoDB: Some operating system"
487
" error numbers are described at\n"
490
"operating-system-error-codes.html\n");
498
return(OS_FILE_DISK_FULL);
500
return(OS_FILE_NOT_FOUND);
502
return(OS_FILE_ALREADY_EXISTS);
506
return(OS_FILE_PATH_ERROR);
508
/* The two AIO-specific codes are returned only when srv_use_native_aio
is set. */
if (srv_use_native_aio) {
509
return(OS_FILE_AIO_RESOURCES_RESERVED);
513
if (srv_use_native_aio) {
514
return(OS_FILE_AIO_INTERRUPTED);
522
/****************************************************************//**
523
Does error handling when a file operation fails.
524
Conditionally exits (calling exit(3)) based on the should_exit value and the
526
kind of error: exit(3) is called for an unknown error when should_exit is
TRUE. The error code is obtained from os_file_get_last_error(FALSE) and
dispatched on its OS_FILE_* value.
@return TRUE if we should retry the operation */
529
os_file_handle_error_cond_exit(
530
/*===========================*/
531
const char* name, /*!< in: name of a file or NULL */
532
const char* operation, /*!< in: operation */
533
ibool should_exit) /*!< in: call exit(3) if unknown error
534
and this parameter is TRUE */
538
/* FALSE here is os_file_get_last_error()'s report_all_errors argument. */
err = os_file_get_last_error(FALSE);
540
if (err == OS_FILE_DISK_FULL) {
541
/* We only print a warning about disk full once */
543
if (os_has_said_disk_full) {
549
ut_print_timestamp(stderr);
551
" InnoDB: Encountered a problem with"
555
ut_print_timestamp(stderr);
557
" InnoDB: Disk is full. Try to clean the disk"
558
" to free space.\n");
560
/* Record that the disk-full warning has now been issued. */
os_has_said_disk_full = TRUE;
565
} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
568
} else if (err == OS_FILE_AIO_INTERRUPTED) {
571
} else if (err == OS_FILE_ALREADY_EXISTS
572
|| err == OS_FILE_PATH_ERROR) {
575
} else if (err == OS_FILE_SHARING_VIOLATION) {
577
os_thread_sleep(10000000); /* 10 sec */
579
} else if (err == OS_FILE_INSUFFICIENT_RESOURCE) {
581
os_thread_sleep(100000); /* 100 ms */
583
} else if (err == OS_FILE_OPERATION_ABORTED) {
585
os_thread_sleep(100000); /* 100 ms */
589
/* NOTE(review): presumably the path for errors that none of the branches
above recognised; the enclosing branch is not visible here. */
fprintf(stderr, "InnoDB: File name %s\n", name);
592
fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
596
fprintf(stderr, "InnoDB: Cannot continue operation.\n");
607
/****************************************************************//**
608
Does error handling when a file operation fails.
Thin wrapper: forwards name and operation to
os_file_handle_error_cond_exit() with should_exit = TRUE.
609
@return TRUE if we should retry the operation */
612
os_file_handle_error(
613
/*=================*/
614
const char* name, /*!< in: name of a file or NULL */
615
const char* operation)/*!< in: operation */
617
/* exit in case of unknown error */
618
return(os_file_handle_error_cond_exit(name, operation, TRUE));
621
/****************************************************************//**
622
Does error handling when a file operation fails.
Thin wrapper: forwards name and operation to
os_file_handle_error_cond_exit() with should_exit = FALSE.
623
@return TRUE if we should retry the operation */
626
os_file_handle_error_no_exit(
627
/*=========================*/
628
const char* name, /*!< in: name of a file or NULL */
629
const char* operation)/*!< in: operation */
631
/* don't exit in case of unknown error */
632
return(os_file_handle_error_cond_exit(name, operation, FALSE));
636
#define USE_FILE_LOCK
637
#if defined(UNIV_HOTBACKUP) || defined(__WIN__)
638
/* InnoDB Hot Backup does not lock the data files.
639
* On Windows, mandatory locking is used.
641
# undef USE_FILE_LOCK
644
/****************************************************************//**
645
Obtain an exclusive lock on a file.
The lock is requested with fcntl(F_SETLK); on failure a message naming the
file and errno is printed, with an extra hint when errno is EAGAIN or EACCES.
646
@return 0 on success */
651
int fd, /*!< in: file descriptor */
652
const char* name) /*!< in: file name */
656
/* Lock range starts at offset 0 of the file; per POSIX fcntl(), an l_len
of 0 extends the range to the end of the file. */
lk.l_whence = SEEK_SET;
657
lk.l_start = lk.l_len = 0;
658
/* F_SETLK does not wait: it returns -1 if the lock cannot be granted. */
if (fcntl(fd, F_SETLK, &lk) == -1) {
660
"InnoDB: Unable to lock %s, error: %d\n", name, errno);
662
/* NOTE(review): errno is read again here after the message above has been
printed; stdio calls may modify errno, so saving it right after fcntl()
would be safer. */
if (errno == EAGAIN || errno == EACCES) {
664
"InnoDB: Check that you do not already have"
665
" another drizzled process\n"
666
"InnoDB: using the same InnoDB data"
675
#endif /* USE_FILE_LOCK */
677
#ifndef UNIV_HOTBACKUP
678
/****************************************************************//**
678
Creates the seek mutexes used in positioned reads and writes.
Also creates os_file_count_mutex. One seek mutex is created for each of the
OS_FILE_N_SEEK_MUTEXES entries of os_file_seek_mutexes[]. */
682
os_io_init_simple(void)
683
/*===================*/
687
os_file_count_mutex = os_mutex_create();
689
/* Fill every slot of the seek-mutex array. */
for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
690
os_file_seek_mutexes[i] = os_mutex_create();
694
/***********************************************************************//**
695
Creates a temporary file. This function is like tmpfile(3), but
696
the temporary file is created in the MySQL temporary directory.
The descriptor is obtained from innobase_mysql_tmpfile() and wrapped in a
stdio stream with fdopen(); a timestamped message including errno is printed
when the file cannot be created.
697
@return temporary file handle, or NULL on error */
700
os_file_create_tmpfile(void)
701
/*========================*/
704
/* The descriptor for the new temporary file comes from this helper. */
int fd = innobase_mysql_tmpfile();
707
/* "w+b": stream opened for binary update (read and write). */
file = fdopen(fd, "w+b");
711
ut_print_timestamp(stderr);
713
" InnoDB: Error: unable to create temporary file;"
714
" errno: %d\n", errno);
722
#endif /* !UNIV_HOTBACKUP */
724
/***********************************************************************//**
725
The os_file_opendir() function opens a directory stream corresponding to the
726
directory named by the dirname argument. The directory stream is positioned
727
at the first entry. In both Unix and Windows we automatically skip the '.'
728
and '..' items at the start of the directory listing.
729
@return directory stream, NULL if error */
734
const char* dirname, /*!< in: directory name; it must not
735
contain a trailing '\' or '/' */
736
ibool error_is_fatal) /*!< in: TRUE if we should treat an
737
error as a fatal error; if we try to
738
open symlinks then we do not wish a
739
fatal error if it happens not to be
744
LPWIN32_FIND_DATA lpFindFileData;
745
char path[OS_FILE_MAX_PATH + 3];
747
ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
749
strcpy(path, dirname);
750
strcpy(path + strlen(path), "\\*");
752
/* Note that in Windows opening the 'directory stream' also retrieves
753
the first entry in the directory. Since it is '.', that is no problem,
754
as we will skip over the '.' and '..' entries anyway. */
756
lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
758
dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
760
ut_free(lpFindFileData);
762
if (dir == INVALID_HANDLE_VALUE) {
764
if (error_is_fatal) {
765
os_file_handle_error(dirname, "opendir");
773
dir = opendir(dirname);
775
if (dir == NULL && error_is_fatal) {
776
os_file_handle_error(dirname, "opendir");
783
/***********************************************************************//**
784
Closes a directory stream.
785
@return 0 if success, -1 if failure */
790
os_file_dir_t dir) /*!< in: directory stream */
795
ret = FindClose(dir);
798
os_file_handle_error_no_exit(NULL, "closedir");
810
os_file_handle_error_no_exit(NULL, "closedir");
817
/***********************************************************************//**
818
This function returns information of the next file in the directory. We jump
819
over the '.' and '..' entries in the directory.
For the entry returned, info->name, info->size and info->type are filled in.
Two variants are visible below: one based on FindNextFile() and one based on
readdir_r() followed by stat() on "dirname/entry".
820
@return 0 if ok, -1 if error, 1 if at the end of the directory */
823
os_file_readdir_next_file(
824
/*======================*/
825
const char* dirname,/*!< in: directory name or path */
826
os_file_dir_t dir, /*!< in: directory stream */
827
os_file_stat_t* info) /*!< in/out: buffer where the info is returned */
830
/* FindNextFile() variant */
LPWIN32_FIND_DATA lpFindFileData;
833
lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
835
ret = FindNextFile(dir, lpFindFileData);
838
ut_a(strlen((char *) lpFindFileData->cFileName)
841
if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
842
|| strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
847
strcpy(info->name, (char *) lpFindFileData->cFileName);
849
/* The size is assembled from the low and high parts reported by
FindNextFile(). */
info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
850
+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
853
if (lpFindFileData->dwFileAttributes
854
& FILE_ATTRIBUTE_REPARSE_POINT) {
855
/* TODO: test Windows symlinks */
856
/* TODO: MySQL has apparently its own symlink
857
implementation in Windows, dbname.sym can
858
redirect a database directory:
859
REFMAN "windows-symbolic-links.html" */
860
info->type = OS_FILE_TYPE_LINK;
861
} else if (lpFindFileData->dwFileAttributes
862
& FILE_ATTRIBUTE_DIRECTORY) {
863
info->type = OS_FILE_TYPE_DIR;
865
/* It is probably safest to assume that all other
866
file types are normal. Better to check them rather
867
than blindly skip them. */
869
info->type = OS_FILE_TYPE_FILE;
873
ut_free(lpFindFileData);
877
} else if (GetLastError() == ERROR_NO_MORE_FILES) {
881
os_file_handle_error_no_exit(dirname,
882
"readdir_next_file");
889
/* readdir_r()/stat() variant */
struct stat statinfo;
890
#ifdef HAVE_READDIR_R
891
char dirent_buf[sizeof(struct dirent)
892
+ _POSIX_PATH_MAX + 100];
893
/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
894
the max file name len; but in most standards, the
895
length is NAME_MAX; we add 100 to be even safer */
900
#ifdef HAVE_READDIR_R
901
ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
905
/* On AIX, only if we got non-NULL 'ent' (result) value and
906
a non-zero 'ret' (return) value, it indicates a failed
907
readdir_r() call. A NULL 'ent' with a non-zero 'ret'
908
would indicate the "end of the directory" is reached. */
913
"InnoDB: cannot read directory %s, error %lu\n",
914
dirname, (ulong)ret);
920
/* End of directory */
925
ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
934
ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
936
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
941
strcpy(info->name, ent->d_name);
943
/* Room for dirname, '/', the entry name and the terminator, with slack. */
full_path = static_cast<char* >(ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
945
sprintf(full_path, "%s/%s", dirname, ent->d_name);
947
/* NOTE(review): stat() follows symbolic links, so the S_ISLNK() branch
further down cannot match for a path examined this way; lstat() would be
needed to report OS_FILE_TYPE_LINK. Confirm which is intended. */
ret = stat(full_path, &statinfo);
951
if (errno == ENOENT) {
952
/* readdir() returned a file that does not exist,
953
it must have been deleted in the meantime. Do what
954
would have happened if the file was deleted before
955
readdir() - ignore and go to the next entry.
956
If this is the last entry then info->name will still
957
contain the name of the deleted file when this
958
function returns, but this is not an issue since the
959
caller shouldn't be looking at info when end of
960
directory is returned. */
967
os_file_handle_error_no_exit(full_path, "stat");
974
info->size = (ib_int64_t)statinfo.st_size;
976
/* Classify the entry from the st_mode bits. */
if (S_ISDIR(statinfo.st_mode)) {
977
info->type = OS_FILE_TYPE_DIR;
978
} else if (S_ISLNK(statinfo.st_mode)) {
979
info->type = OS_FILE_TYPE_LINK;
980
} else if (S_ISREG(statinfo.st_mode)) {
981
info->type = OS_FILE_TYPE_FILE;
983
info->type = OS_FILE_TYPE_UNKNOWN;
992
/*****************************************************************//**
993
This function attempts to create a directory named pathname. The new directory
994
gets default permissions. On Unix the permissions are (0770 & ~umask). If the
995
directory exists already, nothing is done and the call succeeds, unless the
996
fail_if_exists argument is true.
Any other failure is passed to os_file_handle_error() together with the name
of the call that failed ("CreateDirectory" or "mkdir").
997
@return TRUE if call succeeds, FALSE on error */
1000
os_file_create_directory(
1001
/*=====================*/
1002
const char* pathname, /*!< in: directory name as
1003
null-terminated string */
1004
ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory
1005
is treated as an error. */
1010
/* CreateDirectory() variant; NULL is passed as the second argument. */
rcode = CreateDirectory((LPCTSTR) pathname, NULL);
1012
|| (GetLastError() == ERROR_ALREADY_EXISTS
1013
&& !fail_if_exists))) {
1015
os_file_handle_error(pathname, "CreateDirectory");
1024
/* mkdir() variant; mode 0770 is requested. */
rcode = mkdir(pathname, 0770);
1026
/* An already existing directory (EEXIST) is acceptable unless the caller
set fail_if_exists. */
if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
1028
os_file_handle_error(pathname, "mkdir");
1037
/****************************************************************//**
1038
NOTE! Use the corresponding macro os_file_create_simple(), not this function
directly.
1040
A simple function to open or create a file.
Two variants are visible below: one that opens the file with CreateFile() and
one that uses open(). In both, OS_FILE_CREATE_PATH first calls
os_file_create_subdirs_if_needed() and is then handled as OS_FILE_CREATE.
1041
@return own: handle to the file, not defined if error, error number
1042
can be retrieved with os_file_get_last_error */
1045
os_file_create_simple_func(
1046
/*=======================*/
1047
const char* name, /*!< in: name of the file or path as a
1048
null-terminated string */
1049
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is
1050
opened (if does not exist, error), or
1051
OS_FILE_CREATE if a new file is created
1052
(if exists, error), or
1053
OS_FILE_CREATE_PATH if new file
1054
(if exists, error) and subdirectories along
1055
its path are created (if needed)*/
1056
ulint access_type,/*!< in: OS_FILE_READ_ONLY or
1057
OS_FILE_READ_WRITE */
1058
ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1064
/* CreateFile() variant */
DWORD attributes = 0;
1070
/* Map create_mode to the CreateFile() creation disposition. */
if (create_mode == OS_FILE_OPEN) {
1071
create_flag = OPEN_EXISTING;
1072
} else if (create_mode == OS_FILE_CREATE) {
1073
create_flag = CREATE_NEW;
1074
} else if (create_mode == OS_FILE_CREATE_PATH) {
1075
/* create subdirs along the path if needed */
1076
*success = os_file_create_subdirs_if_needed(name);
1080
create_flag = CREATE_NEW;
1081
/* From here on the request is treated as a plain OS_FILE_CREATE. */
create_mode = OS_FILE_CREATE;
1087
if (access_type == OS_FILE_READ_ONLY) {
1088
access = GENERIC_READ;
1089
} else if (access_type == OS_FILE_READ_WRITE) {
1090
access = GENERIC_READ | GENERIC_WRITE;
1096
file = CreateFile((LPCTSTR) name,
1098
FILE_SHARE_READ | FILE_SHARE_WRITE,
1099
/* file can be read and written also
1100
by other processes */
1101
NULL, /* default security attributes */
1104
NULL); /*!< no template file */
1106
if (file == INVALID_HANDLE_VALUE) {
1109
retry = os_file_handle_error(name,
1110
create_mode == OS_FILE_OPEN ?
1128
/* open() variant: map create_mode and access_type to open() flags. */
if (create_mode == OS_FILE_OPEN) {
1129
if (access_type == OS_FILE_READ_ONLY) {
1130
create_flag = O_RDONLY;
1132
create_flag = O_RDWR;
1134
} else if (create_mode == OS_FILE_CREATE) {
1135
create_flag = O_RDWR | O_CREAT | O_EXCL;
1136
} else if (create_mode == OS_FILE_CREATE_PATH) {
1137
/* create subdirs along the path if needed */
1138
*success = os_file_create_subdirs_if_needed(name);
1142
create_flag = O_RDWR | O_CREAT | O_EXCL;
1143
/* From here on the request is treated as a plain OS_FILE_CREATE. */
create_mode = OS_FILE_CREATE;
1149
/* A mode argument is passed to open() only when creating: read and write
permission for user and group. */
if (create_mode == OS_FILE_CREATE) {
1150
file = open(name, create_flag, S_IRUSR | S_IWUSR
1151
| S_IRGRP | S_IWGRP);
1153
file = open(name, create_flag);
1159
retry = os_file_handle_error(name,
1160
create_mode == OS_FILE_OPEN ?
1165
#ifdef USE_FILE_LOCK
1166
/* A read-write open also takes the file lock; this branch is entered when
os_file_lock() returns non-zero. */
} else if (access_type == OS_FILE_READ_WRITE
1167
&& os_file_lock(file, name)) {
1177
#endif /* __WIN__ */
1180
/****************************************************************//**
1181
NOTE! Use the corresponding macro
1182
os_file_create_simple_no_error_handling(), not directly this function!
1183
A simple function to open or create a file.
1184
@return own: handle to the file, not defined if error, error number
1185
can be retrieved with os_file_get_last_error */
1188
os_file_create_simple_no_error_handling_func(
1189
/*=========================================*/
1190
const char* name, /*!< in: name of the file or path as a
1191
null-terminated string */
1192
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
1193
is opened (if does not exist, error), or
1194
OS_FILE_CREATE if a new file is created
1195
(if exists, error) */
1196
ulint access_type,/*!< in: OS_FILE_READ_ONLY,
1197
OS_FILE_READ_WRITE, or
1198
OS_FILE_READ_ALLOW_DELETE; the last option is
1199
used by a backup program reading the file */
1200
ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1206
DWORD attributes = 0;
1207
DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
1211
if (create_mode == OS_FILE_OPEN) {
1212
create_flag = OPEN_EXISTING;
1213
} else if (create_mode == OS_FILE_CREATE) {
1214
create_flag = CREATE_NEW;
1220
if (access_type == OS_FILE_READ_ONLY) {
1221
access = GENERIC_READ;
1222
} else if (access_type == OS_FILE_READ_WRITE) {
1223
access = GENERIC_READ | GENERIC_WRITE;
1224
} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1225
access = GENERIC_READ;
1226
share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
1227
| FILE_SHARE_WRITE; /*!< A backup program has to give
1228
mysqld the maximum freedom to
1229
do what it likes with the
1236
file = CreateFile((LPCTSTR) name,
1239
NULL, /* default security attributes */
1242
NULL); /*!< no template file */
1244
if (file == INVALID_HANDLE_VALUE) {
1257
if (create_mode == OS_FILE_OPEN) {
1258
if (access_type == OS_FILE_READ_ONLY) {
1259
create_flag = O_RDONLY;
1261
create_flag = O_RDWR;
1263
} else if (create_mode == OS_FILE_CREATE) {
1264
create_flag = O_RDWR | O_CREAT | O_EXCL;
1270
if (create_mode == OS_FILE_CREATE) {
1271
file = open(name, create_flag, S_IRUSR | S_IWUSR
1272
| S_IRGRP | S_IWGRP);
1274
file = open(name, create_flag);
1279
#ifdef USE_FILE_LOCK
1280
} else if (access_type == OS_FILE_READ_WRITE
1281
&& os_file_lock(file, name)) {
1291
#endif /* __WIN__ */
1294
/****************************************************************//**
1295
Tries to disable OS caching on an opened file descriptor.
Uses directio(fd, DIRECTIO_ON) when UNIV_SOLARIS and DIRECTIO_ON are both
defined, otherwise fcntl(fd, F_SETFL, O_DIRECT) when O_DIRECT is defined.
A failure is only reported ("continuing anyway" in the message text). */
1298
os_file_set_nocache(
1299
/*================*/
1300
int fd, /*!< in: file descriptor to alter */
1301
const char* file_name, /*!< in: used in the diagnostic message */
1302
const char* operation_name)
1303
/*!< in: "open" or "create"; used in the
1304
diagnostic message */
1306
/* some versions of Solaris may not have DIRECTIO_ON */
1307
#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1308
if (directio(fd, DIRECTIO_ON) == -1) {
1310
/* Save errno before the printing calls below can change it. */
errno_save = (int)errno;
1311
ut_print_timestamp(stderr);
1313
" InnoDB: Failed to set DIRECTIO_ON "
1314
"on file %s: %s: %s, continuing anyway\n",
1315
file_name, operation_name, strerror(errno_save));
1317
#elif defined(O_DIRECT)
1318
/* NOTE(review): F_SETFL sets the descriptor's file status flags to the
value given; O_DIRECT is passed alone here rather than OR-ed into the
result of F_GETFL. Confirm that no other status flag needs to be kept. */
if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1320
/* Save errno before the printing calls below can change it. */
errno_save = (int)errno;
1321
ut_print_timestamp(stderr);
1323
" InnoDB: Failed to set O_DIRECT "
1324
"on file %s: %s: %s, continuing anyway\n",
1325
file_name, operation_name, strerror(errno_save));
1326
/* EINVAL gets an additional hint about tmpfs. */
if (errno_save == EINVAL) {
1327
ut_print_timestamp(stderr);
1329
" InnoDB: O_DIRECT is known to result in "
1330
"'Invalid argument' on Linux on tmpfs, "
1331
"see MySQL Bug#26662\n");
1334
#else /* Required for OSX */
1337
/* No direct-I/O mechanism in this branch; the cast marks the parameter as
deliberately unused. */
(void)operation_name;
1341
/****************************************************************//**
NOTE! Use the corresponding macro os_file_create(), not this function
directly!
Opens an existing file or creates a new one.
The visible code has a Windows path built on CreateFile() and a
non-Windows path built on open(); on failure both hand the error to
os_file_handle_error() or, when srv_file_per_table is set, to
os_file_handle_error_no_exit().
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
1349
os_file_create_func(
1350
/*================*/
1351
const char* name, /*!< in: name of the file or path as a
1352
null-terminated string */
1353
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
1354
is opened (if does not exist, error), or
1355
OS_FILE_CREATE if a new file is created
1357
OS_FILE_OVERWRITE if a new file is created
1358
or an old overwritten;
1359
OS_FILE_OPEN_RAW, if a raw device or disk
1360
partition should be opened */
1361
ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
1362
non-buffered i/o is desired,
1363
OS_FILE_NORMAL, if any normal file;
1364
NOTE that it also depends on type, os_aio_..
1365
and srv_.. variables whether we really use
1366
async i/o or unbuffered i/o: look in the
1367
function source code for the exact rules */
1368
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
1369
ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1373
/* Windows code path (DWORD / CreateFile): by default other processes
are only granted read sharing while the file is open. */
DWORD share_mode = FILE_SHARE_READ;
1380
/* Map create_mode onto a CreateFile() creation disposition. Raw
devices additionally switch the share mode to FILE_SHARE_WRITE. */
if (create_mode == OS_FILE_OPEN_RAW) {
1381
create_flag = OPEN_EXISTING;
1382
share_mode = FILE_SHARE_WRITE;
1383
} else if (create_mode == OS_FILE_OPEN
1384
|| create_mode == OS_FILE_OPEN_RETRY) {
1385
create_flag = OPEN_EXISTING;
1386
} else if (create_mode == OS_FILE_CREATE) {
1387
create_flag = CREATE_NEW;
1388
} else if (create_mode == OS_FILE_OVERWRITE) {
1389
create_flag = CREATE_ALWAYS;
1395
if (purpose == OS_FILE_AIO) {
1396
/* If specified, use asynchronous (overlapped) io and no
1397
buffering of writes in the OS */
1400
if (srv_use_native_aio) {
1401
attributes = attributes | FILE_FLAG_OVERLAPPED;
1404
#ifdef UNIV_NON_BUFFERED_IO
1405
# ifndef UNIV_HOTBACKUP
1406
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1407
/* Do not use unbuffered i/o to log files because
1408
value 2 denotes that we do not flush the log at every
1409
commit, but only once per second */
1410
} else if (srv_win_file_flush_method
1411
== SRV_WIN_IO_UNBUFFERED) {
1412
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1414
# else /* !UNIV_HOTBACKUP */
1415
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1416
# endif /* !UNIV_HOTBACKUP */
1417
#endif /* UNIV_NON_BUFFERED_IO */
1418
} else if (purpose == OS_FILE_NORMAL) {
1420
#ifdef UNIV_NON_BUFFERED_IO
1421
# ifndef UNIV_HOTBACKUP
1422
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1423
/* Do not use unbuffered i/o to log files because
1424
value 2 denotes that we do not flush the log at every
1425
commit, but only once per second */
1426
} else if (srv_win_file_flush_method
1427
== SRV_WIN_IO_UNBUFFERED) {
1428
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1430
# else /* !UNIV_HOTBACKUP */
1431
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1432
# endif /* !UNIV_HOTBACKUP */
1433
#endif /* UNIV_NON_BUFFERED_IO */
1439
file = CreateFile((LPCTSTR) name,
1440
GENERIC_READ | GENERIC_WRITE, /* read and write
1442
share_mode, /* File can be read also by other
1443
processes; we must give the read
1444
permission because of ibbackup. We do
1445
not give the write permission to
1446
others because if one would succeed to
1447
start 2 instances of mysqld on the
1448
SAME files, that could cause severe
1449
database corruption! When opening
1450
raw disk partitions, Microsoft manuals
1451
say that we must give also the write
1453
NULL, /* default security attributes */
1456
NULL); /*!< no template file */
1458
if (file == INVALID_HANDLE_VALUE) {
1461
/* When srv_file_per_table is on, file creation failure may not
1462
be critical to the whole instance. Do not crash the server in
1463
case of unknown errors.
1464
Please note "srv_file_per_table" is a global variable with
1465
no explicit synchronization protection. It could be
1466
changed during this execution path. It might not have the
1467
same value as the one when building the table definition */
1468
if (srv_file_per_table) {
1469
retry = os_file_handle_error_no_exit(name,
1470
create_mode == OS_FILE_CREATE ?
1473
retry = os_file_handle_error(name,
1474
create_mode == OS_FILE_CREATE ?
1490
/* Non-Windows code path. mode_str is handed to os_file_set_nocache()
below as the operation name for its diagnostic message. */
const char* mode_str = NULL;
1495
/* Map create_mode onto open() flags. */
if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
1496
|| create_mode == OS_FILE_OPEN_RETRY) {
1498
create_flag = O_RDWR;
1499
} else if (create_mode == OS_FILE_CREATE) {
1500
mode_str = "CREATE";
1501
create_flag = O_RDWR | O_CREAT | O_EXCL;
1502
} else if (create_mode == OS_FILE_OVERWRITE) {
1503
mode_str = "OVERWRITE";
1504
create_flag = O_RDWR | O_CREAT | O_TRUNC;
1510
ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
1511
ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
1514
/* We let O_SYNC only affect log files; note that we map O_DSYNC to
1515
O_SYNC because the datasync options seemed to corrupt files in 2001
1516
in both Linux and Solaris */
1517
if (type == OS_LOG_FILE
1518
&& srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1521
fprintf(stderr, "Using O_SYNC for file %s\n", name);
1524
create_flag = create_flag | O_SYNC;
1528
/* The third argument supplies the permission bits used when this call
creates the file. */
file = open(name, create_flag, os_innodb_umask);
1533
/* When srv_file_per_table is on, file creation failure may not
1534
be critical to the whole instance. Do not crash the server in
1535
case of unknown errors.
1536
Please note "srv_file_per_table" is a global variable with
1537
no explicit synchronization protection. It could be
1538
changed during this execution path. It might not have the
1539
same value as the one when building the table definition */
1540
if (srv_file_per_table) {
1541
retry = os_file_handle_error_no_exit(name,
1542
create_mode == OS_FILE_CREATE ?
1545
retry = os_file_handle_error(name,
1546
create_mode == OS_FILE_CREATE ?
1553
return(file /* -1 */);
1560
/* We disable OS caching (O_DIRECT) only on data files */
1561
if (type != OS_LOG_FILE
1562
&& srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
1564
os_file_set_nocache(file, name, mode_str);
1567
#ifdef USE_FILE_LOCK
1568
/* Raw devices are not locked. A non-zero result from os_file_lock()
is handled as a failure to obtain the lock. */
if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
1570
if (create_mode == OS_FILE_OPEN_RETRY) {
1572
ut_print_timestamp(stderr);
1573
fputs(" InnoDB: Retrying to lock"
1574
" the first data file\n",
1576
/* Up to 100 further attempts, sleeping 1000000 between them (one
second, per the sleep comments elsewhere in this file). */
for (i = 0; i < 100; i++) {
1577
os_thread_sleep(1000000);
1578
if (!os_file_lock(file, name)) {
1583
ut_print_timestamp(stderr);
1584
fputs(" InnoDB: Unable to open the first data file\n",
1592
#endif /* USE_FILE_LOCK */
1595
#endif /* __WIN__ */
1598
/***********************************************************************//**
Deletes a file if it exists. The file has to be closed before calling this.
A file that is already absent is not treated as an error
(ERROR_FILE_NOT_FOUND from DeleteFile(), ENOENT in the errno-based check).
@return TRUE if success */
1603
os_file_delete_if_exists(
1604
/*=====================*/
1605
const char* name) /*!< in: file path as a null-terminated string */
1611
/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1614
ret = DeleteFile((LPCTSTR)name);
1620
if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1621
/* the file does not exist, this is not an error */
1628
/* When count exceeds 100, warn each time count is a multiple of 10. */
if (count > 100 && 0 == (count % 10)) {
1630
"InnoDB: Warning: cannot delete file %s\n"
1631
"InnoDB: Are you running ibbackup"
1632
" to back up the file?\n", name);
1634
os_file_get_last_error(TRUE); /* print error information */
1637
os_thread_sleep(1000000); /* sleep for a second */
1650
/* errno-based check (presumably the non-Windows path): ENOENT means the
file was already gone, which this function accepts. */
if (ret != 0 && errno != ENOENT) {
1651
os_file_handle_error_no_exit(name, "delete");
1660
/***********************************************************************//**
Deletes a file. The file has to be closed before calling this.
On Windows the visible code calls DeleteFile(), warns that ibbackup may be
holding the file once count exceeds 100 (on every tenth count), and sleeps
one second; the surrounding retry structure is not visible here.
@return TRUE if success */
1667
const char* name) /*!< in: file path as a null-terminated string */
1673
/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1676
ret = DeleteFile((LPCTSTR)name);
1682
if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1683
/* If the file does not exist, we classify this as a 'mild'
1691
if (count > 100 && 0 == (count % 10)) {
1693
"InnoDB: Warning: cannot delete file %s\n"
1694
"InnoDB: Are you running ibbackup"
1695
" to back up the file?\n", name);
1697
os_file_get_last_error(TRUE); /* print error information */
1700
os_thread_sleep(1000000); /* sleep for a second */
1714
/* Failure report for the delete, tagged with the operation name
"delete" (presumably the non-Windows path -- TODO confirm). */
os_file_handle_error_no_exit(name, "delete");
1723
/***********************************************************************//**
NOTE! Use the corresponding macro os_file_rename(), not this function
directly!
Renames a file (can also move it to another directory). It is safest that the
file is closed before calling this function.
Failures are reported through os_file_handle_error_no_exit() using the
old path as the file name.
@return TRUE if success */
1730
os_file_rename_func(
1731
/*================*/
1732
const char* oldpath,/*!< in: old file path as a null-terminated
1734
const char* newpath)/*!< in: new file path */
1739
/* Windows: MoveFile() performs the rename. */
ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
1745
os_file_handle_error_no_exit(oldpath, "rename");
1751
/* Other platforms: rename(). */
ret = rename(oldpath, newpath);
1754
os_file_handle_error_no_exit(oldpath, "rename");
1763
/***********************************************************************//**
NOTE! Use the corresponding macro os_file_close(), not this function
directly!
Closes a file handle. In case of error, error number can be retrieved with
os_file_get_last_error.
@return TRUE if success */
1772
os_file_t file) /*!< in, own: handle to a file */
1779
/* Windows: release the handle with CloseHandle(). */
ret = CloseHandle(file);
1785
/* Only a handle is available here, so no file name (NULL) is passed to
the error handler. */
os_file_handle_error(NULL, "close");
1794
os_file_handle_error(NULL, "close");
1803
#ifdef UNIV_HOTBACKUP
1804
/***********************************************************************//**
Closes a file handle. Compiled only when UNIV_HOTBACKUP is defined. No
call to an error handler appears in the code visible here.
@return TRUE if success */
1809
os_file_close_no_error_handling(
1810
/*============================*/
1811
os_file_t file) /*!< in, own: handle to a file */
1818
/* Windows: release the handle with CloseHandle(). */
ret = CloseHandle(file);
1838
#endif /* UNIV_HOTBACKUP */
1840
/***********************************************************************//**
Gets the size of a file, returned as two 32-bit halves.
@return TRUE if success */
1847
os_file_t file, /*!< in: handle to a file */
1848
ulint* size, /*!< out: least significant 32 bits of file
1850
ulint* size_high)/*!< out: most significant 32 bits of size */
1856
/* Windows: GetFileSize() returns the low half and stores the high half
through its second argument. */
low = GetFileSize(file, &high);
1858
/* 0xFFFFFFFF can also be a legitimate low half, so GetLastError() is
consulted to tell a real failure apart. */
if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
1869
/* Other platforms: seeking to the end yields the size as the resulting
offset. Note that this moves the descriptor's file position. */
offs = lseek(file, 0, SEEK_END);
1871
if (offs == ((off_t)-1)) {
1876
/* With an off_t wider than 4 bytes the offset is split into two
halves; otherwise the whole value goes into *size. */
if (sizeof(off_t) > 4) {
1877
*size = (ulint)(offs & 0xFFFFFFFFUL);
1878
*size_high = (ulint)(offs >> 32);
1880
*size = (ulint) offs;
1888
/***********************************************************************//**
Gets file size as a 64-bit integer ib_int64_t.
Thin wrapper: obtains the two 32-bit halves from os_file_get_size() and
joins them.
@return size in bytes, -1 if error */
1893
os_file_get_size_as_iblonglong(
1894
/*===========================*/
1895
os_file_t file) /*!< in: handle to a file */
1901
success = os_file_get_size(file, &size, &size_high);
1908
/* Recombine: high half shifted up by 32 bits plus the low half. */
return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size);
1911
/***********************************************************************//**
Write the specified number of zeros to a newly created file.
The file is filled by writing a zeroed, page-aligned buffer with
os_file_write() until current_size reaches the desired size, then
os_file_flush() is called. For targets of at least 100 MB, progress is
printed to stderr.
@return TRUE if success */
1918
const char* name, /*!< in: name of the file or path as a
1919
null-terminated string */
1920
os_file_t file, /*!< in: handle to a file */
1921
ulint size, /*!< in: least significant 32 bits of file
1923
ulint size_high)/*!< in: most significant 32 bits of size */
1925
ib_int64_t current_size;
1926
ib_int64_t desired_size;
1932
/* size must fit in 32 bits; the upper half travels in size_high. */
ut_a(size == (size & 0xFFFFFFFF));
1935
desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
1937
/* Write up to 1 megabyte at a time. */
1938
buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
1940
/* Over-allocate by one page so that the aligned pointer derived below
still has buf_size usable bytes. */
buf2 = static_cast<unsigned char *>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
1942
/* Align the buffer for possible raw i/o */
1943
buf = static_cast<unsigned char *>(ut_align(buf2, UNIV_PAGE_SIZE));
1945
/* Write buffer full of zeros */
1946
memset(buf, 0, buf_size);
1948
if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1950
fprintf(stderr, "InnoDB: Progress in MB:");
1953
while (current_size < desired_size) {
1956
/* The final chunk may be shorter than the buffer. */
if (desired_size - current_size < (ib_int64_t) buf_size) {
1957
n_bytes = (ulint) (desired_size - current_size);
1962
/* The 64-bit position is passed as separate low and high halves. */
ret = os_file_write(name, file, buf,
1963
(ulint)(current_size & 0xFFFFFFFF),
1964
(ulint)(current_size >> 32),
1968
goto error_handling;
1971
/* Print about progress for each 100 MB written */
1972
if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
1973
!= current_size / (ib_int64_t)(100 * 1024 * 1024)) {
1975
/* The quotient counts 100 MB units; the literal "00" in the format
turns it into a megabyte figure. */
fprintf(stderr, " %lu00",
1976
(ulong) ((current_size + n_bytes)
1977
/ (ib_int64_t)(100 * 1024 * 1024)));
1980
current_size += n_bytes;
1983
if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1985
fprintf(stderr, "\n");
1990
ret = os_file_flush(file);
2000
/***********************************************************************//**
Truncates a file at its current position.
@return TRUE if success */
2007
FILE* file) /*!< in: file to be truncated */
2010
/* Windows: translate the stream to its OS handle and set the end of
file at that handle's current position. */
HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
2011
return(SetEndOfFile(h));
2013
/* ftruncate() returns 0 on success, hence the negation. The length is
the stream position reported by ftell(). */
return(!ftruncate(fileno(file), ftell(file)));
2014
#endif /* __WIN__ */
2018
/***********************************************************************//**
Wrapper to fsync(2) that retries the call on some errors.
Returns the value 0 if successful; otherwise the value -1 is returned and
the global variable errno is set to indicate the error.
In the code visible here, ENOLCK is the error that leads to a retry, with
a 0.2 second sleep before the next attempt.
@return 0 if success, -1 otherwise */
2028
os_file_t file) /*!< in: handle to a file */
2041
if (ret == -1 && errno == ENOLCK) {
2043
/* Report only when the failure count is a multiple of 100. */
if (failures % 100 == 0) {
2045
ut_print_timestamp(stderr);
2047
" InnoDB: fsync(): "
2048
"No locks available; retrying\n");
2051
os_thread_sleep(200000 /* 0.2 sec */);
2064
#endif /* !__WIN__ */
2066
/***********************************************************************//**
NOTE! Use the corresponding macro os_file_flush(), not directly this function!
Flushes the write buffers of a given file to the disk.
Windows uses FlushFileBuffers(); elsewhere os_file_fsync() is used, or
fcntl(F_FULLFSYNC) under HAVE_DARWIN_THREADS when srv_have_fullfsync is
set. The raw-disk error codes (ERROR_INVALID_FUNCTION, EINVAL) are checked
together with srv_start_raw_disk_in_use.
@return TRUE if success */
2074
os_file_t file) /*!< in, own: handle to a file */
2083
/* Windows: FlushFileBuffers() does the flush. */
ret = FlushFileBuffers(file);
2089
/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
2090
actually a raw device, we choose to ignore that error if we are using
2093
if (srv_start_raw_disk_in_use && GetLastError()
2094
== ERROR_INVALID_FUNCTION) {
2098
os_file_handle_error(NULL, "flush");
2100
/* It is a fatal error if a file flush does not succeed, because then
2101
the database can get corrupt on disk */
2108
#if defined(HAVE_DARWIN_THREADS)
2109
# ifndef F_FULLFSYNC
2110
/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
2111
# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
2112
# elif F_FULLFSYNC != 51
2113
# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
2115
/* Apple has disabled fsync() for internal disk drives in OS X. That
2116
caused corruption for a user when he tested a power outage. Let us in
2117
OS X use a nonstandard flush method recommended by an Apple
2120
if (!srv_have_fullfsync) {
2121
/* If we are not on an operating system that supports this,
2122
then fall back to a plain fsync. */
2124
ret = os_file_fsync(file);
2126
/* F_FULLFSYNC: fsync plus a request that the drive flush to the media,
as noted at its #define above. */
ret = fcntl(file, F_FULLFSYNC, NULL);
2129
/* If we are not on a file system that supports this,
2130
then fall back to a plain fsync. */
2131
ret = os_file_fsync(file);
2135
/* Platforms without the Darwin special case: plain os_file_fsync(). */
ret = os_file_fsync(file);
2142
/* Since Linux returns EINVAL if the 'file' is actually a raw device,
2143
we choose to ignore that error if we are using raw disks */
2145
if (srv_start_raw_disk_in_use && errno == EINVAL) {
2150
ut_print_timestamp(stderr);
2153
" InnoDB: Error: the OS said file flush did not succeed\n");
2155
os_file_handle_error(NULL, "flush");
2157
/* It is a fatal error if a file flush does not succeed, because then
2158
the database can get corrupt on disk */
2166
/*******************************************************************//**
Does a synchronous read operation in Posix.
Two implementations are visible: a single pread() call when HAVE_PREAD is
defined and HAVE_BROKEN_PREAD is not, and an lseek() followed by read()
performed while holding one of the os_file_seek_mutexes. Both keep
os_n_pending_reads updated under os_file_count_mutex.
@return number of bytes read, -1 if error */
2173
os_file_t file, /*!< in: handle to a file */
2174
void* buf, /*!< in: buffer where to read */
2175
ulint n, /*!< in: number of bytes to read */
2176
ulint offset, /*!< in: least significant 32 bits of file
2177
offset from where to read */
2178
ulint offset_high) /*!< in: most significant 32 bits of
2182
#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2184
#endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
2186
/* offset carries only the low 32 bits; anything above belongs in
offset_high. */
ut_a((offset & 0xFFFFFFFFUL) == offset);
2188
/* If off_t is > 4 bytes in size, then we assume we can pass a
2191
if (sizeof(off_t) > 4) {
2192
offs = (off_t)offset + (((off_t)offset_high) << 32);
2195
offs = (off_t)offset;
2197
if (offset_high > 0) {
2199
"InnoDB: Error: file read at offset > 4 GB\n");
2205
#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2206
os_mutex_enter(os_file_count_mutex);
2207
os_file_n_pending_preads++;
2208
os_n_pending_reads++;
2209
os_mutex_exit(os_file_count_mutex);
2211
n_bytes = pread(file, buf, (ssize_t)n, offs);
2213
os_mutex_enter(os_file_count_mutex);
2214
os_file_n_pending_preads--;
2215
os_n_pending_reads--;
2216
os_mutex_exit(os_file_count_mutex);
2223
#ifndef UNIV_HOTBACKUP
2225
#endif /* !UNIV_HOTBACKUP */
2227
/* Fallback path: count the read as pending for its whole duration. */
os_mutex_enter(os_file_count_mutex);
2228
os_n_pending_reads++;
2229
os_mutex_exit(os_file_count_mutex);
2231
#ifndef UNIV_HOTBACKUP
2232
/* Protect the seek / read operation with a mutex */
2233
/* The mutex is selected by the descriptor value modulo the array size. */
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2235
os_mutex_enter(os_file_seek_mutexes[i]);
2236
#endif /* !UNIV_HOTBACKUP */
2238
ret_offset = lseek(file, offs, SEEK_SET);
2240
if (ret_offset < 0) {
2243
ret = read(file, buf, (ssize_t)n);
2246
#ifndef UNIV_HOTBACKUP
2247
os_mutex_exit(os_file_seek_mutexes[i]);
2248
#endif /* !UNIV_HOTBACKUP */
2250
os_mutex_enter(os_file_count_mutex);
2251
os_n_pending_reads--;
2252
os_mutex_exit(os_file_count_mutex);
2259
/*******************************************************************//**
Does a synchronous write operation in Posix.
Two implementations are visible: a single pwrite() call, and an lseek()
followed by write() performed while holding one of the
os_file_seek_mutexes. Both keep os_n_pending_writes updated under
os_file_count_mutex and, under UNIV_DO_FLUSH, may call os_file_flush()
after the write.
NOTE(review): the pwrite() path is guarded by HAVE_PWRITE together with
HAVE_BROKEN_PREAD (the pread macro) -- confirm that this is intended.
@return number of bytes written, -1 if error */
2266
os_file_t file, /*!< in: handle to a file */
2267
const void* buf, /*!< in: buffer from where to write */
2268
ulint n, /*!< in: number of bytes to write */
2269
ulint offset, /*!< in: least significant 32 bits of file
2270
offset where to write */
2271
ulint offset_high) /*!< in: most significant 32 bits of
2277
ut_a((offset & 0xFFFFFFFFUL) == offset);
2279
/* If off_t is > 4 bytes in size, then we assume we can pass a
2282
if (sizeof(off_t) > 4) {
2283
offs = (off_t)offset + (((off_t)offset_high) << 32);
2285
offs = (off_t)offset;
2287
if (offset_high > 0) {
2289
"InnoDB: Error: file write"
2290
" at offset > 4 GB\n");
2296
#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2297
os_mutex_enter(os_file_count_mutex);
2298
os_file_n_pending_pwrites++;
2299
os_n_pending_writes++;
2300
os_mutex_exit(os_file_count_mutex);
2302
ret = pwrite(file, buf, (ssize_t)n, offs);
2304
os_mutex_enter(os_file_count_mutex);
2305
os_file_n_pending_pwrites--;
2306
os_n_pending_writes--;
2307
os_mutex_exit(os_file_count_mutex);
2309
# ifdef UNIV_DO_FLUSH
2310
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2311
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2312
&& !os_do_not_call_flush_at_each_write) {
2314
/* Always do fsync to reduce the probability that when
2315
the OS crashes, a database page is only partially
2316
physically written to disk. */
2318
/* A failed flush aborts via the assertion. */
ut_a(TRUE == os_file_flush(file));
2320
# endif /* UNIV_DO_FLUSH */
2326
# ifndef UNIV_HOTBACKUP
2328
# endif /* !UNIV_HOTBACKUP */
2330
/* Fallback path: count the write as pending for its whole duration. */
os_mutex_enter(os_file_count_mutex);
2331
os_n_pending_writes++;
2332
os_mutex_exit(os_file_count_mutex);
2334
# ifndef UNIV_HOTBACKUP
2335
/* Protect the seek / write operation with a mutex */
2336
/* The mutex is selected by the descriptor value modulo the array size. */
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2338
os_mutex_enter(os_file_seek_mutexes[i]);
2339
# endif /* !UNIV_HOTBACKUP */
2341
ret_offset = lseek(file, offs, SEEK_SET);
2343
if (ret_offset < 0) {
2349
ret = write(file, buf, (ssize_t)n);
2351
# ifdef UNIV_DO_FLUSH
2352
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2353
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2354
&& !os_do_not_call_flush_at_each_write) {
2356
/* Always do fsync to reduce the probability that when
2357
the OS crashes, a database page is only partially
2358
physically written to disk. */
2360
ut_a(TRUE == os_file_flush(file));
2362
# endif /* UNIV_DO_FLUSH */
2365
# ifndef UNIV_HOTBACKUP
2366
os_mutex_exit(os_file_seek_mutexes[i]);
2367
# endif /* !UNIV_HOTBACKUP */
2369
os_mutex_enter(os_file_count_mutex);
2370
os_n_pending_writes--;
2371
os_mutex_exit(os_file_count_mutex);
2379
/*******************************************************************//**
NOTE! Use the corresponding macro os_file_read(), not this function
directly!
Requests a synchronous positioned read operation.
Windows path: SetFilePointer() followed by ReadFile() under a seek mutex.
Other platforms: os_file_pread(). A failed or short read is passed to
os_file_handle_error(NULL, "read").
@return TRUE if request was successful, FALSE if fail */
2388
os_file_t file, /*!< in: handle to a file */
2389
void* buf, /*!< in: buffer where to read */
2390
ulint offset, /*!< in: least significant 32 bits of file
2391
offset where to read */
2392
ulint offset_high, /*!< in: most significant 32 bits of
2394
ulint n) /*!< in: number of bytes to read */
2403
#ifndef UNIV_HOTBACKUP
2405
#endif /* !UNIV_HOTBACKUP */
2407
/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2408
no more than 32 bits. */
2409
ut_a((offset & 0xFFFFFFFFUL) == offset);
2410
ut_a((n & 0xFFFFFFFFUL) == n);
2413
os_bytes_read_since_printout += n;
2420
low = (DWORD) offset;
2421
high = (DWORD) offset_high;
2423
os_mutex_enter(os_file_count_mutex);
2424
os_n_pending_reads++;
2425
os_mutex_exit(os_file_count_mutex);
2427
#ifndef UNIV_HOTBACKUP
2428
/* Protect the seek / read operation with a mutex */
2429
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2431
os_mutex_enter(os_file_seek_mutexes[i]);
2432
#endif /* !UNIV_HOTBACKUP */
2434
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2436
/* 0xFFFFFFFF can also be a valid low half of a position, so
GetLastError() is consulted to detect a real failure. */
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2438
/* Undo the mutex and the pending-read count before bailing out. */
#ifndef UNIV_HOTBACKUP
2439
os_mutex_exit(os_file_seek_mutexes[i]);
2440
#endif /* !UNIV_HOTBACKUP */
2442
os_mutex_enter(os_file_count_mutex);
2443
os_n_pending_reads--;
2444
os_mutex_exit(os_file_count_mutex);
2446
goto error_handling;
2449
ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2451
#ifndef UNIV_HOTBACKUP
2452
os_mutex_exit(os_file_seek_mutexes[i]);
2453
#endif /* !UNIV_HOTBACKUP */
2455
os_mutex_enter(os_file_count_mutex);
2456
os_n_pending_reads--;
2457
os_mutex_exit(os_file_count_mutex);
2459
/* Only a read of exactly n bytes is accepted. */
if (ret && len == n) {
2466
os_bytes_read_since_printout += n;
2469
ret = os_file_pread(file, buf, n, offset, offset_high);
2471
/* Only a read of exactly n bytes is accepted. */
if ((ulint)ret == n) {
2477
"InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
2478
"InnoDB: Was only able to read %ld.\n",
2479
(ulong)n, (ulong)offset_high,
2480
(ulong)offset, (long)ret);
2481
#endif /* __WIN__ */
2485
retry = os_file_handle_error(NULL, "read");
2492
"InnoDB: Fatal error: cannot read from file."
2493
" OS error number %lu.\n",
2495
(ulong) GetLastError()
2507
/*******************************************************************//**
NOTE! Use the corresponding macro os_file_read_no_error_handling(),
not directly this function!
Requests a synchronous positioned read operation. This function does not do
any error handling. In case of error it returns FALSE.
The read itself mirrors os_file_read_func(); the failure path calls
os_file_handle_error_no_exit(NULL, "read") instead of
os_file_handle_error().
@return TRUE if request was successful, FALSE if fail */
2515
os_file_read_no_error_handling_func(
2516
/*================================*/
2517
os_file_t file, /*!< in: handle to a file */
2518
void* buf, /*!< in: buffer where to read */
2519
ulint offset, /*!< in: least significant 32 bits of file
2520
offset where to read */
2521
ulint offset_high, /*!< in: most significant 32 bits of
2523
ulint n) /*!< in: number of bytes to read */
2532
#ifndef UNIV_HOTBACKUP
2534
#endif /* !UNIV_HOTBACKUP */
2536
/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2537
no more than 32 bits. */
2538
ut_a((offset & 0xFFFFFFFFUL) == offset);
2539
ut_a((n & 0xFFFFFFFFUL) == n);
2542
os_bytes_read_since_printout += n;
2549
low = (DWORD) offset;
2550
high = (DWORD) offset_high;
2552
os_mutex_enter(os_file_count_mutex);
2553
os_n_pending_reads++;
2554
os_mutex_exit(os_file_count_mutex);
2556
#ifndef UNIV_HOTBACKUP
2557
/* Protect the seek / read operation with a mutex */
2558
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2560
os_mutex_enter(os_file_seek_mutexes[i]);
2561
#endif /* !UNIV_HOTBACKUP */
2563
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2565
/* 0xFFFFFFFF can also be a valid low half of a position, so
GetLastError() is consulted to detect a real failure. */
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2567
/* Undo the mutex and the pending-read count before bailing out. */
#ifndef UNIV_HOTBACKUP
2568
os_mutex_exit(os_file_seek_mutexes[i]);
2569
#endif /* !UNIV_HOTBACKUP */
2571
os_mutex_enter(os_file_count_mutex);
2572
os_n_pending_reads--;
2573
os_mutex_exit(os_file_count_mutex);
2575
goto error_handling;
2578
ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2580
#ifndef UNIV_HOTBACKUP
2581
os_mutex_exit(os_file_seek_mutexes[i]);
2582
#endif /* !UNIV_HOTBACKUP */
2584
os_mutex_enter(os_file_count_mutex);
2585
os_n_pending_reads--;
2586
os_mutex_exit(os_file_count_mutex);
2588
/* Only a read of exactly n bytes is accepted. */
if (ret && len == n) {
2595
os_bytes_read_since_printout += n;
2598
ret = os_file_pread(file, buf, n, offset, offset_high);
2600
/* Only a read of exactly n bytes is accepted. */
if ((ulint)ret == n) {
2604
#endif /* __WIN__ */
2608
retry = os_file_handle_error_no_exit(NULL, "read");
2617
/*******************************************************************//**
Rewind file to its start, read at most size - 1 bytes from it to str, and
NUL-terminate str. All errors are silently ignored. This function is
mostly meant to be used with temporary files. */
2623
os_file_read_string(
2624
/*================*/
2625
FILE* file, /*!< in: file to read from */
2626
char* str, /*!< in: buffer where to read */
2627
ulint size) /*!< in: size of buffer */
2636
/* One byte of the buffer is kept back for the terminating NUL.
NOTE(review): size - 1 assumes size > 0; a guard may exist in lines that
are not visible here -- confirm. */
flen = fread(str, 1, size - 1, file);
2640
/*******************************************************************//**
NOTE! Use the corresponding macro os_file_write(), not this function
directly!
Requests a synchronous write operation.
Windows path: SetFilePointer() followed by WriteFile() under a seek mutex,
retried while the error is ERROR_LOCK_VIOLATION (up to 100 times, one
second apart). Other platforms: os_file_pwrite(). On a failed or short
write a detailed diagnostic is printed unless os_has_said_disk_full is
already set, and the flag is then set.
@return TRUE if request was successful, FALSE if fail */
2649
const char* name, /*!< in: name of the file or path as a
2650
null-terminated string */
2651
os_file_t file, /*!< in: handle to a file */
2652
const void* buf, /*!< in: buffer from which to write */
2653
ulint offset, /*!< in: least significant 32 bits of file
2654
offset where to write */
2655
ulint offset_high, /*!< in: most significant 32 bits of
2657
ulint n) /*!< in: number of bytes to write */
2665
ulint n_retries = 0;
2667
#ifndef UNIV_HOTBACKUP
2669
#endif /* !UNIV_HOTBACKUP */
2671
/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2672
no more than 32 bits. */
2673
ut_a((offset & 0xFFFFFFFFUL) == offset);
2674
ut_a((n & 0xFFFFFFFFUL) == n);
2682
low = (DWORD) offset;
2683
high = (DWORD) offset_high;
2685
os_mutex_enter(os_file_count_mutex);
2686
os_n_pending_writes++;
2687
os_mutex_exit(os_file_count_mutex);
2689
#ifndef UNIV_HOTBACKUP
2690
/* Protect the seek / write operation with a mutex */
2691
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2693
os_mutex_enter(os_file_seek_mutexes[i]);
2694
#endif /* !UNIV_HOTBACKUP */
2696
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2698
/* 0xFFFFFFFF can also be a valid low half of a position, so
GetLastError() is consulted to detect a real failure. */
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2700
/* Undo the mutex and the pending-write count before reporting. */
#ifndef UNIV_HOTBACKUP
2701
os_mutex_exit(os_file_seek_mutexes[i]);
2702
#endif /* !UNIV_HOTBACKUP */
2704
os_mutex_enter(os_file_count_mutex);
2705
os_n_pending_writes--;
2706
os_mutex_exit(os_file_count_mutex);
2708
ut_print_timestamp(stderr);
2711
" InnoDB: Error: File pointer positioning to"
2712
" file %s failed at\n"
2713
"InnoDB: offset %lu %lu. Operating system"
2714
" error number %lu.\n"
2715
"InnoDB: Some operating system error numbers"
2716
" are described at\n"
2718
REFMAN "operating-system-error-codes.html\n",
2719
name, (ulong) offset_high, (ulong) offset,
2720
(ulong) GetLastError());
2725
ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2727
/* Always do fsync to reduce the probability that when the OS crashes,
2728
a database page is only partially physically written to disk. */
2730
# ifdef UNIV_DO_FLUSH
2731
if (!os_do_not_call_flush_at_each_write) {
2732
ut_a(TRUE == os_file_flush(file));
2734
# endif /* UNIV_DO_FLUSH */
2736
#ifndef UNIV_HOTBACKUP
2737
os_mutex_exit(os_file_seek_mutexes[i]);
2738
#endif /* !UNIV_HOTBACKUP */
2740
os_mutex_enter(os_file_count_mutex);
2741
os_n_pending_writes--;
2742
os_mutex_exit(os_file_count_mutex);
2744
/* Only a write of exactly n bytes is accepted. */
if (ret && len == n) {
2749
/* If some background file system backup tool is running, then, at
2750
least in Windows 2000, we may get here a specific error. Let us
2751
retry the operation 100 times, with 1 second waits. */
2753
if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2755
os_thread_sleep(1000000);
2762
/* The detailed diagnostic is printed only while the flag is still
unset; it is set at the end of this branch. */
if (!os_has_said_disk_full) {
2764
err = (ulint)GetLastError();
2766
ut_print_timestamp(stderr);
2769
" InnoDB: Error: Write to file %s failed"
2770
" at offset %lu %lu.\n"
2771
"InnoDB: %lu bytes should have been written,"
2772
" only %lu were written.\n"
2773
"InnoDB: Operating system error number %lu.\n"
2774
"InnoDB: Check that your OS and file system"
2775
" support files of this size.\n"
2776
"InnoDB: Check also that the disk is not full"
2777
" or a disk quota exceeded.\n",
2778
name, (ulong) offset_high, (ulong) offset,
2779
(ulong) n, (ulong) len, (ulong) err);
2781
/* NOTE(review): err holds a GetLastError() value, while strerror()
interprets errno values, so the printed text may not describe the actual
failure -- confirm. */
if (strerror((int)err) != NULL) {
2783
"InnoDB: Error number %lu means '%s'.\n",
2784
(ulong) err, strerror((int)err));
2788
"InnoDB: Some operating system error numbers"
2789
" are described at\n"
2791
REFMAN "operating-system-error-codes.html\n");
2793
os_has_said_disk_full = TRUE;
2800
ret = os_file_pwrite(file, buf, n, offset, offset_high);
2802
/* Only a write of exactly n bytes is accepted. */
if ((ulint)ret == n) {
2807
/* The detailed diagnostic is printed only while the flag is still
unset; it is set at the end of this branch. */
if (!os_has_said_disk_full) {
2809
ut_print_timestamp(stderr);
2812
" InnoDB: Error: Write to file %s failed"
2813
" at offset %lu %lu.\n"
2814
"InnoDB: %lu bytes should have been written,"
2815
" only %ld were written.\n"
2816
"InnoDB: Operating system error number %lu.\n"
2817
"InnoDB: Check that your OS and file system"
2818
" support files of this size.\n"
2819
"InnoDB: Check also that the disk is not full"
2820
" or a disk quota exceeded.\n",
2821
name, offset_high, offset, n, (long int)ret,
2823
if (strerror(errno) != NULL) {
2825
"InnoDB: Error number %lu means '%s'.\n",
2826
(ulint)errno, strerror(errno));
2830
"InnoDB: Some operating system error numbers"
2831
" are described at\n"
2833
REFMAN "operating-system-error-codes.html\n");
2835
os_has_said_disk_full = TRUE;
2842
/*******************************************************************//**
Check the existence and type of the given file.
Uses _stat() in one branch and stat() in the other. An ENOENT or ENOTDIR
failure is handled as "file does not exist"; any other failure is reported
through os_file_handle_error_no_exit(path, "stat").
@return TRUE if call succeeded */
2849
const char* path, /*!< in: pathname of the file */
2850
ibool* exists, /*!< out: TRUE if file exists */
2851
os_file_type_t* type) /*!< out: type of the file (if it exists) */
2855
struct _stat statinfo;
2857
ret = _stat(path, &statinfo);
2858
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2859
/* file does not exist */
2863
/* file exists, but stat call failed */
2865
os_file_handle_error_no_exit(path, "stat");
2870
/* Classify by testing the type bits in st_mode. */
if (_S_IFDIR & statinfo.st_mode) {
2871
*type = OS_FILE_TYPE_DIR;
2872
} else if (_S_IFREG & statinfo.st_mode) {
2873
*type = OS_FILE_TYPE_FILE;
2875
*type = OS_FILE_TYPE_UNKNOWN;
2883
struct stat statinfo;
2885
ret = stat(path, &statinfo);
2886
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2887
/* file does not exist */
2891
/* file exists, but stat call failed */
2893
os_file_handle_error_no_exit(path, "stat");
2898
/* NOTE(review): stat() follows symbolic links, so the S_ISLNK test
below is not expected to match for a stat() result (lstat() reports
links) -- confirm whether OS_FILE_TYPE_LINK is reachable here. */
if (S_ISDIR(statinfo.st_mode)) {
2899
*type = OS_FILE_TYPE_DIR;
2900
} else if (S_ISLNK(statinfo.st_mode)) {
2901
*type = OS_FILE_TYPE_LINK;
2902
} else if (S_ISREG(statinfo.st_mode)) {
2903
*type = OS_FILE_TYPE_FILE;
2905
*type = OS_FILE_TYPE_UNKNOWN;
2914
/*******************************************************************//**
2915
This function returns information about the specified file.
On success the type, ctime, atime, mtime and size fields of stat_info are
filled in from the result of the stat call. A failed stat call with errno
ENOENT or ENOTDIR is treated as "file does not exist"; any other failure
is reported through os_file_handle_error_no_exit().
2916
@return TRUE if stat information found */
2921
const char* path, /*!< in: pathname of the file */
2922
os_file_stat_t* stat_info) /*!< information of a file in a
2927
struct _stat statinfo;
2929
ret = _stat(path, &statinfo);
2930
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2931
/* file does not exist */
2935
/* file exists, but stat call failed */
2937
os_file_handle_error_no_exit(path, "stat");
2941
/* _stat() variant: classify by the _S_IFDIR/_S_IFREG mode bits */
if (_S_IFDIR & statinfo.st_mode) {
2942
stat_info->type = OS_FILE_TYPE_DIR;
2943
} else if (_S_IFREG & statinfo.st_mode) {
2944
stat_info->type = OS_FILE_TYPE_FILE;
2946
stat_info->type = OS_FILE_TYPE_UNKNOWN;
2949
/* Copy the timestamps and the size reported by _stat() */
stat_info->ctime = statinfo.st_ctime;
2950
stat_info->atime = statinfo.st_atime;
2951
stat_info->mtime = statinfo.st_mtime;
2952
stat_info->size = statinfo.st_size;
2957
/* stat() based variant */
struct stat statinfo;
2959
ret = stat(path, &statinfo);
2961
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2962
/* file does not exist */
2966
/* file exists, but stat call failed */
2968
os_file_handle_error_no_exit(path, "stat");
2973
/* NOTE(review): stat() follows symbolic links, so the S_ISLNK branch
below is presumably never taken; lstat() would be needed to report
OS_FILE_TYPE_LINK. Confirm the intended behaviour. */
if (S_ISDIR(statinfo.st_mode)) {
2974
stat_info->type = OS_FILE_TYPE_DIR;
2975
} else if (S_ISLNK(statinfo.st_mode)) {
2976
stat_info->type = OS_FILE_TYPE_LINK;
2977
} else if (S_ISREG(statinfo.st_mode)) {
2978
stat_info->type = OS_FILE_TYPE_FILE;
2980
stat_info->type = OS_FILE_TYPE_UNKNOWN;
2983
/* Copy the timestamps and the size reported by stat() */
stat_info->ctime = statinfo.st_ctime;
2984
stat_info->atime = statinfo.st_atime;
2985
stat_info->mtime = statinfo.st_mtime;
2986
stat_info->size = statinfo.st_size;
2992
/* path name separator character */
2994
# define OS_FILE_PATH_SEPARATOR '\\'
2996
# define OS_FILE_PATH_SEPARATOR '/'
2999
/****************************************************************//**
3000
The function os_file_dirname returns a directory component of a
3001
null-terminated pathname string. In the usual case, dirname returns
3002
the string up to, but not including, the final '/', and basename
3003
is the component following the final '/'. Trailing '/' charac-
3004
ters are not counted as part of the pathname.
3006
If path does not contain a slash, dirname returns the string ".".
3008
Concatenating the string returned by dirname, a "/", and the basename
3009
yields a complete pathname.
3011
The return value is a copy of the directory component of the pathname.
3012
The copy is allocated from heap. It is the caller's responsibility
3013
to free it after it is no longer needed.
3015
The following list of examples (taken from SUSv2) shows the strings
3016
returned by dirname and basename for different paths:
3018
path dirname basename
3019
"/usr/lib" "/usr" "lib"
3026
@return own: directory component of the pathname */
3031
const char* path) /*!< in: pathname */
3033
/* Find the offset of the last slash */
3034
const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
3036
/* No slash in the path, return "." */
3038
return(mem_strdup("."));
3041
/* Ok, there is a slash */
3043
if (last_slash == path) {
3044
/* last slash is the first char of the path */
3046
/* NOTE(review): a literal "/" is returned here even when
OS_FILE_PATH_SEPARATOR is defined as '\\'; confirm that this is
intended for builds that use the backslash separator. */
return(mem_strdup("/"));
3049
/* Non-trivial directory component */
3051
/* Copy only the characters that precede the last separator */
return(mem_strdupl(path, last_slash - path));
3054
/****************************************************************//**
3055
Creates all missing subdirectories along the given path.
The directory component of path is taken with os_file_dirname(); if it does
not exist yet, the function first calls itself on that directory (so that
the ancestors are handled before it) and then calls
os_file_create_directory() for it.
3056
@return TRUE if call succeeded FALSE otherwise */
3059
os_file_create_subdirs_if_needed(
3060
/*=============================*/
3061
const char* path) /*!< in: path name */
3064
ibool success, subdir_exists;
3065
os_file_type_t type;
3067
/* NOTE(review): os_file_dirname() is documented as returning a heap
copy owned by the caller; no release of subdir is visible in this
excerpt. Confirm that it is freed on every return path. */
subdir = os_file_dirname(path);
3068
if (strlen(subdir) == 1
3069
&& (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
3070
/* subdir is root or cwd, nothing to do */
3076
/* Test if subdir exists */
3077
success = os_file_status(subdir, &subdir_exists, &type);
3078
if (success && !subdir_exists) {
3079
/* subdir does not exist, create it */
3080
/* Recurse on the directory itself so that its own ancestors are
handled first */
success = os_file_create_subdirs_if_needed(subdir);
3086
success = os_file_create_directory(subdir, FALSE);
3094
#ifndef UNIV_HOTBACKUP
3095
/****************************************************************//**
3096
Returns a pointer to the nth slot in the aio array.
The index is checked against array->n_slots with ut_a() before the slot
address is computed.
3097
@return pointer to slot */
3100
os_aio_array_get_nth_slot(
3101
/*======================*/
3102
os_aio_array_t* array, /*!< in: aio array */
3103
ulint index) /*!< in: index of the slot */
3105
/* The index must lie within the array */
ut_a(index < array->n_slots);
3107
return((array->slots) + index);
3110
#if defined(LINUX_NATIVE_AIO)
3111
/******************************************************************//**
3112
Creates an io_context for native linux AIO.
The context is zeroed and then initialized with io_setup(). When io_setup()
fails with EAGAIN the call is retried, sleeping
OS_AIO_IO_SETUP_RETRY_SLEEP between attempts, for at most
OS_AIO_IO_SETUP_RETRY_ATTEMPTS attempts. Failures are reported on stderr
together with a hint on how to disable native AIO.
3113
@return TRUE on success. */
3116
os_aio_linux_create_io_ctx(
3117
/*=======================*/
3118
ulint max_events, /*!< in: number of events. */
3119
io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
3125
/* Start from an all-zero context */
memset(io_ctx, 0x0, sizeof(*io_ctx));
3127
/* Initialize the io_ctx. Tell it how many pending
3128
IO requests this context will handle. */
3130
ret = io_setup(max_events, io_ctx);
3132
#if defined(UNIV_AIO_DEBUG)
3134
"InnoDB: Linux native AIO:"
3135
" initialized io_ctx for segment\n");
3137
/* Success. Return now. */
3141
/* If we hit EAGAIN we'll make a few attempts before failing. */
3146
/* First time around. */
3147
ut_print_timestamp(stderr);
3149
" InnoDB: Warning: io_setup() failed"
3150
" with EAGAIN. Will make %d attempts"
3151
" before giving up.\n",
3152
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3155
if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3158
"InnoDB: Warning: io_setup() attempt"
3161
/* Wait before the next io_setup() attempt */
os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3165
/* Have tried enough. Better call it a day. */
3166
ut_print_timestamp(stderr);
3168
" InnoDB: Error: io_setup() failed"
3169
" with EAGAIN after %d attempts.\n",
3170
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3174
ut_print_timestamp(stderr);
3176
" InnoDB: Error: Linux Native AIO interface"
3177
" is not supported on this platform. Please"
3178
" check your OS documentation and install"
3179
" appropriate binary of InnoDB.\n");
3184
/* Any other failure: the code is printed as -ret, i.e. ret is treated
as a negated error number */
ut_print_timestamp(stderr);
3186
" InnoDB: Error: Linux Native AIO setup"
3187
" returned following error[%d]\n", -ret);
3192
"InnoDB: You can disable Linux Native AIO by"
3193
" setting innodb_native_aio = off in my.cnf\n");
3196
#endif /* LINUX_NATIVE_AIO */
3198
/******************************************************************//**
3199
Creates an aio wait array. Note that we return NULL in case of failure.
3200
We don't care about freeing memory here because we assume that a
3201
failure will result in server refusing to start up.
The array gets a mutex, the not_full and is_empty events (is_empty is set
initially) and n slots. With Linux native AIO enabled it additionally gets
one io_context per segment and one io_event per slot.
3202
@return own: aio array, NULL on failure */
3205
os_aio_array_create(
3206
/*================*/
3207
ulint n, /*!< in: maximum number of pending aio
3208
operations allowed; n must be
3209
divisible by n_segments */
3210
ulint n_segments) /*!< in: number of segments in the aio array */
3212
os_aio_array_t* array;
3214
os_aio_slot_t* slot;
3217
#elif defined(LINUX_NATIVE_AIO)
3218
struct io_event* aio_event = NULL;
3221
ut_a(n_segments > 0);
3223
array = static_cast<os_aio_array_t *>(ut_malloc(sizeof(os_aio_array_t)));
3225
array->mutex = os_mutex_create();
3226
array->not_full = os_event_create(NULL);
3227
array->is_empty = os_event_create(NULL);
3229
/* A freshly created array has no reserved slots */
os_event_set(array->is_empty);
3232
array->n_segments = n_segments;
3233
array->n_reserved = 0;
3235
array->slots = static_cast<os_aio_slot_t *>(ut_malloc(n * sizeof(os_aio_slot_t)));
3237
/* NOTE(review): unlike the neighbouring allocations this ut_malloc()
result is assigned without a static_cast; confirm that this line
compiles where the file is built as C++. */
array->handles = ut_malloc(n * sizeof(HANDLE));
3240
#if defined(LINUX_NATIVE_AIO)
3241
array->aio_ctx = NULL;
3242
array->aio_events = NULL;
3244
/* If we are not using native aio interface then skip this
3245
part of initialization. */
3246
if (!srv_use_native_aio) {
3247
goto skip_native_aio;
3250
/* Initialize the io_context array. One io_context
3251
per segment in the array. */
3253
array->aio_ctx = (io_context**) ut_malloc(n_segments *
3254
sizeof(*array->aio_ctx));
3255
for (i = 0; i < n_segments; ++i) {
3256
if (!os_aio_linux_create_io_ctx(n/n_segments,
3257
&array->aio_ctx[i])) {
3258
/* If something bad happened during aio setup
3259
we should call it a day and return right away.
3260
We don't care about any leaks because a failure
3261
to initialize the io subsystem means that the
3262
server (or at least the innodb storage engine)
3263
is not going to startup. */
3268
/* Initialize the event array. One event per slot. */
3269
aio_event = (io_event*) ut_malloc(n * sizeof(io_event));
3270
memset(aio_event, 0x0, sizeof(io_event) * n);
3271
array->aio_events = aio_event;
3274
#endif /* LINUX_NATIVE_AIO */
3275
/* Per-slot initialization */
for (i = 0; i < n; i++) {
3276
slot = os_aio_array_get_nth_slot(array, i);
3279
slot->reserved = FALSE;
3281
/* The event handle created for the slot is also stored in the slot's
OVERLAPPED structure and in array->handles[i] */
slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
3283
over = &(slot->control);
3285
over->hEvent = slot->handle;
3287
*((array->handles) + i) = over->hEvent;
3289
#elif defined(LINUX_NATIVE_AIO)
3291
memset(&slot->control, 0x0, sizeof(slot->control));
3300
/************************************************************************//**
3301
Frees an aio wait array: closes the per-slot handles and frees the handle
array where those exist, frees the mutex and the two events, frees the
native AIO event and context arrays when srv_use_native_aio is set, and
finally frees the slots. */
3306
os_aio_array_t* array) /*!< in, own: array to free */
3311
for (i = 0; i < array->n_slots; i++) {
3312
os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3313
CloseHandle(slot->handle);
3315
#endif /* WIN_ASYNC_IO */
3318
ut_free(array->handles);
3319
#endif /* __WIN__ */
3320
os_mutex_free(array->mutex);
3321
os_event_free(array->not_full);
3322
os_event_free(array->is_empty);
3324
#if defined(LINUX_NATIVE_AIO)
3325
/* NOTE(review): only the arrays are freed here; no call that tears
down the individual io contexts is visible in this excerpt. */
if (srv_use_native_aio) {
3326
ut_free(array->aio_events);
3327
ut_free(array->aio_ctx);
3329
#endif /* LINUX_NATIVE_AIO */
3331
ut_free(array->slots);
3335
/***********************************************************************
3336
Initializes the asynchronous io system. Creates one array each for ibuf
3337
and log i/o. Also creates one array each for read and write where each
3338
array is divided logically into n_read_segs and n_write_segs
3339
respectively. The caller must create an i/o handler thread for each
3340
segment in these arrays. This function also creates the sync array.
3341
No i/o handler thread needs to be created for that */
3346
ulint n_per_seg, /*!< in: maximum number of pending aio
3347
operations allowed per segment */
3348
ulint n_read_segs, /*!< in: number of reader threads */
3349
ulint n_write_segs, /*!< in: number of writer threads */
3350
ulint n_slots_sync) /*!< in: number of slots in the sync aio
3354
ulint n_segments = 2 + n_read_segs + n_write_segs;
3356
ut_ad(n_segments >= 4);
3358
os_io_init_simple();
3360
for (i = 0; i < n_segments; i++) {
3361
srv_set_io_thread_op_info(i, "not started yet");
3365
/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
3367
/* Global segment 0: the insert buffer array, a single segment */
os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3368
if (os_aio_ibuf_array == NULL) {
3372
srv_io_thread_function[0] = "insert buffer thread";
3374
/* Global segment 1: the log array, a single segment */
os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3375
if (os_aio_log_array == NULL) {
3379
srv_io_thread_function[1] = "log thread";
3381
/* Global segments 2 .. 2 + n_read_segs - 1: the read array */
os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
3383
if (os_aio_read_array == NULL) {
3387
for (i = 2; i < 2 + n_read_segs; i++) {
3388
ut_a(i < SRV_MAX_N_IO_THREADS);
3389
srv_io_thread_function[i] = "read thread";
3392
/* The remaining global segments belong to the write array */
os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
3394
if (os_aio_write_array == NULL) {
3398
for (i = 2 + n_read_segs; i < n_segments; i++) {
3399
ut_a(i < SRV_MAX_N_IO_THREADS);
3400
srv_io_thread_function[i] = "write thread";
3403
/* The sync array is not counted in n_segments */
os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3404
if (os_aio_sync_array == NULL) {
3409
os_aio_n_segments = n_segments;
3413
/* NOTE(review): the element size used is sizeof(void*) although the
elements are os_event_t; this assumes os_event_t is pointer sized.
Confirm against the os_event_t definition. */
os_aio_segment_wait_events = static_cast<os_event_t *>(ut_malloc(n_segments * sizeof(void*)));
3415
for (i = 0; i < n_segments; i++) {
3416
os_aio_segment_wait_events[i] = os_event_create(NULL);
3419
os_last_printout = time(NULL);
3428
/***********************************************************************
3429
Frees the asynchronous io system: the five aio arrays (ibuf, log, read,
write, sync), every per-segment wait event and the wait event array. The
global pointers and os_aio_n_segments are reset afterwards. */
3437
os_aio_array_free(os_aio_ibuf_array);
3438
os_aio_ibuf_array = NULL;
3439
os_aio_array_free(os_aio_log_array);
3440
os_aio_log_array = NULL;
3441
os_aio_array_free(os_aio_read_array);
3442
os_aio_read_array = NULL;
3443
os_aio_array_free(os_aio_write_array);
3444
os_aio_write_array = NULL;
3445
os_aio_array_free(os_aio_sync_array);
3446
os_aio_sync_array = NULL;
3448
/* Release each per-segment wait event before the array holding them */
for (i = 0; i < os_aio_n_segments; i++) {
3449
os_event_free(os_aio_segment_wait_events[i]);
3452
ut_free(os_aio_segment_wait_events);
3453
os_aio_segment_wait_events = 0;
3454
os_aio_n_segments = 0;
3458
/************************************************************************//**
3459
Wakes up all async i/o threads in the array in Windows async i/o at
3463
os_aio_array_wake_win_aio_at_shutdown(
3464
/*==================================*/
3465
os_aio_array_t* array) /*!< in: aio array */
3469
/* Signal the event handle of every slot in the array */
for (i = 0; i < array->n_slots; i++) {
3471
SetEvent((array->slots + i)->handle);
3476
/************************************************************************//**
3477
Wakes up all async i/o threads so that they know to exit themselves in
3481
os_aio_wake_all_threads_at_shutdown(void)
3482
/*=====================================*/
3487
/* This code wakes up all aio threads in Windows native aio: the read,
write, ibuf and log arrays are signalled (the sync array is not) */
3488
os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
3489
os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
3490
os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
3491
os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
3493
#elif defined(LINUX_NATIVE_AIO)
3495
/* When using native AIO interface the io helper threads
3496
wait on io_getevents with a timeout value of 500ms. At
3497
each wake up these threads check the server status.
3498
No need to do anything to wake them up. */
3500
if (srv_use_native_aio) {
3503
/* Fall through to simulated AIO handler wakeup if we are
3504
not using native AIO. */
3506
/* This loop wakes up all simulated aio threads by setting the wait
event of every global segment */
3508
for (i = 0; i < os_aio_n_segments; i++) {
3510
os_event_set(os_aio_segment_wait_events[i]);
3514
/************************************************************************//**
3515
Waits until there are no pending writes in os_aio_write_array. There can
3516
be other, synchronous, pending writes.
The wait is done on the write array's is_empty event. */
3519
os_aio_wait_until_no_pending_writes(void)
3520
/*=====================================*/
3522
os_event_wait(os_aio_write_array->is_empty);
3525
/**********************************************************************//**
3526
Calculates segment number for a slot.
For the read and the write array the local segment is slot->pos divided by
the number of slots per segment; read segments are offset by 2 and write
segments by 2 plus the number of read segments. The array must be the ibuf,
log, read or write array (asserted with ut_a()).
3527
@return segment number (which is the number used by, for example,
3528
i/o-handler threads) */
3531
os_aio_get_segment_no_from_slot(
3532
/*============================*/
3533
os_aio_array_t* array, /*!< in: aio wait array */
3534
os_aio_slot_t* slot) /*!< in: slot in this array */
3539
if (array == os_aio_ibuf_array) {
3542
} else if (array == os_aio_log_array) {
3545
} else if (array == os_aio_read_array) {
3546
/* seg_len: number of slots in one read segment */
seg_len = os_aio_read_array->n_slots
3547
/ os_aio_read_array->n_segments;
3549
segment = 2 + slot->pos / seg_len;
3551
ut_a(array == os_aio_write_array);
3552
/* seg_len: number of slots in one write segment */
seg_len = os_aio_write_array->n_slots
3553
/ os_aio_write_array->n_segments;
3555
/* Write segments are numbered after all read segments */
segment = os_aio_read_array->n_segments + 2
3556
+ slot->pos / seg_len;
3562
/**********************************************************************//**
3563
Calculates local segment number and aio array from global segment number.
Global segment 0 maps to the ibuf array and 1 to the log array; the next
os_aio_read_array->n_segments numbers map to the read array and all
remaining ones to the write array.
3564
@return local segment number within the aio array */
3567
os_aio_get_array_and_local_segment(
3568
/*===============================*/
3569
os_aio_array_t** array, /*!< out: aio wait array */
3570
ulint global_segment)/*!< in: global segment number */
3574
ut_a(global_segment < os_aio_n_segments);
3576
if (global_segment == 0) {
3577
*array = os_aio_ibuf_array;
3580
} else if (global_segment == 1) {
3581
*array = os_aio_log_array;
3584
} else if (global_segment < os_aio_read_array->n_segments + 2) {
3585
*array = os_aio_read_array;
3587
/* Read segments start at global segment 2 */
segment = global_segment - 2;
3589
*array = os_aio_write_array;
3591
/* Write segments follow the read segments */
segment = global_segment - (os_aio_read_array->n_segments + 2);
3597
/*******************************************************************//**
3598
Requests for a slot in the aio array. If no slot is available, waits until
3599
not_full-event becomes signaled.
The search for a free slot starts in a preferred local segment derived from
the file offset and wraps around the whole array. The reserved slot is
filled in with the request data and, depending on the build, the OVERLAPPED
structure or the Linux iocb is prepared.
3600
@return pointer to slot */
3603
os_aio_array_reserve_slot(
3604
/*======================*/
3605
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3606
os_aio_array_t* array, /*!< in: aio array */
3607
fil_node_t* message1,/*!< in: message to be passed along with
3608
the aio operation */
3609
void* message2,/*!< in: message to be passed along with
3610
the aio operation */
3611
os_file_t file, /*!< in: file handle */
3612
const char* name, /*!< in: name of the file or path as a
3613
null-terminated string */
3614
void* buf, /*!< in: buffer where to read or from which
3616
ulint offset, /*!< in: least significant 32 bits of file
3618
ulint offset_high, /*!< in: most significant 32 bits of
3620
ulint len) /*!< in: length of the block to read or write */
3622
os_aio_slot_t* slot = NULL;
3624
OVERLAPPED* control;
3626
#elif defined(LINUX_NATIVE_AIO)
3634
ulint slots_per_seg;
3638
/* len must fit in 32 bits */
ut_a((len & 0xFFFFFFFFUL) == len);
3641
/* No need of a mutex. Only reading constant fields */
3642
slots_per_seg = array->n_slots / array->n_segments;
3644
/* We attempt to keep adjacent blocks in the same local
3645
segment. This can help in merging IO requests when we are
3646
doing simulated AIO */
3647
local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
3648
% array->n_segments;
3651
os_mutex_enter(array->mutex);
3653
/* Array is full: release the mutex and wait on not_full */
if (array->n_reserved == array->n_slots) {
3654
os_mutex_exit(array->mutex);
3656
if (!srv_use_native_aio) {
3657
/* If the handler threads are suspended, wake them
3658
so that we get more slots */
3660
os_aio_simulated_wake_handler_threads();
3663
os_event_wait(array->not_full);
3668
/* We start our search for an available slot from our preferred
3669
local segment and do a full scan of the array. We are
3670
guaranteed to find a slot in full scan. */
3671
for (i = local_seg * slots_per_seg, counter = 0;
3672
counter < array->n_slots; i++, counter++) {
3674
/* Wrap around at the end of the array */
i %= array->n_slots;
3675
slot = os_aio_array_get_nth_slot(array, i);
3677
if (slot->reserved == FALSE) {
3682
/* We MUST always be able to get hold of a free (unreserved) slot. */
3686
ut_a(slot->reserved == FALSE);
3687
array->n_reserved++;
3689
/* First reservation: the array is no longer empty */
if (array->n_reserved == 1) {
3690
os_event_reset(array->is_empty);
3693
/* Last free slot taken: later callers must wait on not_full */
if (array->n_reserved == array->n_slots) {
3694
os_event_reset(array->not_full);
3697
slot->reserved = TRUE;
3698
slot->reservation_time = time(NULL);
3699
slot->message1 = message1;
3700
slot->message2 = message2;
3705
slot->buf = static_cast<unsigned char *>(buf);
3706
slot->offset = offset;
3707
slot->offset_high = offset_high;
3708
slot->io_already_done = FALSE;
3711
/* Store the file offset in the slot's OVERLAPPED structure and reset
the slot's event */
control = &(slot->control);
3712
control->Offset = (DWORD)offset;
3713
control->OffsetHigh = (DWORD)offset_high;
3714
ResetEvent(slot->handle);
3716
#elif defined(LINUX_NATIVE_AIO)
3718
/* If we are not using native AIO skip this part. */
3719
if (!srv_use_native_aio) {
3720
goto skip_native_aio;
3723
/* Check if we are dealing with 64 bit arch.
3724
If not then make sure that offset fits in 32 bits. */
3725
if (sizeof(aio_offset) == 8) {
3726
aio_offset = offset_high;
3728
aio_offset += offset;
3730
ut_a(offset_high == 0);
3731
aio_offset = offset;
3734
/* Prepare the iocb embedded in the slot for a read or a write */
iocb = &slot->control;
3736
if (type == OS_FILE_READ) {
3737
io_prep_pread(iocb, file, buf, len, aio_offset);
3739
ut_a(type == OS_FILE_WRITE);
3740
io_prep_pwrite(iocb, file, buf, len, aio_offset);
3743
/* Store a back pointer from the iocb to its slot */
iocb->data = (void*)slot;
3746
/*fprintf(stderr, "Filled up Linux native iocb.\n");*/
3750
#endif /* LINUX_NATIVE_AIO */
3751
os_mutex_exit(array->mutex);
3756
/*******************************************************************//**
3757
Frees a slot in the aio array.
Under the array mutex the slot is marked unreserved and n_reserved is
decremented; not_full is set when the array has just stopped being full and
is_empty is set when no reserved slot remains. */
3760
os_aio_array_free_slot(
3761
/*===================*/
3762
os_aio_array_t* array, /*!< in: aio array */
3763
os_aio_slot_t* slot) /*!< in: pointer to slot */
3768
os_mutex_enter(array->mutex);
3770
ut_ad(slot->reserved);
3772
slot->reserved = FALSE;
3774
array->n_reserved--;
3776
/* The array was full before this release: wake waiting reservers */
if (array->n_reserved == array->n_slots - 1) {
3777
os_event_set(array->not_full);
3780
/* Nothing is pending in this array any more */
if (array->n_reserved == 0) {
3781
os_event_set(array->is_empty);
3786
ResetEvent(slot->handle);
3788
#elif defined(LINUX_NATIVE_AIO)
3790
if (srv_use_native_aio) {
3791
/* Clear the iocb so that the slot starts clean on reuse */
memset(&slot->control, 0x0, sizeof(slot->control));
3794
/*fprintf(stderr, "Freed up Linux native slot.\n");*/
3796
/* These fields should not be used if we are not
3797
using native AIO. */
3798
ut_ad(slot->n_bytes == 0);
3799
ut_ad(slot->ret == 0);
3803
os_mutex_exit(array->mutex);
3806
/**********************************************************************//**
3807
Wakes up a simulated aio i/o-handler thread if it has something to do.
Only for simulated aio (debug-asserted with ut_ad(!srv_use_native_aio)).
The slots of the local segment that corresponds to global_segment are
scanned under the array mutex for a reserved slot; the thread is woken by
setting the wait event of the global segment. */
3810
os_aio_simulated_wake_handler_thread(
3811
/*=================================*/
3812
ulint global_segment) /*!< in: the number of the segment in the aio
3815
os_aio_array_t* array;
3816
os_aio_slot_t* slot;
3821
ut_ad(!srv_use_native_aio);
3823
segment = os_aio_get_array_and_local_segment(&array, global_segment);
3825
n = array->n_slots / array->n_segments;
3827
/* Look through n slots after the segment * n'th slot */
3829
os_mutex_enter(array->mutex);
3831
for (i = 0; i < n; i++) {
3832
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3834
if (slot->reserved) {
3835
/* Found an i/o request */
3841
os_mutex_exit(array->mutex);
3844
/* Signal the handler thread that waits on this global segment */
os_event_set(os_aio_segment_wait_events[global_segment]);
3848
/**********************************************************************//**
3849
Wakes up simulated aio i/o-handler threads if they have something to do.
Does nothing when native aio is in use. Otherwise the sleep recommendation
for read threads is cleared and os_aio_simulated_wake_handler_thread() is
called for every global segment. */
3852
os_aio_simulated_wake_handler_threads(void)
3853
/*=======================================*/
3857
if (srv_use_native_aio) {
3858
/* We do not use simulated aio: do nothing */
3863
os_aio_recommend_sleep_for_read_threads = FALSE;
3865
for (i = 0; i < os_aio_n_segments; i++) {
3866
os_aio_simulated_wake_handler_thread(i);
3870
/**********************************************************************//**
3871
This function can be called if one wants to post a batch of reads and
3872
prefers an i/o-handler thread to handle them all at once later. You must
3873
call os_aio_simulated_wake_handler_threads later to ensure the threads
3874
are not left sleeping!
Does nothing when native aio is in use. Otherwise the sleep recommendation
for read threads is set and the wait events of the global segments that map
to the read array are reset. */
3877
os_aio_simulated_put_read_threads_to_sleep(void)
3878
/*============================================*/
3881
/* The idea of putting background IO threads to sleep is only for
3882
Windows when using simulated AIO. Windows XP seems to schedule
3883
background threads too eagerly to allow for coalescing during
3884
readahead requests. */
3886
os_aio_array_t* array;
3889
if (srv_use_native_aio) {
3890
/* We do not use simulated aio: do nothing */
3895
os_aio_recommend_sleep_for_read_threads = TRUE;
3897
for (g = 0; g < os_aio_n_segments; g++) {
3898
/* Only the array is needed here; the returned local segment
number is ignored */
os_aio_get_array_and_local_segment(&array, g);
3900
if (array == os_aio_read_array) {
3902
os_event_reset(os_aio_segment_wait_events[g]);
3905
#endif /* __WIN__ */
3908
#if defined(LINUX_NATIVE_AIO)
3909
/*******************************************************************//**
3910
Dispatch an AIO request to the kernel.
The iocb embedded in the slot is submitted with io_submit() to the
io_context of the segment that contains the slot.
3911
@return TRUE on success. */
3914
os_aio_linux_dispatch(
3915
/*==================*/
3916
os_aio_array_t* array, /*!< in: io request array. */
3917
os_aio_slot_t* slot) /*!< in: an already reserved slot. */
3923
ut_ad(slot != NULL);
3926
ut_a(slot->reserved);
3928
/* Find out what we are going to work with.
3929
The iocb struct is directly in the slot.
3930
The io_context is one per segment. */
3932
iocb = &slot->control;
3933
/* Same value as slot->pos divided by the slots per segment when
n_slots is a multiple of n_segments */
io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
3935
/* Submit exactly one request */
ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
3937
#if defined(UNIV_AIO_DEBUG)
3939
"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
3940
(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
3941
array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
3944
/* io_submit returns number of successfully
3945
queued requests or -errno. */
3946
if (UNIV_UNLIKELY(ret != 1)) {
3953
#endif /* LINUX_NATIVE_AIO */
3956
/*******************************************************************//**
3957
NOTE! Use the corresponding macro os_aio(), not directly this function!
3958
Requests an asynchronous i/o operation.
The OS_AIO_SIMULATED_WAKE_LATER flag is split off from mode first. The
remaining mode selects the aio array (normal read/write, ibuf, log or
sync); a slot is reserved in it and the request is then issued through
Windows async i/o, Linux native AIO or by waking a simulated aio handler
thread.
3959
@return TRUE if request was queued successfully, FALSE if fail */
3964
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3965
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
3966
to OS_AIO_SIMULATED_WAKE_LATER: the
3967
last flag advises this function not to wake
3968
i/o-handler threads, but the caller will
3969
do the waking explicitly later, in this
3970
way the caller can post several requests in
3971
a batch; NOTE that the batch must not be
3972
so big that it exhausts the slots in aio
3973
arrays! NOTE that a simulated batch
3974
may introduce hidden chances of deadlocks,
3975
because i/os are not actually handled until
3976
all have been posted: use with great
3978
const char* name, /*!< in: name of the file or path as a
3979
null-terminated string */
3980
os_file_t file, /*!< in: handle to a file */
3981
void* buf, /*!< in: buffer where to read or from which
3983
ulint offset, /*!< in: least significant 32 bits of file
3984
offset where to read or write */
3985
ulint offset_high, /*!< in: most significant 32 bits of
3987
ulint n, /*!< in: number of bytes to read or write */
3988
fil_node_t* message1,/*!< in: message for the aio handler
3989
(can be used to identify a completed
3990
aio operation); ignored if mode is
3992
void* message2)/*!< in: message for the aio handler
3993
(can be used to identify a completed
3994
aio operation); ignored if mode is
3997
os_aio_array_t* array;
3998
os_aio_slot_t* slot;
4002
DWORD len = (DWORD) n;
4003
struct fil_node_struct * dummy_mess1;
4006
#endif /* WIN_ASYNC_IO */
4007
#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4015
/* Debug checks: n and offset are multiples of OS_FILE_LOG_BLOCK_SIZE
and n fits in 32 bits */
ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
4016
ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
4017
ut_ad(os_aio_validate());
4019
ut_ad((n & 0xFFFFFFFFUL) == n);
4022
/* Split the wake-later flag off from the mode value */
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
4023
mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
4025
if (mode == OS_AIO_SYNC
4027
&& !srv_use_native_aio
4028
#endif /* WIN_ASYNC_IO */
4030
/* This is actually an ordinary synchronous read or write:
4031
no need to use an i/o-handler thread. NOTE that if we use
4032
Windows async i/o, Windows does not allow us to use
4033
ordinary synchronous os_file_read etc. on the same file,
4034
therefore we have built a special mechanism for synchronous
4035
wait in the Windows case. */
4037
if (type == OS_FILE_READ) {
4038
return(os_file_read(file, buf, offset,
4042
ut_a(type == OS_FILE_WRITE);
4044
return(os_file_write(name, file, buf, offset, offset_high, n));
4047
#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4050
/* Select the aio array that matches the mode (and, for normal mode,
the request type) */
if (mode == OS_AIO_NORMAL) {
4051
if (type == OS_FILE_READ) {
4052
array = os_aio_read_array;
4054
array = os_aio_write_array;
4056
} else if (mode == OS_AIO_IBUF) {
4057
ut_ad(type == OS_FILE_READ);
4058
/* Reduce probability of deadlock bugs in connection with ibuf:
4059
do not let the ibuf i/o handler sleep */
4063
array = os_aio_ibuf_array;
4064
} else if (mode == OS_AIO_LOG) {
4066
array = os_aio_log_array;
4067
} else if (mode == OS_AIO_SYNC) {
4068
array = os_aio_sync_array;
4070
#if defined(LINUX_NATIVE_AIO)
4071
/* In Linux native AIO we don't use sync IO array. */
4072
ut_a(!srv_use_native_aio);
4073
#endif /* LINUX_NATIVE_AIO */
4075
array = NULL; /* Eliminate compiler warning */
4079
/* Reserve a slot for the request; this may wait until the array has a
free slot */
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
4080
name, buf, offset, offset_high, n);
4081
if (type == OS_FILE_READ) {
4082
if (srv_use_native_aio) {
4084
os_bytes_read_since_printout += n;
4086
ret = ReadFile(file, buf, (DWORD)n, &len,
4089
#elif defined(LINUX_NATIVE_AIO)
4090
if (!os_aio_linux_dispatch(array, slot)) {
4096
os_aio_simulated_wake_handler_thread(
4097
os_aio_get_segment_no_from_slot(
4101
} else if (type == OS_FILE_WRITE) {
4102
if (srv_use_native_aio) {
4105
ret = WriteFile(file, buf, (DWORD)n, &len,
4108
#elif defined(LINUX_NATIVE_AIO)
4109
if (!os_aio_linux_dispatch(array, slot)) {
4115
os_aio_simulated_wake_handler_thread(
4116
os_aio_get_segment_no_from_slot(
4125
if (srv_use_native_aio) {
4126
if ((ret && len == n)
4127
|| (!ret && GetLastError() == ERROR_IO_PENDING)) {
4128
/* aio was queued successfully! */
4130
if (mode == OS_AIO_SYNC) {
4131
/* We want a synchronous i/o operation on a
4132
file where we also use async i/o: in Windows
4133
we must use the same wait mechanism as for
4136
retval = os_aio_windows_handle(ULINT_UNDEFINED,
4150
#endif /* WIN_ASYNC_IO */
4151
/* aio was queued successfully! */
4154
#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4156
/* Error path: give the slot back before reporting the failure */
os_aio_array_free_slot(array, slot);
4158
retry = os_file_handle_error(name,
4159
type == OS_FILE_READ
4160
? "aio read" : "aio write");
4167
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
4171
/**********************************************************************//**
4172
This function is only used in Windows asynchronous i/o.
4173
Waits for an aio operation to complete. This function is used to wait
4174
for the completed requests. The aio array of pending requests is divided
4175
into segments. The thread specifies which segment or slot it wants to wait
4176
for. NOTE: this function will also take care of freeing the aio slot,
4177
therefore no other thread is allowed to do the freeing!
4178
@return TRUE if the aio operation succeeded */
4181
os_aio_windows_handle(
4182
/*==================*/
4183
ulint segment, /*!< in: the number of the segment in the aio
4184
arrays to wait for; segment 0 is the ibuf
4185
i/o thread, segment 1 the log i/o thread,
4186
then follow the non-ibuf read threads, and as
4187
the last are the non-ibuf write threads; if
4188
this is ULINT_UNDEFINED, then it means that
4189
sync aio is used, and this parameter is
4191
ulint pos, /*!< this parameter is used only in sync aio:
4192
wait for the aio slot at this position */
4193
fil_node_t**message1, /*!< out: the messages passed with the aio
4194
request; note that also in the case where
4195
the aio operation failed, these output
4196
parameters are valid and can be used to
4197
restart the operation, for example */
4199
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4201
/* Keep the caller's segment number: segment itself is overwritten with
the local segment number below */
ulint orig_seg = segment;
4202
os_aio_array_t* array;
4203
os_aio_slot_t* slot;
4211
if (segment == ULINT_UNDEFINED) {
4212
array = os_aio_sync_array;
4215
segment = os_aio_get_array_and_local_segment(&array, segment);
4218
/* NOTE! We only access constant fields in os_aio_array. Therefore
4219
we do not have to acquire the protecting mutex yet */
4221
ut_ad(os_aio_validate());
4222
ut_ad(segment < array->n_segments);
4224
/* n: number of slots in one segment of this array */
n = array->n_slots / array->n_segments;
4226
if (array == os_aio_sync_array) {
4227
/* Sync aio: wait for the one slot at position pos */
WaitForSingleObject(
4228
os_aio_array_get_nth_slot(array, pos)->handle,
4232
srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
4233
/* Wait on the n handles that belong to this segment */
i = WaitForMultipleObjects((DWORD) n,
4234
array->handles + segment * n,
4239
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
4240
os_thread_exit(NULL);
4243
os_mutex_enter(array->mutex);
4245
slot = os_aio_array_get_nth_slot(array, i + segment * n);
4247
ut_a(slot->reserved);
4249
if (orig_seg != ULINT_UNDEFINED) {
4250
srv_set_io_thread_op_info(orig_seg,
4251
"get windows aio return value");
4254
ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
4256
/* Hand the request's messages back to the caller */
*message1 = slot->message1;
4257
*message2 = slot->message2;
4261
/* The request counts as successful only if the full length was
transferred */
if (ret && len == slot->len) {
4264
#ifdef UNIV_DO_FLUSH
4265
if (slot->type == OS_FILE_WRITE
4266
&& !os_do_not_call_flush_at_each_write) {
4267
if (!os_file_flush(slot->file)) {
4271
#endif /* UNIV_DO_FLUSH */
4272
} else if (os_file_handle_error(slot->name, "Windows aio")) {
4280
os_mutex_exit(array->mutex);
4283
/* retry failed read/write operation synchronously.
4284
No need to hold array->mutex. */
4287
/* This read/write does not go through os_file_read
4288
and os_file_write APIs, need to register with
4289
performance schema explicitly here. */
4290
struct PSI_file_locker* locker = NULL;
4291
register_pfs_file_io_begin(locker, slot->file, slot->len,
4292
(slot->type == OS_FILE_WRITE)
4295
__FILE__, __LINE__);
4298
ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
4300
switch (slot->type) {
4302
ret = WriteFile(slot->file, slot->buf,
4303
(DWORD) slot->len, &len,
4308
ret = ReadFile(slot->file, slot->buf,
4309
(DWORD) slot->len, &len,
4318
register_pfs_file_io_end(locker, len);
4321
if (!ret && GetLastError() == ERROR_IO_PENDING) {
4322
/* aio was queued successfully!
4323
We want a synchronous i/o operation on a
4324
file where we also use async i/o: in Windows
4325
we must use the same wait mechanism as for
4328
ret = GetOverlappedResult(slot->file,
4333
ret_val = ret && len == slot->len;
4336
os_aio_array_free_slot(array, slot);
4342
#if defined(LINUX_NATIVE_AIO)
4343
/******************************************************************//**
Collects completed Linux native AIO requests for one segment of an aio
array. This function is only used in Linux native asynchronous i/o and is
called from within the io-thread. If there are no completed IO requests
in the slot array, the thread calls this function to collect more
requests from the kernel.
The io-thread waits on io_getevents(), which is a blocking call, with
a timeout value. Unless the system is very heavily loaded, keeping the
io-thread very busy, the io-thread will spend most of its time waiting
in this function.
The io-thread also exits in this function. It checks server status at
each wakeup and that is why we use timed wait in io_getevents().
For every event returned, the slot found through the event's iocb is
updated under array->mutex: n_bytes and ret are copied from the event and
io_already_done is set to TRUE. Errors of individual requests are left to
the caller.
NOTE(review): this listing does not show the return type, the braces, the
declarations of i, ret, start_pos and end_pos, or the statements guarded
by the -EAGAIN and -EINTR checks; confirm against the complete file. */
4356
os_aio_linux_collect(
4357
/*=================*/
4358
os_aio_array_t* array, /*!< in/out: slot array. */
4359
ulint segment, /*!< in: local segment no. */
4360
ulint seg_size) /*!< in: segment size. */
4366
struct timespec timeout;
4367
struct io_event* events;
4368
struct io_context* io_ctx;
4370
/* sanity checks. */
4371
ut_ad(array != NULL);
4372
ut_ad(seg_size > 0);
4373
ut_ad(segment < array->n_segments);
4375
/* Which part of event array we are going to work on. */
4376
events = &array->aio_events[segment * seg_size];
4378
/* Which io_context we are going to use. */
4379
io_ctx = array->aio_ctx[segment];
4381
/* Starting point of the segment we will be working on. */
4382
start_pos = segment * seg_size;
/* One past the last slot position of this segment; used below as the
exclusive upper bound in the slot->pos sanity check. */
4385
end_pos = start_pos + seg_size;
4389
/* Go down if we are in shutdown mode.
4390
In case of srv_fast_shutdown == 2, there may be pending
4391
IO requests but that should be OK as we essentially treat
4392
that as a crash of InnoDB. */
4393
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
4394
os_thread_exit(NULL);
4397
/* Initialize the events. The timeout value is arbitrary.
4398
We probably need to experiment with it a little. */
4399
memset(events, 0, sizeof(*events) * seg_size);
4401
timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
/* Ask this segment's context for at least 1 and at most seg_size
completed events, waiting no longer than the timeout. */
4403
ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4405
/* This error handling is for any error in collecting the
4406
IO requests. The errors, if any, for any particular IO
4407
request are simply passed on to the calling routine. */
4409
/* Not enough resources! Try again. */
4410
if (ret == -EAGAIN) {
4414
/* Interrupted! I have tested the behaviour in case of an
4415
interrupt. If we have some completed IOs available then
4416
the return code will be the number of IOs. We get EINTR only
4417
if there are no completed IOs and we have been interrupted. */
4418
if (ret == -EINTR) {
4422
/* No pending request! Go back and check again. */
4427
/* All other errors! should cause a trap for now. */
4428
if (UNIV_UNLIKELY(ret < 0)) {
4429
ut_print_timestamp(stderr);
4431
" InnoDB: unexpected ret_code[%d] from"
4432
" io_getevents()!\n", ret);
/* ret is the number of events filled in; walk each of them. */
4438
for (i = 0; i < ret; i++) {
4439
os_aio_slot_t* slot;
4440
struct iocb* control;
4442
control = (struct iocb *)events[i].obj;
4443
ut_a(control != NULL);
/* The iocb's data pointer is read back as the slot that owns the
request. */
4445
slot = (os_aio_slot_t *) control->data;
4447
/* Some sanity checks. */
4449
ut_a(slot->reserved);
4451
#if defined(UNIV_AIO_DEBUG)
4453
"io_getevents[%c]: slot[%p] ctx[%p]"
4455
(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4456
slot, io_ctx, segment);
4459
/* We are not scribbling previous segment. */
4460
ut_a(slot->pos >= start_pos);
4462
/* We have not overstepped to next segment. */
4463
ut_a(slot->pos < end_pos);
4465
/* Mark this request as completed. The error handling
4466
will be done in the calling function. */
4467
os_mutex_enter(array->mutex);
4468
slot->n_bytes = events[i].res;
4469
slot->ret = events[i].res2;
4470
slot->io_already_done = TRUE;
4471
os_mutex_exit(array->mutex);
4477
/**********************************************************************//**
4478
This function is only used in Linux native asynchronous i/o.
4479
Waits for an aio operation to complete. This function is used to wait for
4480
the completed requests. The aio array of pending requests is divided
4481
into segments. The thread specifies which segment or slot it wants to wait
4482
for. NOTE: this function will also take care of freeing the aio slot,
4483
therefore no other thread is allowed to do the freeing!
4484
@return TRUE if the IO was successful */
4487
os_aio_linux_handle(
4488
/*================*/
4489
ulint global_seg, /*!< in: segment number in the aio array
4490
to wait for; segment 0 is the ibuf
4491
i/o thread, segment 1 is log i/o thread,
4492
then follow the non-ibuf read threads,
4493
and the last are the non-ibuf write
4495
fil_node_t**message1, /*!< out: the messages passed with the */
4496
void** message2, /*!< aio request; note that in case the
4497
aio operation failed, these output
4498
parameters are valid and can be used to
4499
restart the operation. */
4500
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4503
os_aio_array_t* array;
4504
os_aio_slot_t* slot;
4509
/* Should never be doing Sync IO here. */
4510
ut_a(global_seg != ULINT_UNDEFINED);
4512
/* Find the array and the local segment. */
4513
segment = os_aio_get_array_and_local_segment(&array, global_seg);
4514
n = array->n_slots / array->n_segments;
4516
/* Loop until we have found a completed request. */
4518
os_mutex_enter(array->mutex);
4519
for (i = 0; i < n; ++i) {
4520
slot = os_aio_array_get_nth_slot(
4521
array, i + segment * n);
4522
if (slot->reserved && slot->io_already_done) {
4523
/* Something for us to work on. */
4528
os_mutex_exit(array->mutex);
4530
/* We don't have any completed request.
4531
Wait for some request. Note that we return
4532
from wait iff we have found a request. */
4534
srv_set_io_thread_op_info(global_seg,
4535
"waiting for completed aio requests");
4536
os_aio_linux_collect(array, segment, n);
4540
/* Note that it may be that there are more then one completed
4541
IO requests. We process them one at a time. We may have a case
4542
here to improve the performance slightly by dealing with all
4543
requests in one sweep. */
4544
srv_set_io_thread_op_info(global_seg,
4545
"processing completed aio requests");
4547
/* Ensure that we are scribbling only our segment. */
4550
ut_ad(slot != NULL);
4551
ut_ad(slot->reserved);
4552
ut_ad(slot->io_already_done);
4554
*message1 = slot->message1;
4555
*message2 = slot->message2;
4559
if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
4562
#ifdef UNIV_DO_FLUSH
4563
if (slot->type == OS_FILE_WRITE
4564
&& !os_do_not_call_flush_at_each_write)
4565
&& !os_file_flush(slot->file) {
4568
#endif /* UNIV_DO_FLUSH */
4572
/* os_file_handle_error does tell us if we should retry
4573
this IO. As it stands now, we don't do this retry when
4574
reaping requests from a different context than
4575
the dispatcher. This non-retry logic is the same for
4576
windows and linux native AIO.
4577
We should probably look into this to transparently
4578
re-submit the IO. */
4579
os_file_handle_error(slot->name, "Linux aio");
4584
os_mutex_exit(array->mutex);
4586
os_aio_array_free_slot(array, slot);
4590
#endif /* LINUX_NATIVE_AIO */
4592
/**********************************************************************//**
Does simulated aio. This function should be called by an i/o-handler
thread.
Outline, as far as it is visible in this listing: the caller's segment of
the aio array is scanned under array->mutex. A reserved slot whose i/o is
already marked done is handled first. Otherwise a request that is at least
2 seconds old is preferred (oldest first, lowest offset on ties), and
failing that the request at the lowest offset. Requests that directly
follow the chosen one (same file, type and offset_high, adjoining offsets)
are gathered into consecutive_ios, bounded by OS_AIO_MERGE_N_CONSECUTIVE.
The gathered requests are performed as one os_file_write() or
os_file_read() call while the mutex is released, every gathered slot is
then marked io_already_done, and the messages of the first slot are
returned before that slot is freed. When nothing is pending the thread
resets and waits on os_aio_segment_wait_events[global_segment].
NOTE(review): labels, goto targets, braces, else branches and several
declarations (for example segment, n, i, ret, total_len, offs, age,
biggest_age, combined_buf) are not visible in this listing; confirm the
exact control flow against the complete file.
@return TRUE if the aio operation succeeded */
4598
os_aio_simulated_handle(
4599
/*====================*/
4600
ulint global_segment, /*!< in: the number of the segment in the aio
4601
arrays to wait for; segment 0 is the ibuf
4602
i/o thread, segment 1 the log i/o thread,
4603
then follow the non-ibuf read threads, and as
4604
the last are the non-ibuf write threads */
4605
fil_node_t**message1, /*!< out: the messages passed with the aio
4606
request; note that also in the case where
4607
the aio operation failed, these output
4608
parameters are valid and can be used to
4609
restart the operation, for example */
4611
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4613
os_aio_array_t* array;
4615
os_aio_slot_t* slot;
4616
os_aio_slot_t* slot2;
4617
os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
4618
ulint n_consecutive;
4621
ulint lowest_offset;
4625
byte* combined_buf2;
4630
/* Fix compiler warning */
4631
*consecutive_ios = NULL;
4633
memset(consecutive_ios, 0, sizeof(os_aio_slot_t*) * OS_AIO_MERGE_N_CONSECUTIVE);
4634
segment = os_aio_get_array_and_local_segment(&array, global_segment);
4637
/* NOTE! We only access constant fields in os_aio_array. Therefore
4638
we do not have to acquire the protecting mutex yet */
4640
srv_set_io_thread_op_info(global_segment,
4641
"looking for i/o requests (a)");
4642
ut_ad(os_aio_validate());
4643
ut_ad(segment < array->n_segments);
/* Number of slots that belong to each segment of this array. */
4645
n = array->n_slots / array->n_segments;
4647
/* Look through n slots after the segment * n'th slot */
4649
if (array == os_aio_read_array
4650
&& os_aio_recommend_sleep_for_read_threads) {
4652
/* Give other threads chance to add several i/os to the array
4655
goto recommended_sleep;
4658
os_mutex_enter(array->mutex);
4660
srv_set_io_thread_op_info(global_segment,
4661
"looking for i/o requests (b)");
4663
/* Check if there is a slot for which the i/o has already been
4666
for (i = 0; i < n; i++) {
4667
slot = os_aio_array_get_nth_slot(array, i + segment * n);
4669
if (slot->reserved && slot->io_already_done) {
4671
if (os_aio_print_debug) {
4673
"InnoDB: i/o for slot %lu"
4674
" already done, returning\n",
4686
/* If there are at least 2 seconds old requests, then pick the oldest
4687
one to prevent starvation. If several requests have the same age,
4688
then pick the one at the lowest offset. */
4691
lowest_offset = ULINT_MAX;
4693
for (i = 0; i < n; i++) {
4694
slot = os_aio_array_get_nth_slot(array, i + segment * n);
4696
if (slot->reserved) {
/* Whole seconds elapsed since the slot's reservation_time. */
4697
age = (ulint)difftime(time(NULL),
4698
slot->reservation_time);
4700
if ((age >= 2 && age > biggest_age)
4701
|| (age >= 2 && age == biggest_age
4702
&& slot->offset < lowest_offset)) {
4704
/* Found an i/o request */
4705
consecutive_ios[0] = slot;
4710
lowest_offset = slot->offset;
4715
if (n_consecutive == 0) {
4716
/* There were no old requests. Look for an i/o request at the
4717
lowest offset in the array (we ignore the high 32 bits of the
4718
offset in these heuristics) */
4720
lowest_offset = ULINT_MAX;
4722
for (i = 0; i < n; i++) {
4723
slot = os_aio_array_get_nth_slot(array,
4726
if (slot->reserved && slot->offset < lowest_offset) {
4728
/* Found an i/o request */
4729
consecutive_ios[0] = slot;
4733
lowest_offset = slot->offset;
4738
if (n_consecutive == 0) {
4740
/* No i/o requested at the moment */
4745
/* if n_consecutive != 0, then we have assigned
4746
something valid to consecutive_ios[0] */
4747
ut_ad(n_consecutive != 0);
4748
ut_ad(consecutive_ios[0] != NULL);
4750
slot = consecutive_ios[0];
4752
/* Check if there are several consecutive blocks to read or write */
4755
for (i = 0; i < n; i++) {
4756
slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
/* slot2 qualifies when it starts exactly where slot ends, in the same
file, with the same i/o type and the same high offset word. */
4758
if (slot2->reserved && slot2 != slot
4759
&& slot2->offset == slot->offset + slot->len
4760
/* check that sum does not wrap over */
4761
&& slot->offset + slot->len > slot->offset
4762
&& slot2->offset_high == slot->offset_high
4763
&& slot2->type == slot->type
4764
&& slot2->file == slot->file) {
4766
/* Found a consecutive i/o request */
4768
consecutive_ios[n_consecutive] = slot2;
4773
if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
4775
goto consecutive_loop;
4782
srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
4784
/* We have now collected n_consecutive i/o requests in the array;
4785
allocate a single buffer which can hold all data, and perform the
4789
slot = consecutive_ios[0];
4791
for (i = 0; i < n_consecutive; i++) {
4792
total_len += consecutive_ios[i]->len;
4795
if (n_consecutive == 1) {
4796
/* We can use the buffer of the i/o request */
4797
combined_buf = slot->buf;
4798
combined_buf2 = NULL;
/* One extra page is allocated so that the start of the working buffer
can be aligned to UNIV_PAGE_SIZE just below. */
4800
combined_buf2 = static_cast<unsigned char *>(ut_malloc(total_len + UNIV_PAGE_SIZE));
4802
ut_a(combined_buf2);
4804
combined_buf = static_cast<unsigned char *>(ut_align(combined_buf2, UNIV_PAGE_SIZE));
4807
/* We release the array mutex for the time of the i/o: NOTE that
4808
this assumes that there is just one i/o-handler thread serving
4809
a single segment of slots! */
4811
os_mutex_exit(array->mutex);
4813
if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
4814
/* Copy the buffers to the combined buffer */
4817
for (i = 0; i < n_consecutive; i++) {
4819
ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
4820
consecutive_ios[i]->len);
4821
offs += consecutive_ios[i]->len;
4825
srv_set_io_thread_op_info(global_segment, "doing file i/o");
4827
if (os_aio_print_debug) {
4829
"InnoDB: doing i/o of type %lu at offset %lu %lu,"
4831
(ulong) slot->type, (ulong) slot->offset_high,
4832
(ulong) slot->offset, (ulong) total_len);
4835
/* Do the i/o with ordinary, synchronous i/o functions: */
4836
if (slot->type == OS_FILE_WRITE) {
4837
ret = os_file_write(slot->name, slot->file, combined_buf,
4838
slot->offset, slot->offset_high,
4841
ret = os_file_read(slot->file, combined_buf,
4842
slot->offset, slot->offset_high, total_len);
4846
srv_set_io_thread_op_info(global_segment, "file i/o done");
4850
"aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
4851
n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
4854
if (slot->type == OS_FILE_READ && n_consecutive > 1) {
4855
/* Copy the combined buffer to individual buffers */
4858
for (i = 0; i < n_consecutive; i++) {
4860
ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
4861
consecutive_ios[i]->len);
4862
offs += consecutive_ios[i]->len;
/* combined_buf2 is non-NULL only when a separate merge buffer was
allocated above. */
4866
if (combined_buf2) {
4867
ut_free(combined_buf2);
4870
os_mutex_enter(array->mutex);
4872
/* Mark the i/os done in slots */
4874
for (i = 0; i < n_consecutive; i++) {
4875
consecutive_ios[i]->io_already_done = TRUE;
4878
/* We return the messages for the first slot now, and if there were
4879
several slots, the messages will be returned with subsequent calls
4884
ut_a(slot->reserved);
4886
*message1 = slot->message1;
4887
*message2 = slot->message2;
4891
os_mutex_exit(array->mutex);
4893
os_aio_array_free_slot(array, slot);
4898
srv_set_io_thread_op_info(global_segment, "resetting wait event");
4900
/* We wait here until there again can be i/os in the segment
4903
os_event_reset(os_aio_segment_wait_events[global_segment]);
4905
os_mutex_exit(array->mutex);
4908
srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
4910
os_event_wait(os_aio_segment_wait_events[global_segment]);
4912
if (os_aio_print_debug) {
4914
"InnoDB: i/o handler thread for i/o"
4915
" segment %lu wakes up\n",
4916
(ulong) global_segment);
4922
/**********************************************************************//**
Validates the consistency of an aio array.
While holding array->mutex, asserts that the array has at least one slot
and one segment, that every reserved slot has a non-zero length, and that
array->n_reserved equals the local n_reserved counter.
NOTE(review): the statement that increments n_reserved for each reserved
slot, the declaration of i, the braces and the return statement are not
visible in this listing; confirm against the complete file.
@return TRUE if ok */
4927
os_aio_array_validate(
4928
/*==================*/
4929
os_aio_array_t* array) /*!< in: aio wait array */
4931
os_aio_slot_t* slot;
4932
ulint n_reserved = 0;
4937
os_mutex_enter(array->mutex);
4939
ut_a(array->n_slots > 0);
4940
ut_a(array->n_segments > 0);
/* Visit every slot of the array, not just one segment. */
4942
for (i = 0; i < array->n_slots; i++) {
4943
slot = os_aio_array_get_nth_slot(array, i);
4945
if (slot->reserved) {
4947
ut_a(slot->len > 0);
/* The array's own count of reserved slots must match the local counter. */
4951
ut_a(array->n_reserved == n_reserved);
4953
os_mutex_exit(array->mutex);
4958
/**********************************************************************//**
Validates the consistency of the aio system by running
os_aio_array_validate() on each of the five aio arrays: read, write,
ibuf, log and sync.
NOTE(review): the return type, braces and return statement are not
visible in this listing.
@return TRUE if ok */
4963
os_aio_validate(void)
4964
/*=================*/
4966
os_aio_array_validate(os_aio_read_array);
4967
os_aio_array_validate(os_aio_write_array);
4968
os_aio_array_validate(os_aio_ibuf_array);
4969
os_aio_array_validate(os_aio_log_array);
4970
os_aio_array_validate(os_aio_sync_array);
4975
/**********************************************************************//**
Prints pending IO requests per segment of an aio array.
We probably don't need per segment statistics but they can help us
during development phase to see if the IO requests are being
distributed as expected.
The counts in n_seg are written to file as a bracketed list, one entry per
segment of the array, separated by ", ".
NOTE(review): the body of the n_segments == 1 branch, the condition that
guards the separator, the declaration of i and the braces are not visible
in this listing; presumably nothing is printed for a single-segment
array. Confirm against the complete file. */
4982
os_aio_print_segment_info(
4983
/*======================*/
4984
FILE* file, /*!< in: file where to print */
4985
ulint* n_seg, /*!< in: pending IO array */
4986
os_aio_array_t* array) /*!< in: array to process */
4992
ut_ad(array->n_segments > 0);
4994
if (array->n_segments == 1) {
4998
fprintf(file, " [");
/* n_seg is indexed by segment number and is read for
array->n_segments entries. */
4999
for (i = 0; i < array->n_segments; i++) {
5001
fprintf(file, ", ");
5004
fprintf(file, "%lu", n_seg[i]);
5006
fprintf(file, "] ");
5009
/**********************************************************************//**
Prints info of the aio arrays.
The output written to file consists of: one line per i/o thread with its
current operation info and function (plus " ev set" when the thread's
wait event is set); the number of reserved slots in the read, write, ibuf,
log and sync arrays in turn, each counted under that array's mutex and
followed by the per-segment breakdown; the pending flush counters and the
total read, write and fsync counters; pending preads and pwrites when
either is non-zero; and per-second rates since the previous printout.
The "old" counters and os_last_printout are refreshed at the end so that
the next call reports a new interval.
NOTE(review): the line carrying the function name is not visible in this
listing (presumably os_aio_print), nor are the loop or goto that moves
from one array to the next, the fprintf() call lines, the divisions by
time_elapsed and several declarations; confirm against the complete
file. */
5015
FILE* file) /*!< in: file where to print */
5017
os_aio_array_t* array;
5018
os_aio_slot_t* slot;
5020
ulint n_res_seg[SRV_MAX_N_IO_THREADS];
5021
time_t current_time;
5022
double time_elapsed;
5023
double avg_bytes_read;
5026
for (i = 0; i < srv_n_file_io_threads; i++) {
5027
fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
5028
srv_io_thread_op_info[i],
5029
srv_io_thread_function[i]);
5032
if (os_aio_segment_wait_events[i]->is_set) {
5033
fprintf(file, " ev set");
5037
fprintf(file, "\n");
5040
fputs("Pending normal aio reads:", file);
5042
array = os_aio_read_array;
5046
os_mutex_enter(array->mutex);
5048
ut_a(array->n_slots > 0);
5049
ut_a(array->n_segments > 0);
/* Reset the per-segment counters before counting this array. */
5053
memset(n_res_seg, 0x0, sizeof(n_res_seg));
5055
for (i = 0; i < array->n_slots; i++) {
5058
slot = os_aio_array_get_nth_slot(array, i);
/* Map the slot index to the number of the segment that contains it. */
5060
seg_no = (i * array->n_segments) / array->n_slots;
5061
if (slot->reserved) {
5063
n_res_seg[seg_no]++;
5065
fprintf(stderr, "Reserved slot, messages %p %p\n",
5066
(void*) slot->message1,
5067
(void*) slot->message2);
5069
ut_a(slot->len > 0);
5073
ut_a(array->n_reserved == n_reserved);
5075
fprintf(file, " %lu", (ulong) n_reserved);
5077
os_aio_print_segment_info(file, n_res_seg, array);
5079
os_mutex_exit(array->mutex);
/* Step to the next array in the order read, write, ibuf, log, sync,
printing the heading of the next section. */
5081
if (array == os_aio_read_array) {
5082
fputs(", aio writes:", file);
5084
array = os_aio_write_array;
5089
if (array == os_aio_write_array) {
5090
fputs(",\n ibuf aio reads:", file);
5091
array = os_aio_ibuf_array;
5096
if (array == os_aio_ibuf_array) {
5097
fputs(", log i/o's:", file);
5098
array = os_aio_log_array;
5103
if (array == os_aio_log_array) {
5104
fputs(", sync i/o's:", file);
5105
array = os_aio_sync_array;
5111
current_time = time(NULL);
/* NOTE(review): the 0.001 term presumably keeps time_elapsed non-zero
for the per-second divisions; the dividing lines are not visible here. */
5112
time_elapsed = 0.001 + difftime(current_time, os_last_printout);
5115
"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
5116
"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
5117
(ulong) fil_n_pending_log_flushes,
5118
(ulong) fil_n_pending_tablespace_flushes,
5119
(ulong) os_n_file_reads, (ulong) os_n_file_writes,
5120
(ulong) os_n_fsyncs);
5122
if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
5124
"%lu pending preads, %lu pending pwrites\n",
5125
(ulong) os_file_n_pending_preads,
5126
(ulong) os_file_n_pending_pwrites);
/* Avoid dividing by zero when no reads happened since the last
printout. */
5129
if (os_n_file_reads == os_n_file_reads_old) {
5130
avg_bytes_read = 0.0;
5132
avg_bytes_read = (double) os_bytes_read_since_printout
5133
/ (os_n_file_reads - os_n_file_reads_old);
5137
"%.2f reads/s, %lu avg bytes/read,"
5138
" %.2f writes/s, %.2f fsyncs/s\n",
5139
(os_n_file_reads - os_n_file_reads_old)
5141
(ulong)avg_bytes_read,
5142
(os_n_file_writes - os_n_file_writes_old)
5144
(os_n_fsyncs - os_n_fsyncs_old)
/* Remember the current totals as the baseline for the next printout. */
5147
os_n_file_reads_old = os_n_file_reads;
5148
os_n_file_writes_old = os_n_file_writes;
5149
os_n_fsyncs_old = os_n_fsyncs;
5150
os_bytes_read_since_printout = 0;
5152
os_last_printout = current_time;
5155
/**********************************************************************//**
Refreshes the statistics used to print per-second averages.
Copies the current read, write and fsync totals into their "old"
counterparts, clears os_bytes_read_since_printout and sets
os_last_printout to the current time, so that a new measurement interval
starts now.
NOTE(review): the return type and braces are not visible in this
listing. */
5159
os_aio_refresh_stats(void)
5160
/*======================*/
5162
os_n_file_reads_old = os_n_file_reads;
5163
os_n_file_writes_old = os_n_file_writes;
5164
os_n_fsyncs_old = os_n_fsyncs;
5165
os_bytes_read_since_printout = 0;
5167
os_last_printout = time(NULL);
5171
/**********************************************************************//**
Checks that all slots in the system have been freed, that is, there are
no pending io operations.
Adds up n_reserved of the read, write, ibuf, log and sync arrays, reading
each value while holding that array's mutex. The mutexes are taken one at
a time, never together.
NOTE(review): the declaration of n_res, the final comparison and the
return statements are not visible in this listing; the closing
UNIV_DEBUG #endif right after the function suggests it is compiled only
in debug builds. Confirm against the complete file.
@return TRUE if all free */
5177
os_aio_all_slots_free(void)
5178
/*=======================*/
5180
os_aio_array_t* array;
5183
array = os_aio_read_array;
5185
os_mutex_enter(array->mutex);
5187
n_res += array->n_reserved;
5189
os_mutex_exit(array->mutex);
5191
array = os_aio_write_array;
5193
os_mutex_enter(array->mutex);
5195
n_res += array->n_reserved;
5197
os_mutex_exit(array->mutex);
5199
array = os_aio_ibuf_array;
5201
os_mutex_enter(array->mutex);
5203
n_res += array->n_reserved;
5205
os_mutex_exit(array->mutex);
5207
array = os_aio_log_array;
5209
os_mutex_enter(array->mutex);
5211
n_res += array->n_reserved;
5213
os_mutex_exit(array->mutex);
5215
array = os_aio_sync_array;
5217
os_mutex_enter(array->mutex);
5219
n_res += array->n_reserved;
5221
os_mutex_exit(array->mutex);
5230
#endif /* UNIV_DEBUG */
5232
#endif /* !UNIV_HOTBACKUP */