1
/******************************************************
2
The interface to the operating system file i/o primitives
6
Created 10/21/1995 Heikki Tuuri
7
*******************************************************/
11
#include "os0thread.h"
14
#include "srv0start.h"
18
#if defined(UNIV_HOTBACKUP) && defined(__WIN__)
19
/* Add includes for the _stat() call to compile on Windows */
20
#include <sys/types.h>
23
#endif /* UNIV_HOTBACKUP */
26
/* We assume in this case that the OS has standard Posix aio (at least SunOS
27
2.6, HP-UX 11i and AIX 4.3 have) */
31
/* This specifies the file permissions InnoDB uses when it creates files in
32
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
36
UNIV_INTERN ulint os_innodb_umask
37
= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
39
UNIV_INTERN ulint os_innodb_umask = 0;
43
/* If the following is set to TRUE, we do not call os_file_flush in every
44
os_file_write. We can set this TRUE when the doublewrite buffer is used. */
45
UNIV_INTERN ibool os_do_not_call_flush_at_each_write = FALSE;
47
/* We do not call os_file_flush in every os_file_write. */
48
#endif /* UNIV_DO_FLUSH */
50
/* We use these mutexes to protect lseek + file i/o operation, if the
51
OS does not provide an atomic pread or pwrite, or similar.
OS_FILE_N_SEEK_MUTEXES is the size of the os_file_seek_mutexes array;
os_io_init_simple() fills every element with os_mutex_create(NULL). */
52
#define OS_FILE_N_SEEK_MUTEXES 16
53
UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
55
/* In simulated aio, merge at most this many consecutive i/os */
56
#define OS_AIO_MERGE_N_CONSECUTIVE 64
58
/* If this flag is TRUE, then we will use the native aio of the
59
OS (provided we compiled Innobase with it in), otherwise we will
60
use simulated aio we build below with threads. It starts out FALSE.
In the Windows file-open code further down, FILE_FLAG_OVERLAPPED is
added to the CreateFile attributes only when the purpose is OS_FILE_AIO
and this flag is TRUE. */
62
UNIV_INTERN ibool os_aio_use_native_aio = FALSE;
64
/* Starts out FALSE. NOTE(review): presumably switches on extra aio
diagnostic output; no use of it is visible in this part of the file,
so confirm before relying on that. */
UNIV_INTERN ibool os_aio_print_debug = FALSE;
66
/* The aio array slot structure */
67
typedef struct os_aio_slot_struct os_aio_slot_t;
69
struct os_aio_slot_struct{
70
ibool is_read; /* TRUE if a read operation */
71
ulint pos; /* index of the slot in the aio
73
ibool reserved; /* TRUE if this slot is reserved */
74
time_t reservation_time;/* time when reserved */
75
ulint len; /* length of the block to read or
77
byte* buf; /* buffer used in i/o */
78
ulint type; /* OS_FILE_READ or OS_FILE_WRITE */
79
ulint offset; /* 32 low bits of file offset in
81
ulint offset_high; /* 32 high bits of file offset */
82
os_file_t file; /* file where to read or write */
83
const char* name; /* file name or path */
84
ibool io_already_done;/* used only in simulated aio:
85
TRUE if the physical i/o already
86
made and only the slot message
87
needs to be passed to the caller
88
of os_aio_simulated_handle */
89
fil_node_t* message1; /* message which is given by the */
90
void* message2; /* the requester of an aio operation
91
and which can be used to identify
92
which pending aio operation was
95
os_event_t event; /* event object we need in the
97
OVERLAPPED control; /* Windows control block for the
99
#elif defined(POSIX_ASYNC_IO)
100
struct aiocb control; /* Posix control block for aio
105
/* The aio array structure */
106
typedef struct os_aio_array_struct os_aio_array_t;
108
struct os_aio_array_struct{
109
os_mutex_t mutex; /* the mutex protecting the aio array */
110
os_event_t not_full; /* The event which is set to the signaled
111
state when there is space in the aio
112
outside the ibuf segment */
113
os_event_t is_empty; /* The event which is set to the signaled
114
state when there are no pending i/os
116
ulint n_slots; /* Total number of slots in the aio array.
117
This must be divisible by n_threads. */
118
ulint n_segments;/* Number of segments in the aio array of
119
pending aio requests. A thread can wait
120
separately for any one of the segments. */
121
ulint n_reserved;/* Number of reserved slots in the
122
aio array outside the ibuf segment */
123
os_aio_slot_t* slots; /* Pointer to the slots in the array */
125
os_native_event_t* native_events;
126
/* Pointer to an array of OS native event
127
handles where we copied the handles from
128
slots, in the same order. This can be used
129
in WaitForMultipleObjects; used only in
134
/* Array of events used in simulated aio */
135
static os_event_t* os_aio_segment_wait_events = NULL;
137
/* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
138
are NULL when the module has not yet been initialized. Five separate
array pointers are declared: read, write, ibuf, log and sync. */
139
static os_aio_array_t* os_aio_read_array = NULL;
140
static os_aio_array_t* os_aio_write_array = NULL;
141
static os_aio_array_t* os_aio_ibuf_array = NULL;
142
static os_aio_array_t* os_aio_log_array = NULL;
143
static os_aio_array_t* os_aio_sync_array = NULL;
145
/* Holds ULINT_UNDEFINED until it is assigned. NOTE(review): presumably
set to the total number of aio segments when the aio arrays are
created; the initialization code is not visible here, so confirm. */
static ulint os_aio_n_segments = ULINT_UNDEFINED;
147
/* If the following is TRUE, read i/o handler threads try to
148
wait until a batch of new read requests has been posted */
149
static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
151
/* Global i/o statistics. Every counter below is explicitly initialized
to zero; os_last_printout has no explicit initializer.
NOTE(review): going by the names, the *_old variables and
os_last_printout look like snapshots taken at the previous statistics
printout so that rates can be computed; the code that updates and
prints them is not visible here, so confirm. */
UNIV_INTERN ulint os_n_file_reads = 0;
152
UNIV_INTERN ulint os_bytes_read_since_printout = 0;
153
UNIV_INTERN ulint os_n_file_writes = 0;
154
UNIV_INTERN ulint os_n_fsyncs = 0;
155
UNIV_INTERN ulint os_n_file_reads_old = 0;
156
UNIV_INTERN ulint os_n_file_writes_old = 0;
157
UNIV_INTERN ulint os_n_fsyncs_old = 0;
158
UNIV_INTERN time_t os_last_printout;
160
/* Checked and then set to TRUE by os_file_handle_error_cond_exit() when
it handles OS_FILE_DISK_FULL, so that the "Disk is full" warning is
printed only once. */
UNIV_INTERN ibool os_has_said_disk_full = FALSE;
162
/* The mutex protecting the following counts of pending I/O operations.
os_io_init_simple() creates it with os_mutex_create(NULL). */
163
static os_mutex_t os_file_count_mutex;
164
/* All four counts start at zero. NOTE(review): judging by the names,
the first pair counts pread/pwrite calls in progress and the second
pair counts pending reads/writes in general; the code that updates
them is not visible here, so confirm. */
UNIV_INTERN ulint os_file_n_pending_preads = 0;
165
UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
166
UNIV_INTERN ulint os_n_pending_writes = 0;
167
UNIV_INTERN ulint os_n_pending_reads = 0;
169
/***************************************************************************
170
Gets the operating system version. Currently works only on Windows. */
173
os_get_os_version(void)
174
/*===================*/
175
/* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
178
OSVERSIONINFO os_info;
180
os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
182
ut_a(GetVersionEx(&os_info));
184
if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
186
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
188
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
189
if (os_info.dwMajorVersion <= 4) {
205
/***************************************************************************
206
Retrieves the last error number if an error occurs in a file io function.
207
The number should be retrieved before any other OS calls (because they may
208
overwrite the error number). If the number is not known to this program,
209
the OS error number + 100 is returned. */
212
os_file_get_last_error(
213
/*===================*/
214
/* out: error number, or OS error
216
ibool report_all_errors) /* in: TRUE if we want an error message
217
printed of all errors */
223
err = (ulint) GetLastError();
225
if (report_all_errors
226
|| (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
228
ut_print_timestamp(stderr);
230
" InnoDB: Operating system error number %lu"
231
" in a file operation.\n", (ulong) err);
233
if (err == ERROR_PATH_NOT_FOUND) {
235
"InnoDB: The error means the system"
236
" cannot find the path specified.\n");
238
if (srv_is_being_started) {
240
"InnoDB: If you are installing InnoDB,"
241
" remember that you must create\n"
242
"InnoDB: directories yourself, InnoDB"
243
" does not create them.\n");
245
} else if (err == ERROR_ACCESS_DENIED) {
247
"InnoDB: The error means mysqld does not have"
248
" the access rights to\n"
249
"InnoDB: the directory. It may also be"
250
" you have created a subdirectory\n"
251
"InnoDB: of the same name as a data file.\n");
252
} else if (err == ERROR_SHARING_VIOLATION
253
|| err == ERROR_LOCK_VIOLATION) {
255
"InnoDB: The error means that another program"
256
" is using InnoDB's files.\n"
257
"InnoDB: This might be a backup or antivirus"
258
" software or another instance\n"
260
" Please close it to get rid of this error.\n");
263
"InnoDB: Some operating system error numbers"
264
" are described at\n"
266
"http://dev.mysql.com/doc/refman/5.1/en/"
267
"operating-system-error-codes.html\n");
273
if (err == ERROR_FILE_NOT_FOUND) {
274
return(OS_FILE_NOT_FOUND);
275
} else if (err == ERROR_DISK_FULL) {
276
return(OS_FILE_DISK_FULL);
277
} else if (err == ERROR_FILE_EXISTS) {
278
return(OS_FILE_ALREADY_EXISTS);
279
} else if (err == ERROR_SHARING_VIOLATION
280
|| err == ERROR_LOCK_VIOLATION) {
281
return(OS_FILE_SHARING_VIOLATION);
288
if (report_all_errors
289
|| (err != ENOSPC && err != EEXIST)) {
291
ut_print_timestamp(stderr);
293
" InnoDB: Operating system error number %lu"
294
" in a file operation.\n", (ulong) err);
298
"InnoDB: The error means the system"
299
" cannot find the path specified.\n");
301
if (srv_is_being_started) {
303
"InnoDB: If you are installing InnoDB,"
304
" remember that you must create\n"
305
"InnoDB: directories yourself, InnoDB"
306
" does not create them.\n");
308
} else if (err == EACCES) {
310
"InnoDB: The error means mysqld does not have"
311
" the access rights to\n"
312
"InnoDB: the directory.\n");
314
if (strerror((int)err) != NULL) {
316
"InnoDB: Error number %lu"
318
err, strerror((int)err));
322
"InnoDB: Some operating system"
323
" error numbers are described at\n"
325
"http://dev.mysql.com/doc/refman/5.1/en/"
326
"operating-system-error-codes.html\n");
333
return(OS_FILE_DISK_FULL);
334
#ifdef POSIX_ASYNC_IO
335
} else if (err == EAGAIN) {
336
return(OS_FILE_AIO_RESOURCES_RESERVED);
338
} else if (err == ENOENT) {
339
return(OS_FILE_NOT_FOUND);
340
} else if (err == EEXIST) {
341
return(OS_FILE_ALREADY_EXISTS);
342
} else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
343
return(OS_FILE_PATH_ERROR);
350
/********************************************************************
351
Does error handling when a file operation fails.
352
Conditionally exits (calling exit(3)) based on should_exit value and the
356
os_file_handle_error_cond_exit(
357
/*===========================*/
358
/* out: TRUE if we should retry the
360
const char* name, /* in: name of a file or NULL */
361
const char* operation, /* in: operation */
362
ibool should_exit) /* in: call exit(3) if unknown error
363
and this parameter is TRUE */
367
err = os_file_get_last_error(FALSE);
369
if (err == OS_FILE_DISK_FULL) {
370
/* We only print a warning about disk full once */
372
if (os_has_said_disk_full) {
378
ut_print_timestamp(stderr);
380
" InnoDB: Encountered a problem with"
384
ut_print_timestamp(stderr);
386
" InnoDB: Disk is full. Try to clean the disk"
387
" to free space.\n");
389
os_has_said_disk_full = TRUE;
394
} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
397
} else if (err == OS_FILE_ALREADY_EXISTS
398
|| err == OS_FILE_PATH_ERROR) {
401
} else if (err == OS_FILE_SHARING_VIOLATION) {
403
os_thread_sleep(10000000); /* 10 sec */
407
fprintf(stderr, "InnoDB: File name %s\n", name);
410
fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
414
fprintf(stderr, "InnoDB: Cannot continue operation.\n");
425
/********************************************************************
426
Does error handling when a file operation fails. */
429
os_file_handle_error(
430
/*=================*/
431
/* out: TRUE if we should retry the
433
const char* name, /* in: name of a file or NULL */
434
const char* operation)/* in: operation */
436
/* exit in case of unknown error */
437
return(os_file_handle_error_cond_exit(name, operation, TRUE));
440
/********************************************************************
441
Does error handling when a file operation fails. */
444
os_file_handle_error_no_exit(
445
/*=========================*/
446
/* out: TRUE if we should retry the
448
const char* name, /* in: name of a file or NULL */
449
const char* operation)/* in: operation */
451
/* don't exit in case of unknown error */
452
return(os_file_handle_error_cond_exit(name, operation, FALSE));
456
#define USE_FILE_LOCK
457
#if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__NETWARE__)
458
/* InnoDB Hot Backup does not lock the data files.
459
* On Windows, mandatory locking is used.
461
# undef USE_FILE_LOCK
464
/********************************************************************
465
Obtain an exclusive lock on a file. */
470
/* out: 0 on success */
471
int fd, /* in: file descriptor */
472
const char* name) /* in: file name */
476
lk.l_whence = SEEK_SET;
477
lk.l_start = lk.l_len = 0;
478
if (fcntl(fd, F_SETLK, &lk) == -1) {
480
"InnoDB: Unable to lock %s, error: %d\n", name, errno);
482
if (errno == EAGAIN || errno == EACCES) {
484
"InnoDB: Check that you do not already have"
485
" another mysqld process\n"
486
"InnoDB: using the same InnoDB data"
495
#endif /* USE_FILE_LOCK */
497
/********************************************************************
498
Creates the seek mutexes used in positioned reads and writes. */
501
os_io_init_simple(void)
502
/*===================*/
506
os_file_count_mutex = os_mutex_create(NULL);
508
for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
509
os_file_seek_mutexes[i] = os_mutex_create(NULL);
513
/***************************************************************************
514
Creates a temporary file. This function is like tmpfile(3), but
515
the temporary file is created in the MySQL temporary directory.
516
On Netware, this function is like tmpfile(3), because the C run-time
517
library of Netware does not expose the delete-on-close flag. */
520
os_file_create_tmpfile(void)
521
/*========================*/
522
/* out: temporary file handle, or NULL on error */
524
#ifdef UNIV_HOTBACKUP
530
FILE* file = tmpfile();
531
# else /* __NETWARE__ */
533
int fd = innobase_mysql_tmpfile();
536
file = fdopen(fd, "w+b");
538
# endif /* __NETWARE__ */
541
ut_print_timestamp(stderr);
543
" InnoDB: Error: unable to create temporary file;"
544
" errno: %d\n", errno);
549
# endif /* !__NETWARE__ */
553
#endif /* UNIV_HOTBACKUP */
556
/***************************************************************************
557
The os_file_opendir() function opens a directory stream corresponding to the
558
directory named by the dirname argument. The directory stream is positioned
559
at the first entry. In both Unix and Windows we automatically skip the '.'
560
and '..' items at the start of the directory listing. */
565
/* out: directory stream, NULL if
567
const char* dirname, /* in: directory name; it must not
568
contain a trailing '\' or '/' */
569
ibool error_is_fatal) /* in: TRUE if we should treat an
570
error as a fatal error; if we try to
571
open symlinks then we do not wish a
572
fatal error if it happens not to be
577
LPWIN32_FIND_DATA lpFindFileData;
578
char path[OS_FILE_MAX_PATH + 3];
580
ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
582
strcpy(path, dirname);
583
strcpy(path + strlen(path), "\\*");
585
/* Note that in Windows opening the 'directory stream' also retrieves
586
the first entry in the directory. Since it is '.', that is no problem,
587
as we will skip over the '.' and '..' entries anyway. */
589
lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
591
dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
593
ut_free(lpFindFileData);
595
if (dir == INVALID_HANDLE_VALUE) {
597
if (error_is_fatal) {
598
os_file_handle_error(dirname, "opendir");
606
dir = opendir(dirname);
608
if (dir == NULL && error_is_fatal) {
609
os_file_handle_error(dirname, "opendir");
616
/***************************************************************************
617
Closes a directory stream. */
622
/* out: 0 if success, -1 if failure */
623
os_file_dir_t dir) /* in: directory stream */
628
ret = FindClose(dir);
631
os_file_handle_error_no_exit(NULL, "closedir");
643
os_file_handle_error_no_exit(NULL, "closedir");
650
/***************************************************************************
651
This function returns information about the next file in the directory. We jump
652
over the '.' and '..' entries in the directory. */
655
os_file_readdir_next_file(
656
/*======================*/
657
/* out: 0 if ok, -1 if error, 1 if at the end
659
const char* dirname,/* in: directory name or path */
660
os_file_dir_t dir, /* in: directory stream */
661
os_file_stat_t* info) /* in/out: buffer where the info is returned */
664
LPWIN32_FIND_DATA lpFindFileData;
667
lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
669
ret = FindNextFile(dir, lpFindFileData);
672
ut_a(strlen((char *) lpFindFileData->cFileName)
675
if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
676
|| strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
681
strcpy(info->name, (char *) lpFindFileData->cFileName);
683
info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
684
+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
687
if (lpFindFileData->dwFileAttributes
688
& FILE_ATTRIBUTE_REPARSE_POINT) {
689
/* TODO: test Windows symlinks */
690
/* TODO: MySQL has apparently its own symlink
691
implementation in Windows, dbname.sym can
692
redirect a database directory:
693
http://dev.mysql.com/doc/refman/5.1/en/
694
windows-symbolic-links.html */
695
info->type = OS_FILE_TYPE_LINK;
696
} else if (lpFindFileData->dwFileAttributes
697
& FILE_ATTRIBUTE_DIRECTORY) {
698
info->type = OS_FILE_TYPE_DIR;
700
/* It is probably safest to assume that all other
701
file types are normal. Better to check them rather
702
than blindly skip them. */
704
info->type = OS_FILE_TYPE_FILE;
708
ut_free(lpFindFileData);
712
} else if (GetLastError() == ERROR_NO_MORE_FILES) {
716
os_file_handle_error_no_exit(dirname,
717
"readdir_next_file");
724
struct stat statinfo;
725
#ifdef HAVE_READDIR_R
726
char dirent_buf[sizeof(struct dirent)
727
+ _POSIX_PATH_MAX + 100];
728
/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
729
the max file name len; but in most standards, the
730
length is NAME_MAX; we add 100 to be even safer */
735
#ifdef HAVE_READDIR_R
736
ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
740
"InnoDB: cannot read directory %s, error %lu\n",
741
dirname, (ulong)ret);
747
/* End of directory */
752
ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
761
ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
763
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
768
strcpy(info->name, ent->d_name);
770
full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
772
sprintf(full_path, "%s/%s", dirname, ent->d_name);
774
ret = stat(full_path, &statinfo);
777
os_file_handle_error_no_exit(full_path, "stat");
784
info->size = (ib_int64_t)statinfo.st_size;
786
if (S_ISDIR(statinfo.st_mode)) {
787
info->type = OS_FILE_TYPE_DIR;
788
} else if (S_ISLNK(statinfo.st_mode)) {
789
info->type = OS_FILE_TYPE_LINK;
790
} else if (S_ISREG(statinfo.st_mode)) {
791
info->type = OS_FILE_TYPE_FILE;
793
info->type = OS_FILE_TYPE_UNKNOWN;
802
/*********************************************************************
803
This function attempts to create a directory named pathname. The new directory
804
gets default permissions. On Unix the permissions are (0770 & ~umask). If the
805
directory exists already, nothing is done and the call succeeds, unless the
806
fail_if_exists argument is true. */
809
os_file_create_directory(
810
/*=====================*/
811
/* out: TRUE if call succeeds,
813
const char* pathname, /* in: directory name as
814
null-terminated string */
815
ibool fail_if_exists) /* in: if TRUE, pre-existing directory
816
is treated as an error. */
821
rcode = CreateDirectory((LPCTSTR) pathname, NULL);
823
|| (GetLastError() == ERROR_ALREADY_EXISTS
824
&& !fail_if_exists))) {
826
os_file_handle_error(pathname, "CreateDirectory");
835
rcode = mkdir(pathname, 0770);
837
if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
839
os_file_handle_error(pathname, "mkdir");
848
/********************************************************************
849
A simple function to open or create a file. */
852
os_file_create_simple(
853
/*==================*/
854
/* out, own: handle to the file, not defined
855
if error, error number can be retrieved with
856
os_file_get_last_error */
857
const char* name, /* in: name of the file or path as a
858
null-terminated string */
859
ulint create_mode,/* in: OS_FILE_OPEN if an existing file is
860
opened (if does not exist, error), or
861
OS_FILE_CREATE if a new file is created
862
(if exists, error), or
863
OS_FILE_CREATE_PATH if new file
864
(if exists, error) and subdirectories along
865
its path are created (if needed)*/
866
ulint access_type,/* in: OS_FILE_READ_ONLY or
867
OS_FILE_READ_WRITE */
868
ibool* success)/* out: TRUE if succeed, FALSE if error */
874
DWORD attributes = 0;
880
if (create_mode == OS_FILE_OPEN) {
881
create_flag = OPEN_EXISTING;
882
} else if (create_mode == OS_FILE_CREATE) {
883
create_flag = CREATE_NEW;
884
} else if (create_mode == OS_FILE_CREATE_PATH) {
885
/* create subdirs along the path if needed */
886
*success = os_file_create_subdirs_if_needed(name);
890
create_flag = CREATE_NEW;
891
create_mode = OS_FILE_CREATE;
897
if (access_type == OS_FILE_READ_ONLY) {
898
access = GENERIC_READ;
899
} else if (access_type == OS_FILE_READ_WRITE) {
900
access = GENERIC_READ | GENERIC_WRITE;
906
file = CreateFile((LPCTSTR) name,
908
FILE_SHARE_READ | FILE_SHARE_WRITE,
909
/* file can be read and written also
910
by other processes */
911
NULL, /* default security attributes */
914
NULL); /* no template file */
916
if (file == INVALID_HANDLE_VALUE) {
919
retry = os_file_handle_error(name,
920
create_mode == OS_FILE_OPEN ?
938
if (create_mode == OS_FILE_OPEN) {
939
if (access_type == OS_FILE_READ_ONLY) {
940
create_flag = O_RDONLY;
942
create_flag = O_RDWR;
944
} else if (create_mode == OS_FILE_CREATE) {
945
create_flag = O_RDWR | O_CREAT | O_EXCL;
946
} else if (create_mode == OS_FILE_CREATE_PATH) {
947
/* create subdirs along the path if needed */
948
*success = os_file_create_subdirs_if_needed(name);
952
create_flag = O_RDWR | O_CREAT | O_EXCL;
953
create_mode = OS_FILE_CREATE;
959
if (create_mode == OS_FILE_CREATE) {
960
file = open(name, create_flag, S_IRUSR | S_IWUSR
961
| S_IRGRP | S_IWGRP);
963
file = open(name, create_flag);
969
retry = os_file_handle_error(name,
970
create_mode == OS_FILE_OPEN ?
976
} else if (access_type == OS_FILE_READ_WRITE
977
&& os_file_lock(file, name)) {
990
/********************************************************************
991
A simple function to open or create a file. */
994
os_file_create_simple_no_error_handling(
995
/*====================================*/
996
/* out, own: handle to the file, not defined
997
if error, error number can be retrieved with
998
os_file_get_last_error */
999
const char* name, /* in: name of the file or path as a
1000
null-terminated string */
1001
ulint create_mode,/* in: OS_FILE_OPEN if an existing file
1002
is opened (if does not exist, error), or
1003
OS_FILE_CREATE if a new file is created
1004
(if exists, error) */
1005
ulint access_type,/* in: OS_FILE_READ_ONLY,
1006
OS_FILE_READ_WRITE, or
1007
OS_FILE_READ_ALLOW_DELETE; the last option is
1008
used by a backup program reading the file */
1009
ibool* success)/* out: TRUE if succeed, FALSE if error */
1015
DWORD attributes = 0;
1016
DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
1020
if (create_mode == OS_FILE_OPEN) {
1021
create_flag = OPEN_EXISTING;
1022
} else if (create_mode == OS_FILE_CREATE) {
1023
create_flag = CREATE_NEW;
1029
if (access_type == OS_FILE_READ_ONLY) {
1030
access = GENERIC_READ;
1031
} else if (access_type == OS_FILE_READ_WRITE) {
1032
access = GENERIC_READ | GENERIC_WRITE;
1033
} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1034
access = GENERIC_READ;
1035
share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
1036
| FILE_SHARE_WRITE; /* A backup program has to give
1037
mysqld the maximum freedom to
1038
do what it likes with the
1045
file = CreateFile((LPCTSTR) name,
1048
NULL, /* default security attributes */
1051
NULL); /* no template file */
1053
if (file == INVALID_HANDLE_VALUE) {
1066
if (create_mode == OS_FILE_OPEN) {
1067
if (access_type == OS_FILE_READ_ONLY) {
1068
create_flag = O_RDONLY;
1070
create_flag = O_RDWR;
1072
} else if (create_mode == OS_FILE_CREATE) {
1073
create_flag = O_RDWR | O_CREAT | O_EXCL;
1079
if (create_mode == OS_FILE_CREATE) {
1080
file = open(name, create_flag, S_IRUSR | S_IWUSR
1081
| S_IRGRP | S_IWGRP);
1083
file = open(name, create_flag);
1088
#ifdef USE_FILE_LOCK
1089
} else if (access_type == OS_FILE_READ_WRITE
1090
&& os_file_lock(file, name)) {
1100
#endif /* __WIN__ */
1103
/********************************************************************
1104
Tries to disable OS caching on an opened file descriptor. */
1107
os_file_set_nocache(
1108
/*================*/
1109
int fd, /* in: file descriptor to alter */
1110
const char* file_name, /* in: file name, used in the
1111
diagnostic message */
1112
const char* operation_name) /* in: "open" or "create"; used in the
1113
diagnostic message */
1115
/* some versions of Solaris may not have DIRECTIO_ON */
1116
#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1117
if (directio(fd, DIRECTIO_ON) == -1) {
1119
errno_save = (int)errno;
1120
ut_print_timestamp(stderr);
1122
" InnoDB: Failed to set DIRECTIO_ON "
1123
"on file %s: %s: %s, continuing anyway\n",
1124
file_name, operation_name, strerror(errno_save));
1126
#elif defined(O_DIRECT)
1127
if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1129
errno_save = (int)errno;
1130
ut_print_timestamp(stderr);
1132
" InnoDB: Failed to set O_DIRECT "
1133
"on file %s: %s: %s, continuing anyway\n",
1134
file_name, operation_name, strerror(errno_save));
1135
if (errno_save == EINVAL) {
1136
ut_print_timestamp(stderr);
1138
" InnoDB: O_DIRECT is known to result in "
1139
"'Invalid argument' on Linux on tmpfs, "
1140
"see MySQL Bug#26662\n");
1143
#else /* Required for OSX */
1146
(void)operation_name;
1150
/********************************************************************
1151
Opens an existing file or creates a new one. */
1156
/* out, own: handle to the file, not defined
1157
if error, error number can be retrieved with
1158
os_file_get_last_error */
1159
const char* name, /* in: name of the file or path as a
1160
null-terminated string */
1161
ulint create_mode,/* in: OS_FILE_OPEN if an existing file
1162
is opened (if does not exist, error), or
1163
OS_FILE_CREATE if a new file is created
1165
OS_FILE_OVERWRITE if a new file is created
1166
or an old overwritten;
1167
OS_FILE_OPEN_RAW, if a raw device or disk
1168
partition should be opened */
1169
ulint purpose,/* in: OS_FILE_AIO, if asynchronous,
1170
non-buffered i/o is desired,
1171
OS_FILE_NORMAL, if any normal file;
1172
NOTE that it also depends on type, os_aio_..
1173
and srv_.. variables whether we really use
1174
async i/o or unbuffered i/o: look in the
1175
function source code for the exact rules */
1176
ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */
1177
ibool* success)/* out: TRUE if succeed, FALSE if error */
1181
DWORD share_mode = FILE_SHARE_READ;
1188
if (create_mode == OS_FILE_OPEN_RAW) {
1189
create_flag = OPEN_EXISTING;
1190
share_mode = FILE_SHARE_WRITE;
1191
} else if (create_mode == OS_FILE_OPEN
1192
|| create_mode == OS_FILE_OPEN_RETRY) {
1193
create_flag = OPEN_EXISTING;
1194
} else if (create_mode == OS_FILE_CREATE) {
1195
create_flag = CREATE_NEW;
1196
} else if (create_mode == OS_FILE_OVERWRITE) {
1197
create_flag = CREATE_ALWAYS;
1203
if (purpose == OS_FILE_AIO) {
1204
/* If specified, use asynchronous (overlapped) io and no
1205
buffering of writes in the OS */
1208
if (os_aio_use_native_aio) {
1209
attributes = attributes | FILE_FLAG_OVERLAPPED;
1212
#ifdef UNIV_NON_BUFFERED_IO
1213
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1214
/* Do not use unbuffered i/o to log files because
1215
value 2 denotes that we do not flush the log at every
1216
commit, but only once per second */
1217
} else if (srv_win_file_flush_method
1218
== SRV_WIN_IO_UNBUFFERED) {
1219
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1222
} else if (purpose == OS_FILE_NORMAL) {
1224
#ifdef UNIV_NON_BUFFERED_IO
1225
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1226
/* Do not use unbuffered i/o to log files because
1227
value 2 denotes that we do not flush the log at every
1228
commit, but only once per second */
1229
} else if (srv_win_file_flush_method
1230
== SRV_WIN_IO_UNBUFFERED) {
1231
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1239
file = CreateFile((LPCTSTR) name,
1240
GENERIC_READ | GENERIC_WRITE, /* read and write
1242
share_mode, /* File can be read also by other
1243
processes; we must give the read
1244
permission because of ibbackup. We do
1245
not give the write permission to
1246
others because if one would succeed to
1247
start 2 instances of mysqld on the
1248
SAME files, that could cause severe
1249
database corruption! When opening
1250
raw disk partitions, Microsoft manuals
1251
say that we must give also the write
1253
NULL, /* default security attributes */
1256
NULL); /* no template file */
1258
if (file == INVALID_HANDLE_VALUE) {
1261
retry = os_file_handle_error(name,
1262
create_mode == OS_FILE_CREATE ?
1276
const char* mode_str = NULL;
1277
const char* type_str = NULL;
1278
const char* purpose_str = NULL;
1283
if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
1284
|| create_mode == OS_FILE_OPEN_RETRY) {
1286
create_flag = O_RDWR;
1287
} else if (create_mode == OS_FILE_CREATE) {
1288
mode_str = "CREATE";
1289
create_flag = O_RDWR | O_CREAT | O_EXCL;
1290
} else if (create_mode == OS_FILE_OVERWRITE) {
1291
mode_str = "OVERWRITE";
1292
create_flag = O_RDWR | O_CREAT | O_TRUNC;
1298
if (type == OS_LOG_FILE) {
1300
} else if (type == OS_DATA_FILE) {
1306
if (purpose == OS_FILE_AIO) {
1307
purpose_str = "AIO";
1308
} else if (purpose == OS_FILE_NORMAL) {
1309
purpose_str = "NORMAL";
1315
fprintf(stderr, "Opening file %s, mode %s, type %s, purpose %s\n",
1316
name, mode_str, type_str, purpose_str);
1319
/* We let O_SYNC only affect log files; note that we map O_DSYNC to
1320
O_SYNC because the datasync options seemed to corrupt files in 2001
1321
in both Linux and Solaris */
1322
if (type == OS_LOG_FILE
1323
&& srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1326
fprintf(stderr, "Using O_SYNC for file %s\n", name);
1329
create_flag = create_flag | O_SYNC;
1333
file = open(name, create_flag, os_innodb_umask);
1338
retry = os_file_handle_error(name,
1339
create_mode == OS_FILE_CREATE ?
1344
return(file /* -1 */);
1351
/* We disable OS caching (O_DIRECT) only on data files */
1352
if (type != OS_LOG_FILE
1353
&& srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
1355
os_file_set_nocache(file, name, mode_str);
1358
#ifdef USE_FILE_LOCK
1359
if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
1361
if (create_mode == OS_FILE_OPEN_RETRY) {
1363
ut_print_timestamp(stderr);
1364
fputs(" InnoDB: Retrying to lock"
1365
" the first data file\n",
1367
for (i = 0; i < 100; i++) {
1368
os_thread_sleep(1000000);
1369
if (!os_file_lock(file, name)) {
1374
ut_print_timestamp(stderr);
1375
fputs(" InnoDB: Unable to open the first data file\n",
1383
#endif /* USE_FILE_LOCK */
1386
#endif /* __WIN__ */
1389
/***************************************************************************
1390
Deletes a file if it exists. The file has to be closed before calling this. */
1393
os_file_delete_if_exists(
1394
/*=====================*/
1395
/* out: TRUE if success */
1396
const char* name) /* in: file path as a null-terminated string */
1402
/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1405
ret = DeleteFile((LPCTSTR)name);
1411
if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1412
/* the file does not exist, this is not an error */
1419
if (count > 100 && 0 == (count % 10)) {
1421
"InnoDB: Warning: cannot delete file %s\n"
1422
"InnoDB: Are you running ibbackup"
1423
" to back up the file?\n", name);
1425
os_file_get_last_error(TRUE); /* print error information */
1428
os_thread_sleep(1000000); /* sleep for a second */
1441
if (ret != 0 && errno != ENOENT) {
1442
os_file_handle_error_no_exit(name, "delete");
1451
/***************************************************************************
1452
Deletes a file. The file has to be closed before calling this. */
1457
/* out: TRUE if success */
1458
const char* name) /* in: file path as a null-terminated string */
1464
/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1467
ret = DeleteFile((LPCTSTR)name);
1473
if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1474
/* If the file does not exist, we classify this as a 'mild'
1482
if (count > 100 && 0 == (count % 10)) {
1484
"InnoDB: Warning: cannot delete file %s\n"
1485
"InnoDB: Are you running ibbackup"
1486
" to back up the file?\n", name);
1488
os_file_get_last_error(TRUE); /* print error information */
1491
os_thread_sleep(1000000); /* sleep for a second */
1505
os_file_handle_error_no_exit(name, "delete");
1514
/***************************************************************************
1515
Renames a file (can also move it to another directory). It is safest that the
1516
file is closed before calling this function. */
1521
/* out: TRUE if success */
1522
const char* oldpath,/* in: old file path as a null-terminated
1524
const char* newpath)/* in: new file path */
1529
ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
1535
os_file_handle_error_no_exit(oldpath, "rename");
1541
ret = rename(oldpath, newpath);
1544
os_file_handle_error_no_exit(oldpath, "rename");
1553
/***************************************************************************
1554
Closes a file handle. In case of error, error number can be retrieved with
1555
os_file_get_last_error. */
1560
/* out: TRUE if success */
1561
os_file_t file) /* in, own: handle to a file */
1568
ret = CloseHandle(file);
1574
os_file_handle_error(NULL, "close");
1583
os_file_handle_error(NULL, "close");
1592
/***************************************************************************
1593
Closes a file handle. */
1596
os_file_close_no_error_handling(
1597
/*============================*/
1598
/* out: TRUE if success */
1599
os_file_t file) /* in, own: handle to a file */
1606
ret = CloseHandle(file);
1627
/***************************************************************************
1628
Gets a file size. */
1633
/* out: TRUE if success */
1634
os_file_t file, /* in: handle to a file */
1635
ulint* size, /* out: least significant 32 bits of file
1637
ulint* size_high)/* out: most significant 32 bits of size */
1643
low = GetFileSize(file, &high);
1645
if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
1656
offs = lseek(file, 0, SEEK_END);
1658
if (offs == ((off_t)-1)) {
1663
if (sizeof(off_t) > 4) {
1664
*size = (ulint)(offs & 0xFFFFFFFFUL);
1665
*size_high = (ulint)(offs >> 32);
1667
*size = (ulint) offs;
1675
/***************************************************************************
1676
Gets file size as a 64-bit integer ib_int64_t. */
1679
os_file_get_size_as_iblonglong(
1680
/*===========================*/
1681
/* out: size in bytes, -1 if error */
1682
os_file_t file) /* in: handle to a file */
1688
success = os_file_get_size(file, &size, &size_high);
1695
return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size);
1698
/***************************************************************************
1699
Write the specified number of zeros to a newly created file. */
1704
/* out: TRUE if success */
1705
const char* name, /* in: name of the file or path as a
1706
null-terminated string */
1707
os_file_t file, /* in: handle to a file */
1708
ulint size, /* in: least significant 32 bits of file
1710
ulint size_high)/* in: most significant 32 bits of size */
1712
ib_int64_t current_size;
1713
ib_int64_t desired_size;
1719
ut_a(size == (size & 0xFFFFFFFF));
1722
desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
1724
/* Write up to 1 megabyte at a time. */
1725
buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
1727
buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
1729
/* Align the buffer for possible raw i/o */
1730
buf = ut_align(buf2, UNIV_PAGE_SIZE);
1732
/* Write buffer full of zeros */
1733
memset(buf, 0, buf_size);
1735
if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1737
fprintf(stderr, "InnoDB: Progress in MB:");
1740
while (current_size < desired_size) {
1743
if (desired_size - current_size < (ib_int64_t) buf_size) {
1744
n_bytes = (ulint) (desired_size - current_size);
1749
ret = os_file_write(name, file, buf,
1750
(ulint)(current_size & 0xFFFFFFFF),
1751
(ulint)(current_size >> 32),
1755
goto error_handling;
1758
/* Print about progress for each 100 MB written */
1759
if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
1760
!= current_size / (ib_int64_t)(100 * 1024 * 1024)) {
1762
fprintf(stderr, " %lu00",
1763
(ulong) ((current_size + n_bytes)
1764
/ (ib_int64_t)(100 * 1024 * 1024)));
1767
current_size += n_bytes;
1770
if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1772
fprintf(stderr, "\n");
1777
ret = os_file_flush(file);
1787
/***************************************************************************
1788
Truncates a file at its current position. */
1793
/* out: TRUE if success */
1794
FILE* file) /* in: file to be truncated */
1797
HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
1798
return(SetEndOfFile(h));
1800
return(!ftruncate(fileno(file), ftell(file)));
1801
#endif /* __WIN__ */
1805
/***************************************************************************
1806
Wrapper to fsync(2) that retries the call on some errors.
1807
Returns the value 0 if successful; otherwise the value -1 is returned and
1808
the global variable errno is set to indicate the error. */
1814
/* out: 0 if success, -1 otherwise */
1815
os_file_t file) /* in: handle to a file */
1828
if (ret == -1 && errno == ENOLCK) {
1830
if (failures % 100 == 0) {
1832
ut_print_timestamp(stderr);
1834
" InnoDB: fsync(): "
1835
"No locks available; retrying\n");
1838
os_thread_sleep(200000 /* 0.2 sec */);
1851
#endif /* !__WIN__ */
1853
/***************************************************************************
1854
Flushes the write buffers of a given file to the disk. */
1859
/* out: TRUE if success */
1860
os_file_t file) /* in, own: handle to a file */
1869
ret = FlushFileBuffers(file);
1875
/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
1876
actually a raw device, we choose to ignore that error if we are using
1879
if (srv_start_raw_disk_in_use && GetLastError()
1880
== ERROR_INVALID_FUNCTION) {
1884
os_file_handle_error(NULL, "flush");
1886
/* It is a fatal error if a file flush does not succeed, because then
1887
the database can get corrupt on disk */
1894
#if defined(HAVE_DARWIN_THREADS)
1895
# ifndef F_FULLFSYNC
1896
/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
1897
# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
1898
# elif F_FULLFSYNC != 51
1899
# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
1901
/* Apple has disabled fsync() for internal disk drives in OS X. That
1902
caused corruption for a user when he tested a power outage. Let us in
1903
OS X use a nonstandard flush method recommended by an Apple
1906
if (!srv_have_fullfsync) {
1907
/* If we are not on an operating system that supports this,
1908
then fall back to a plain fsync. */
1910
ret = os_file_fsync(file);
1912
ret = fcntl(file, F_FULLFSYNC, NULL);
1915
/* If we are not on a file system that supports this,
1916
then fall back to a plain fsync. */
1917
ret = os_file_fsync(file);
1921
ret = os_file_fsync(file);
1928
/* Since Linux returns EINVAL if the 'file' is actually a raw device,
1929
we choose to ignore that error if we are using raw disks */
1931
if (srv_start_raw_disk_in_use && errno == EINVAL) {
1936
ut_print_timestamp(stderr);
1939
" InnoDB: Error: the OS said file flush did not succeed\n");
1941
os_file_handle_error(NULL, "flush");
1943
/* It is a fatal error if a file flush does not succeed, because then
1944
the database can get corrupt on disk */
1952
/***********************************************************************
1953
Does a synchronous read operation in Posix. */
1958
/* out: number of bytes read, -1 if error */
1959
os_file_t file, /* in: handle to a file */
1960
void* buf, /* in: buffer where to read */
1961
ulint n, /* in: number of bytes to read */
1962
ulint offset, /* in: least significant 32 bits of file
1963
offset from where to read */
1964
ulint offset_high) /* in: most significant 32 bits of
1970
ut_a((offset & 0xFFFFFFFFUL) == offset);
1972
/* If off_t is > 4 bytes in size, then we assume we can pass a
1975
if (sizeof(off_t) > 4) {
1976
offs = (off_t)offset + (((off_t)offset_high) << 32);
1979
offs = (off_t)offset;
1981
if (offset_high > 0) {
1983
"InnoDB: Error: file read at offset > 4 GB\n");
1989
#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
1990
os_mutex_enter(os_file_count_mutex);
1991
os_file_n_pending_preads++;
1992
os_n_pending_reads++;
1993
os_mutex_exit(os_file_count_mutex);
1995
n_bytes = pread(file, buf, (ssize_t)n, offs);
1997
os_mutex_enter(os_file_count_mutex);
1998
os_file_n_pending_preads--;
1999
os_n_pending_reads--;
2000
os_mutex_exit(os_file_count_mutex);
2009
os_mutex_enter(os_file_count_mutex);
2010
os_n_pending_reads++;
2011
os_mutex_exit(os_file_count_mutex);
2013
/* Protect the seek / read operation with a mutex */
2014
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2016
os_mutex_enter(os_file_seek_mutexes[i]);
2018
ret_offset = lseek(file, offs, SEEK_SET);
2020
if (ret_offset < 0) {
2023
ret = read(file, buf, (ssize_t)n);
2026
os_mutex_exit(os_file_seek_mutexes[i]);
2028
os_mutex_enter(os_file_count_mutex);
2029
os_n_pending_reads--;
2030
os_mutex_exit(os_file_count_mutex);
2037
/***********************************************************************
2038
Does a synchronous write operation in Posix. */
2043
/* out: number of bytes written, -1 if error */
2044
os_file_t file, /* in: handle to a file */
2045
const void* buf, /* in: buffer from where to write */
2046
ulint n, /* in: number of bytes to write */
2047
ulint offset, /* in: least significant 32 bits of file
2048
offset where to write */
2049
ulint offset_high) /* in: most significant 32 bits of
2055
ut_a((offset & 0xFFFFFFFFUL) == offset);
2057
/* If off_t is > 4 bytes in size, then we assume we can pass a
2060
if (sizeof(off_t) > 4) {
2061
offs = (off_t)offset + (((off_t)offset_high) << 32);
2063
offs = (off_t)offset;
2065
if (offset_high > 0) {
2067
"InnoDB: Error: file write"
2068
" at offset > 4 GB\n");
2074
#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2075
os_mutex_enter(os_file_count_mutex);
2076
os_file_n_pending_pwrites++;
2077
os_n_pending_writes++;
2078
os_mutex_exit(os_file_count_mutex);
2080
ret = pwrite(file, buf, (ssize_t)n, offs);
2082
os_mutex_enter(os_file_count_mutex);
2083
os_file_n_pending_pwrites--;
2084
os_n_pending_writes--;
2085
os_mutex_exit(os_file_count_mutex);
2087
# ifdef UNIV_DO_FLUSH
2088
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2089
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2090
&& !os_do_not_call_flush_at_each_write) {
2092
/* Always do fsync to reduce the probability that when
2093
the OS crashes, a database page is only partially
2094
physically written to disk. */
2096
ut_a(TRUE == os_file_flush(file));
2098
# endif /* UNIV_DO_FLUSH */
2106
os_mutex_enter(os_file_count_mutex);
2107
os_n_pending_writes++;
2108
os_mutex_exit(os_file_count_mutex);
2110
/* Protect the seek / write operation with a mutex */
2111
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2113
os_mutex_enter(os_file_seek_mutexes[i]);
2115
ret_offset = lseek(file, offs, SEEK_SET);
2117
if (ret_offset < 0) {
2123
ret = write(file, buf, (ssize_t)n);
2125
# ifdef UNIV_DO_FLUSH
2126
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2127
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2128
&& !os_do_not_call_flush_at_each_write) {
2130
/* Always do fsync to reduce the probability that when
2131
the OS crashes, a database page is only partially
2132
physically written to disk. */
2134
ut_a(TRUE == os_file_flush(file));
2136
# endif /* UNIV_DO_FLUSH */
2139
os_mutex_exit(os_file_seek_mutexes[i]);
2141
os_mutex_enter(os_file_count_mutex);
2142
os_n_pending_writes--;
2143
os_mutex_exit(os_file_count_mutex);
2151
/***********************************************************************
2152
Requests a synchronous positioned read operation. */
2157
/* out: TRUE if request was
2158
successful, FALSE if fail */
2159
os_file_t file, /* in: handle to a file */
2160
void* buf, /* in: buffer where to read */
2161
ulint offset, /* in: least significant 32 bits of file
2162
offset where to read */
2163
ulint offset_high, /* in: most significant 32 bits of
2165
ulint n) /* in: number of bytes to read */
2176
ut_a((offset & 0xFFFFFFFFUL) == offset);
2179
os_bytes_read_since_printout += n;
2186
low = (DWORD) offset;
2187
high = (DWORD) offset_high;
2189
os_mutex_enter(os_file_count_mutex);
2190
os_n_pending_reads++;
2191
os_mutex_exit(os_file_count_mutex);
2193
/* Protect the seek / read operation with a mutex */
2194
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2196
os_mutex_enter(os_file_seek_mutexes[i]);
2198
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2200
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2202
os_mutex_exit(os_file_seek_mutexes[i]);
2204
os_mutex_enter(os_file_count_mutex);
2205
os_n_pending_reads--;
2206
os_mutex_exit(os_file_count_mutex);
2208
goto error_handling;
2211
ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2213
os_mutex_exit(os_file_seek_mutexes[i]);
2215
os_mutex_enter(os_file_count_mutex);
2216
os_n_pending_reads--;
2217
os_mutex_exit(os_file_count_mutex);
2219
if (ret && len == n) {
2226
os_bytes_read_since_printout += n;
2229
ret = os_file_pread(file, buf, n, offset, offset_high);
2231
if ((ulint)ret == n) {
2237
"InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
2238
"InnoDB: Was only able to read %ld.\n",
2239
(ulong)n, (ulong)offset_high,
2240
(ulong)offset, (long)ret);
2245
retry = os_file_handle_error(NULL, "read");
2252
"InnoDB: Fatal error: cannot read from file."
2253
" OS error number %lu.\n",
2255
(ulong) GetLastError()
2267
/***********************************************************************
2268
Requests a synchronous positioned read operation. This function does not do
2269
any error handling. In case of error it returns FALSE. */
2272
os_file_read_no_error_handling(
2273
/*===========================*/
2274
/* out: TRUE if request was
2275
successful, FALSE if fail */
2276
os_file_t file, /* in: handle to a file */
2277
void* buf, /* in: buffer where to read */
2278
ulint offset, /* in: least significant 32 bits of file
2279
offset where to read */
2280
ulint offset_high, /* in: most significant 32 bits of
2282
ulint n) /* in: number of bytes to read */
2293
ut_a((offset & 0xFFFFFFFFUL) == offset);
2296
os_bytes_read_since_printout += n;
2303
low = (DWORD) offset;
2304
high = (DWORD) offset_high;
2306
os_mutex_enter(os_file_count_mutex);
2307
os_n_pending_reads++;
2308
os_mutex_exit(os_file_count_mutex);
2310
/* Protect the seek / read operation with a mutex */
2311
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2313
os_mutex_enter(os_file_seek_mutexes[i]);
2315
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2317
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2319
os_mutex_exit(os_file_seek_mutexes[i]);
2321
os_mutex_enter(os_file_count_mutex);
2322
os_n_pending_reads--;
2323
os_mutex_exit(os_file_count_mutex);
2325
goto error_handling;
2328
ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2330
os_mutex_exit(os_file_seek_mutexes[i]);
2332
os_mutex_enter(os_file_count_mutex);
2333
os_n_pending_reads--;
2334
os_mutex_exit(os_file_count_mutex);
2336
if (ret && len == n) {
2343
os_bytes_read_since_printout += n;
2346
ret = os_file_pread(file, buf, n, offset, offset_high);
2348
if ((ulint)ret == n) {
2356
retry = os_file_handle_error_no_exit(NULL, "read");
2365
/***********************************************************************
2366
Rewind file to its start, read at most size - 1 bytes from it to str, and
2367
NUL-terminate str. All errors are silently ignored. This function is
2368
mostly meant to be used with temporary files. */
2371
os_file_read_string(
2372
/*================*/
2373
FILE* file, /* in: file to read from */
2374
char* str, /* in: buffer where to read */
2375
ulint size) /* in: size of buffer */
2384
flen = fread(str, 1, size - 1, file);
2388
/***********************************************************************
2389
Requests a synchronous write operation. */
2394
/* out: TRUE if request was
2395
successful, FALSE if fail */
2396
const char* name, /* in: name of the file or path as a
2397
null-terminated string */
2398
os_file_t file, /* in: handle to a file */
2399
const void* buf, /* in: buffer from which to write */
2400
ulint offset, /* in: least significant 32 bits of file
2401
offset where to write */
2402
ulint offset_high, /* in: most significant 32 bits of
2404
ulint n) /* in: number of bytes to write */
2413
ulint n_retries = 0;
2416
ut_a((offset & 0xFFFFFFFF) == offset);
2424
low = (DWORD) offset;
2425
high = (DWORD) offset_high;
2427
os_mutex_enter(os_file_count_mutex);
2428
os_n_pending_writes++;
2429
os_mutex_exit(os_file_count_mutex);
2431
/* Protect the seek / write operation with a mutex */
2432
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2434
os_mutex_enter(os_file_seek_mutexes[i]);
2436
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2438
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2440
os_mutex_exit(os_file_seek_mutexes[i]);
2442
os_mutex_enter(os_file_count_mutex);
2443
os_n_pending_writes--;
2444
os_mutex_exit(os_file_count_mutex);
2446
ut_print_timestamp(stderr);
2449
" InnoDB: Error: File pointer positioning to"
2450
" file %s failed at\n"
2451
"InnoDB: offset %lu %lu. Operating system"
2452
" error number %lu.\n"
2453
"InnoDB: Some operating system error numbers"
2454
" are described at\n"
2456
"http://dev.mysql.com/doc/refman/5.1/en/"
2457
"operating-system-error-codes.html\n",
2458
name, (ulong) offset_high, (ulong) offset,
2459
(ulong) GetLastError());
2464
ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2466
/* Always do fsync to reduce the probability that when the OS crashes,
2467
a database page is only partially physically written to disk. */
2469
# ifdef UNIV_DO_FLUSH
2470
if (!os_do_not_call_flush_at_each_write) {
2471
ut_a(TRUE == os_file_flush(file));
2473
# endif /* UNIV_DO_FLUSH */
2475
os_mutex_exit(os_file_seek_mutexes[i]);
2477
os_mutex_enter(os_file_count_mutex);
2478
os_n_pending_writes--;
2479
os_mutex_exit(os_file_count_mutex);
2481
if (ret && len == n) {
2486
/* If some background file system backup tool is running, then, at
2487
least in Windows 2000, we may get here a specific error. Let us
2488
retry the operation 100 times, with 1 second waits. */
2490
if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2492
os_thread_sleep(1000000);
2499
if (!os_has_said_disk_full) {
2501
err = (ulint)GetLastError();
2503
ut_print_timestamp(stderr);
2506
" InnoDB: Error: Write to file %s failed"
2507
" at offset %lu %lu.\n"
2508
"InnoDB: %lu bytes should have been written,"
2509
" only %lu were written.\n"
2510
"InnoDB: Operating system error number %lu.\n"
2511
"InnoDB: Check that your OS and file system"
2512
" support files of this size.\n"
2513
"InnoDB: Check also that the disk is not full"
2514
" or a disk quota exceeded.\n",
2515
name, (ulong) offset_high, (ulong) offset,
2516
(ulong) n, (ulong) len, (ulong) err);
2518
if (strerror((int)err) != NULL) {
2520
"InnoDB: Error number %lu means '%s'.\n",
2521
(ulong) err, strerror((int)err));
2525
"InnoDB: Some operating system error numbers"
2526
" are described at\n"
2528
"http://dev.mysql.com/doc/refman/5.1/en/"
2529
"operating-system-error-codes.html\n");
2531
os_has_said_disk_full = TRUE;
2538
ret = os_file_pwrite(file, buf, n, offset, offset_high);
2540
if ((ulint)ret == n) {
2545
if (!os_has_said_disk_full) {
2547
ut_print_timestamp(stderr);
2550
" InnoDB: Error: Write to file %s failed"
2551
" at offset %lu %lu.\n"
2552
"InnoDB: %lu bytes should have been written,"
2553
" only %ld were written.\n"
2554
"InnoDB: Operating system error number %lu.\n"
2555
"InnoDB: Check that your OS and file system"
2556
" support files of this size.\n"
2557
"InnoDB: Check also that the disk is not full"
2558
" or a disk quota exceeded.\n",
2559
name, offset_high, offset, n, (long int)ret,
2561
if (strerror(errno) != NULL) {
2563
"InnoDB: Error number %lu means '%s'.\n",
2564
(ulint)errno, strerror(errno));
2568
"InnoDB: Some operating system error numbers"
2569
" are described at\n"
2571
"http://dev.mysql.com/doc/refman/5.1/en/"
2572
"operating-system-error-codes.html\n");
2574
os_has_said_disk_full = TRUE;
2581
/***********************************************************************
2582
Check the existence and type of the given file. */
2587
/* out: TRUE if call succeeded */
2588
const char* path, /* in: pathname of the file */
2589
ibool* exists, /* out: TRUE if file exists */
2590
os_file_type_t* type) /* out: type of the file (if it exists) */
2594
struct _stat statinfo;
2596
ret = _stat(path, &statinfo);
2597
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2598
/* file does not exist */
2602
/* file exists, but stat call failed */
2604
os_file_handle_error_no_exit(path, "stat");
2609
if (_S_IFDIR & statinfo.st_mode) {
2610
*type = OS_FILE_TYPE_DIR;
2611
} else if (_S_IFREG & statinfo.st_mode) {
2612
*type = OS_FILE_TYPE_FILE;
2614
*type = OS_FILE_TYPE_UNKNOWN;
2622
struct stat statinfo;
2624
ret = stat(path, &statinfo);
2625
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2626
/* file does not exist */
2630
/* file exists, but stat call failed */
2632
os_file_handle_error_no_exit(path, "stat");
2637
if (S_ISDIR(statinfo.st_mode)) {
2638
*type = OS_FILE_TYPE_DIR;
2639
} else if (S_ISLNK(statinfo.st_mode)) {
2640
*type = OS_FILE_TYPE_LINK;
2641
} else if (S_ISREG(statinfo.st_mode)) {
2642
*type = OS_FILE_TYPE_FILE;
2644
*type = OS_FILE_TYPE_UNKNOWN;
2653
/***********************************************************************
2654
This function returns information about the specified file */
2659
/* out: TRUE if stat
2660
information found */
2661
const char* path, /* in: pathname of the file */
2662
os_file_stat_t* stat_info) /* information of a file in a
2667
struct _stat statinfo;
2669
ret = _stat(path, &statinfo);
2670
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2671
/* file does not exist */
2675
/* file exists, but stat call failed */
2677
os_file_handle_error_no_exit(path, "stat");
2681
if (_S_IFDIR & statinfo.st_mode) {
2682
stat_info->type = OS_FILE_TYPE_DIR;
2683
} else if (_S_IFREG & statinfo.st_mode) {
2684
stat_info->type = OS_FILE_TYPE_FILE;
2686
stat_info->type = OS_FILE_TYPE_UNKNOWN;
2689
stat_info->ctime = statinfo.st_ctime;
2690
stat_info->atime = statinfo.st_atime;
2691
stat_info->mtime = statinfo.st_mtime;
2692
stat_info->size = statinfo.st_size;
2697
struct stat statinfo;
2699
ret = stat(path, &statinfo);
2701
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2702
/* file does not exist */
2706
/* file exists, but stat call failed */
2708
os_file_handle_error_no_exit(path, "stat");
2713
if (S_ISDIR(statinfo.st_mode)) {
2714
stat_info->type = OS_FILE_TYPE_DIR;
2715
} else if (S_ISLNK(statinfo.st_mode)) {
2716
stat_info->type = OS_FILE_TYPE_LINK;
2717
} else if (S_ISREG(statinfo.st_mode)) {
2718
stat_info->type = OS_FILE_TYPE_FILE;
2720
stat_info->type = OS_FILE_TYPE_UNKNOWN;
2723
stat_info->ctime = statinfo.st_ctime;
2724
stat_info->atime = statinfo.st_atime;
2725
stat_info->mtime = statinfo.st_mtime;
2726
stat_info->size = statinfo.st_size;
2732
/* path name separator character */
2734
# define OS_FILE_PATH_SEPARATOR '\\'
2736
# define OS_FILE_PATH_SEPARATOR '/'
2739
/********************************************************************
2740
The function os_file_dirname returns a directory component of a
2741
null-terminated pathname string. In the usual case, dirname returns
2742
the string up to, but not including, the final '/', and basename
2743
is the component following the final '/'. Trailing '/' charac-
2744
ters are not counted as part of the pathname.
2746
If path does not contain a slash, dirname returns the string ".".
2748
Concatenating the string returned by dirname, a "/", and the basename
2749
yields a complete pathname.
2751
The return value is a copy of the directory component of the pathname.
2752
The copy is allocated from heap. It is the caller's responsibility
2753
to free it after it is no longer needed.
2755
The following list of examples (taken from SUSv2) shows the strings
2756
returned by dirname and basename for different paths:
2758
path dirname basename
2759
"/usr/lib" "/usr" "lib"
2770
/* out, own: directory component of the
2772
const char* path) /* in: pathname */
2774
/* Find the offset of the last slash */
2775
const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
2777
/* No slash in the path, return "." */
2779
return(mem_strdup("."));
2782
/* Ok, there is a slash */
2784
if (last_slash == path) {
2785
/* last slash is the first char of the path */
2787
return(mem_strdup("/"));
2790
/* Non-trivial directory component */
2792
return(mem_strdupl(path, last_slash - path));
2795
/********************************************************************
2796
Creates all missing subdirectories along the given path. */
2799
os_file_create_subdirs_if_needed(
2800
/*=============================*/
2801
/* out: TRUE if call succeeded
2803
const char* path) /* in: path name */
2806
ibool success, subdir_exists;
2807
os_file_type_t type;
2809
subdir = os_file_dirname(path);
2810
if (strlen(subdir) == 1
2811
&& (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
2812
/* subdir is root or cwd, nothing to do */
2818
/* Test if subdir exists */
2819
success = os_file_status(subdir, &subdir_exists, &type);
2820
if (success && !subdir_exists) {
2821
/* subdir does not exist, create it */
2822
success = os_file_create_subdirs_if_needed(subdir);
2828
success = os_file_create_directory(subdir, FALSE);
2836
/********************************************************************
2837
Returns a pointer to the nth slot in the aio array. */
2840
os_aio_array_get_nth_slot(
2841
/*======================*/
2842
/* out: pointer to slot */
2843
os_aio_array_t* array, /* in: aio array */
2844
ulint index) /* in: index of the slot */
2846
ut_a(index < array->n_slots);
2848
return((array->slots) + index);
2851
/****************************************************************************
2852
Creates an aio wait array. */
2855
os_aio_array_create(
2856
/*================*/
2857
/* out, own: aio array */
2858
ulint n, /* in: maximum number of pending aio operations
2859
allowed; n must be divisible by n_segments */
2860
ulint n_segments) /* in: number of segments in the aio array */
2862
os_aio_array_t* array;
2864
os_aio_slot_t* slot;
2869
ut_a(n_segments > 0);
2871
array = ut_malloc(sizeof(os_aio_array_t));
2873
array->mutex = os_mutex_create(NULL);
2874
array->not_full = os_event_create(NULL);
2875
array->is_empty = os_event_create(NULL);
2877
os_event_set(array->is_empty);
2880
array->n_segments = n_segments;
2881
array->n_reserved = 0;
2882
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
2884
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
2886
for (i = 0; i < n; i++) {
2887
slot = os_aio_array_get_nth_slot(array, i);
2890
slot->reserved = FALSE;
2892
slot->event = os_event_create(NULL);
2894
over = &(slot->control);
2896
over->hEvent = slot->event->handle;
2898
*((array->native_events) + i) = over->hEvent;
2905
/****************************************************************************
2906
Initializes the asynchronous io system. Calls also os_io_init_simple.
2907
Creates a separate aio array for
2908
non-ibuf read and write, a third aio array for the ibuf i/o, with just one
2909
segment, one aio array for log reads and writes with one segment, and a
2910
synchronous aio array of the specified size. The combined number of segments
2911
in the four first aio arrays is the parameter n_segments given to the
2912
function. The caller must create an i/o handler thread for each segment in
2913
the four first arrays, but not for the sync aio array. */
2918
ulint n, /* in: maximum number of pending aio operations
2919
allowed; n must be divisible by n_segments */
2920
ulint n_segments, /* in: combined number of segments in the four
2921
first aio arrays; must be >= 4 */
2922
ulint n_slots_sync) /* in: number of slots in the sync aio array */
2928
#ifdef POSIX_ASYNC_IO
2931
ut_ad(n % n_segments == 0);
2932
ut_ad(n_segments >= 4);
2934
os_io_init_simple();
2936
for (i = 0; i < n_segments; i++) {
2937
srv_set_io_thread_op_info(i, "not started yet");
2940
n_per_seg = n / n_segments;
2941
n_write_segs = (n_segments - 2) / 2;
2942
n_read_segs = n_segments - 2 - n_write_segs;
2944
/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
2946
os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
2948
srv_io_thread_function[0] = "insert buffer thread";
2950
os_aio_log_array = os_aio_array_create(n_per_seg, 1);
2952
srv_io_thread_function[1] = "log thread";
2954
os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
2956
for (i = 2; i < 2 + n_read_segs; i++) {
2957
ut_a(i < SRV_MAX_N_IO_THREADS);
2958
srv_io_thread_function[i] = "read thread";
2961
os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
2963
for (i = 2 + n_read_segs; i < n_segments; i++) {
2964
ut_a(i < SRV_MAX_N_IO_THREADS);
2965
srv_io_thread_function[i] = "write thread";
2968
os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
2970
os_aio_n_segments = n_segments;
2974
os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
2976
for (i = 0; i < n_segments; i++) {
2977
os_aio_segment_wait_events[i] = os_event_create(NULL);
2980
os_last_printout = time(NULL);
2982
#ifdef POSIX_ASYNC_IO
2983
/* Block aio signals from the current thread and its children:
2984
for this to work, the current thread must be the first created
2985
in the database, so that all its children will inherit its
2988
/* TODO: to work MySQL needs the SIGALRM signal; the following
2989
will not work yet! */
2990
sigemptyset(&sigset);
2991
sigaddset(&sigset, SIGRTMIN + 1 + 0);
2992
sigaddset(&sigset, SIGRTMIN + 1 + 1);
2993
sigaddset(&sigset, SIGRTMIN + 1 + 2);
2994
sigaddset(&sigset, SIGRTMIN + 1 + 3);
2996
pthread_sigmask(SIG_BLOCK, &sigset, NULL); */
3001
/****************************************************************************
3002
Wakes up all async i/o threads in the array in Windows async i/o at
3006
os_aio_array_wake_win_aio_at_shutdown(
3007
/*==================================*/
3008
os_aio_array_t* array) /* in: aio array */
3012
for (i = 0; i < array->n_slots; i++) {
3014
os_event_set((array->slots + i)->event);
3019
/****************************************************************************
3020
Wakes up all async i/o threads so that they know to exit themselves in
3024
os_aio_wake_all_threads_at_shutdown(void)
3025
/*=====================================*/
3030
/* This code wakes up all ai/o threads in Windows native aio */
3031
os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
3032
os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
3033
os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
3034
os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
3036
/* This loop wakes up all simulated ai/o threads */
3038
for (i = 0; i < os_aio_n_segments; i++) {
3040
os_event_set(os_aio_segment_wait_events[i]);
3044
/****************************************************************************
3045
Waits until there are no pending writes in os_aio_write_array. There can
3046
be other, synchronous, pending writes. */
3049
os_aio_wait_until_no_pending_writes(void)
3050
/*=====================================*/
3052
os_event_wait(os_aio_write_array->is_empty);
3055
/**************************************************************************
3056
Calculates segment number for a slot. */
3059
os_aio_get_segment_no_from_slot(
3060
/*============================*/
3061
/* out: segment number (which is the number
3062
used by, for example, i/o-handler threads) */
3063
os_aio_array_t* array, /* in: aio wait array */
3064
os_aio_slot_t* slot) /* in: slot in this array */
3069
if (array == os_aio_ibuf_array) {
3072
} else if (array == os_aio_log_array) {
3075
} else if (array == os_aio_read_array) {
3076
seg_len = os_aio_read_array->n_slots
3077
/ os_aio_read_array->n_segments;
3079
segment = 2 + slot->pos / seg_len;
3081
ut_a(array == os_aio_write_array);
3082
seg_len = os_aio_write_array->n_slots
3083
/ os_aio_write_array->n_segments;
3085
segment = os_aio_read_array->n_segments + 2
3086
+ slot->pos / seg_len;
3092
/**************************************************************************
3093
Calculates local segment number and aio array from global segment number. */
3096
os_aio_get_array_and_local_segment(
3097
/*===============================*/
3098
/* out: local segment number within
3100
os_aio_array_t** array, /* out: aio wait array */
3101
ulint global_segment)/* in: global segment number */
3105
ut_a(global_segment < os_aio_n_segments);
3107
if (global_segment == 0) {
3108
*array = os_aio_ibuf_array;
3111
} else if (global_segment == 1) {
3112
*array = os_aio_log_array;
3115
} else if (global_segment < os_aio_read_array->n_segments + 2) {
3116
*array = os_aio_read_array;
3118
segment = global_segment - 2;
3120
*array = os_aio_write_array;
3122
segment = global_segment - (os_aio_read_array->n_segments + 2);
3128
/***********************************************************************
3129
Gets an integer value designating a specified aio array. This is used
3130
to give numbers to signals in Posix aio. */
3132
#if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO)
3135
os_aio_get_array_no(
3136
/*================*/
3137
os_aio_array_t* array) /* in: aio array */
3139
/* The array is compared against the ibuf, log, read and write arrays in
turn. NOTE(review): the number produced for each match is not visible in
this listing; presumably 0..3, mirroring os_aio_get_array_from_no -
confirm against the full source. */
if (array == os_aio_ibuf_array) {
3143
} else if (array == os_aio_log_array) {
3147
} else if (array == os_aio_read_array) {
3150
} else if (array == os_aio_write_array) {
3160
/***********************************************************************
3161
Gets the aio array for its number. Numbers 1, 2 and 3 select the log, read
and write arrays; the first branch returns the ibuf array.
NOTE(review): the condition of the first branch is not visible in this
listing; presumably n == 0 - confirm. */
3164
os_aio_get_array_from_no(
3165
/*=====================*/
3166
/* out: aio array */
3167
ulint n) /* in: array number */
3170
return(os_aio_ibuf_array);
3171
} else if (n == 1) {
3173
return(os_aio_log_array);
3174
} else if (n == 2) {
3176
return(os_aio_read_array);
3177
} else if (n == 3) {
3179
return(os_aio_write_array);
3186
#endif /* if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO) */
3188
/***********************************************************************
3189
Requests a slot in the aio array. If no slot is available, waits until
3190
not_full-event becomes signaled. The chosen slot is marked reserved and
filled in with the request parameters while the array mutex is held. */
3193
os_aio_array_reserve_slot(
3194
/*======================*/
3195
/* out: pointer to slot */
3196
ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
3197
os_aio_array_t* array, /* in: aio array */
3198
fil_node_t* message1,/* in: message to be passed along with
3199
the aio operation */
3200
void* message2,/* in: message to be passed along with
3201
the aio operation */
3202
os_file_t file, /* in: file handle */
3203
const char* name, /* in: name of the file or path as a
3204
null-terminated string */
3205
void* buf, /* in: buffer where to read or from which
3207
ulint offset, /* in: least significant 32 bits of file
3209
ulint offset_high, /* in: most significant 32 bits of
3211
ulint len) /* in: length of the block to read or write */
3213
os_aio_slot_t* slot;
3215
OVERLAPPED* control;
3217
#elif defined(POSIX_ASYNC_IO)
3219
struct aiocb* control;
3223
os_mutex_enter(array->mutex);
3225
/* Every slot is taken: give up the mutex and wait for a free one */
if (array->n_reserved == array->n_slots) {
3226
os_mutex_exit(array->mutex);
3228
if (!os_aio_use_native_aio) {
3229
/* If the handler threads are suspended, wake them
3230
so that we get more slots */
3232
os_aio_simulated_wake_handler_threads();
3235
os_event_wait(array->not_full);
3241
slot = os_aio_array_get_nth_slot(array, i);
3243
if (slot->reserved == FALSE) {
3248
array->n_reserved++;
3250
/* First reservation: the array is not empty any more */
if (array->n_reserved == 1) {
3251
os_event_reset(array->is_empty);
3254
/* Last free slot taken: clear not_full, which callers wait on above */
if (array->n_reserved == array->n_slots) {
3255
os_event_reset(array->not_full);
3258
slot->reserved = TRUE;
3259
/* os_aio_simulated_handle computes the age of a request from this */
slot->reservation_time = time(NULL);
3260
slot->message1 = message1;
3261
slot->message2 = message2;
3267
slot->offset = offset;
3268
slot->offset_high = offset_high;
3269
slot->io_already_done = FALSE;
3272
control = &(slot->control);
3273
/* The two offset halves go into the OVERLAPPED structure separately */
control->Offset = (DWORD)offset;
3274
control->OffsetHigh = (DWORD)offset_high;
3275
os_event_reset(slot->event);
3277
#elif defined(POSIX_ASYNC_IO)
3279
#if (UNIV_WORD_SIZE == 8)
3280
/* Combine the two 32-bit halves into a single offset */
offset = offset + (offset_high << 32);
3282
ut_a(offset_high == 0);
3284
control = &(slot->control);
3285
control->aio_fildes = file;
3286
control->aio_buf = buf;
3287
control->aio_nbytes = len;
3288
control->aio_offset = offset;
3289
control->aio_reqprio = 0;
3290
control->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
3291
/* The signal number is SIGRTMIN + 1 plus the number of the array */
control->aio_sigevent.sigev_signo
3292
= SIGRTMIN + 1 + os_aio_get_array_no(array);
3293
/* TODO: How to choose the signal numbers? */
3295
fprintf(stderr, "AIO signal number %lu\n",
3296
(ulint) control->aio_sigevent.sigev_signo);
3298
/* The slot pointer travels with the signal; os_aio_posix_handle reads
it back from si_value */
control->aio_sigevent.sigev_value.sival_ptr = slot;
3300
os_mutex_exit(array->mutex);
3305
/***********************************************************************
3306
Frees a slot in the aio array. The slot is marked unreserved and the
reservation count of the array is decremented under the array mutex; the
not_full and is_empty events are set when the new count calls for it. */
3309
os_aio_array_free_slot(
3310
/*===================*/
3311
os_aio_array_t* array, /* in: aio array */
3312
os_aio_slot_t* slot) /* in: pointer to slot */
3317
os_mutex_enter(array->mutex);
3319
ut_ad(slot->reserved);
3321
slot->reserved = FALSE;
3323
array->n_reserved--;
3325
/* The array was full before this release: signal not_full, which
os_aio_array_reserve_slot waits on */
if (array->n_reserved == array->n_slots - 1) {
3326
os_event_set(array->not_full);
3329
/* No reserved slots remain: signal is_empty */
if (array->n_reserved == 0) {
3330
os_event_set(array->is_empty);
3334
os_event_reset(slot->event);
3336
os_mutex_exit(array->mutex);
3339
/**************************************************************************
3340
Wakes up a simulated aio i/o-handler thread if it has something to do.
The slots of the segment are scanned for a reserved one under the array
mutex, and the wait event of the segment is set after the mutex has been
released. */
3343
os_aio_simulated_wake_handler_thread(
3344
/*=================================*/
3345
ulint global_segment) /* in: the number of the segment in the aio
3348
os_aio_array_t* array;
3349
os_aio_slot_t* slot;
3354
ut_ad(!os_aio_use_native_aio);
3356
segment = os_aio_get_array_and_local_segment(&array, global_segment);
3358
n = array->n_slots / array->n_segments;
3360
/* Look through n slots after the segment * n'th slot */
3362
os_mutex_enter(array->mutex);
3364
/* The slots of this segment are the n slots starting at segment * n */
for (i = 0; i < n; i++) {
3365
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3367
if (slot->reserved) {
3368
/* Found an i/o request */
3374
os_mutex_exit(array->mutex);
3377
/* Signal the wait event of this global segment. NOTE(review): the
condition guarding this call is not visible in this listing; per the
function header it should apply only when a request was found. */
os_event_set(os_aio_segment_wait_events[global_segment]);
3381
/**************************************************************************
3382
Wakes up simulated aio i/o-handler threads if they have something to do.
Clears the sleep recommendation for read threads and then calls
os_aio_simulated_wake_handler_thread for every global segment. */
3385
os_aio_simulated_wake_handler_threads(void)
3386
/*=======================================*/
3390
if (os_aio_use_native_aio) {
3391
/* We do not use simulated aio: do nothing */
3396
/* Withdraw the recommendation that read handler threads should sleep */
os_aio_recommend_sleep_for_read_threads = FALSE;
3398
/* Go through every global segment */
for (i = 0; i < os_aio_n_segments; i++) {
3399
os_aio_simulated_wake_handler_thread(i);
3403
/**************************************************************************
3404
This function can be called if one wants to post a batch of reads and
3405
prefers an i/o-handler thread to handle them all at once later. You must
3406
call os_aio_simulated_wake_handler_threads later to ensure the threads
3407
are not left sleeping! The function sets the sleep recommendation flag and
resets the wait event of every segment that belongs to the read array. */
3410
os_aio_simulated_put_read_threads_to_sleep(void)
3411
/*============================================*/
3413
os_aio_array_t* array;
3416
/* os_aio_simulated_handle checks this flag for the read array */
os_aio_recommend_sleep_for_read_threads = TRUE;
3418
/* Visit every global segment and pick out those of the read array */
for (g = 0; g < os_aio_n_segments; g++) {
3419
os_aio_get_array_and_local_segment(&array, g);
3421
if (array == os_aio_read_array) {
3423
/* os_aio_simulated_handle waits on this same event */
os_event_reset(os_aio_segment_wait_events[g]);
3428
/***********************************************************************
3429
Requests an asynchronous i/o operation. A synchronous request without
native aio is performed at once with os_file_read or os_file_write; in the
other cases a slot is reserved in the aio array that serves the mode. */
3434
/* out: TRUE if request was queued
3435
successfully, FALSE if fail */
3436
ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
3437
ulint mode, /* in: OS_AIO_NORMAL, ..., possibly ORed
3438
to OS_AIO_SIMULATED_WAKE_LATER: the
3439
last flag advises this function not to wake
3440
i/o-handler threads, but the caller will
3441
do the waking explicitly later, in this
3442
way the caller can post several requests in
3443
a batch; NOTE that the batch must not be
3444
so big that it exhausts the slots in aio
3445
arrays! NOTE that a simulated batch
3446
may introduce hidden chances of deadlocks,
3447
because i/os are not actually handled until
3448
all have been posted: use with great
3450
const char* name, /* in: name of the file or path as a
3451
null-terminated string */
3452
os_file_t file, /* in: handle to a file */
3453
void* buf, /* in: buffer where to read or from which
3455
ulint offset, /* in: least significant 32 bits of file
3456
offset where to read or write */
3457
ulint offset_high, /* in: most significant 32 bits of
3459
ulint n, /* in: number of bytes to read or write */
3460
fil_node_t* message1,/* in: messages for the aio handler (these
3461
can be used to identify a completed aio
3462
operation); if mode is OS_AIO_SYNC, these
3466
os_aio_array_t* array;
3467
os_aio_slot_t* slot;
3471
DWORD len = (DWORD) n;
3472
struct fil_node_struct * dummy_mess1;
3483
ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
3484
ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
3485
ut_ad(os_aio_validate());
3487
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
3488
mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
3490
if (mode == OS_AIO_SYNC
3492
&& !os_aio_use_native_aio
3495
/* This is actually an ordinary synchronous read or write:
3496
no need to use an i/o-handler thread. NOTE that if we use
3497
Windows async i/o, Windows does not allow us to use
3498
ordinary synchronous os_file_read etc. on the same file,
3499
therefore we have built a special mechanism for synchronous
3500
wait in the Windows case. */
3502
if (type == OS_FILE_READ) {
3503
return(os_file_read(file, buf, offset,
3507
ut_a(type == OS_FILE_WRITE);
3509
return(os_file_write(name, file, buf, offset, offset_high, n));
3513
/* Choose the aio array that serves this mode; in normal mode the
request type decides between the read and the write array */
if (mode == OS_AIO_NORMAL) {
3514
if (type == OS_FILE_READ) {
3515
array = os_aio_read_array;
3517
array = os_aio_write_array;
3519
} else if (mode == OS_AIO_IBUF) {
3520
ut_ad(type == OS_FILE_READ);
3521
/* Reduce probability of deadlock bugs in connection with ibuf:
3522
do not let the ibuf i/o handler sleep */
3526
array = os_aio_ibuf_array;
3527
} else if (mode == OS_AIO_LOG) {
3529
array = os_aio_log_array;
3530
} else if (mode == OS_AIO_SYNC) {
3531
array = os_aio_sync_array;
3533
array = NULL; /* Eliminate compiler warning */
3537
/* Reserve a slot in the chosen array for this request */
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
3538
name, buf, offset, offset_high, n);
3539
if (type == OS_FILE_READ) {
3540
if (os_aio_use_native_aio) {
3543
os_bytes_read_since_printout += len;
3545
ret = ReadFile(file, buf, (DWORD)n, &len,
3547
#elif defined(POSIX_ASYNC_IO)
3548
slot->control.aio_lio_opcode = LIO_READ;
3549
err = (ulint) aio_read(&(slot->control));
3550
fprintf(stderr, "Starting POSIX aio read %lu\n", err);
3554
/* Wake the handler thread of the segment that the new slot belongs to.
NOTE(review): the conditions guarding this call are not visible in this
listing. */
os_aio_simulated_wake_handler_thread(
3555
os_aio_get_segment_no_from_slot(
3559
} else if (type == OS_FILE_WRITE) {
3560
if (os_aio_use_native_aio) {
3563
ret = WriteFile(file, buf, (DWORD)n, &len,
3565
#elif defined(POSIX_ASYNC_IO)
3566
slot->control.aio_lio_opcode = LIO_WRITE;
3567
err = (ulint) aio_write(&(slot->control));
3568
fprintf(stderr, "Starting POSIX aio write %lu\n", err);
3572
os_aio_simulated_wake_handler_thread(
3573
os_aio_get_segment_no_from_slot(
3582
if (os_aio_use_native_aio) {
3583
/* Either the call transferred the full length at once, or it failed
with ERROR_IO_PENDING */
if ((ret && len == n)
3584
|| (!ret && GetLastError() == ERROR_IO_PENDING)) {
3585
/* aio was queued successfully! */
3587
if (mode == OS_AIO_SYNC) {
3588
/* We want a synchronous i/o operation on a
3589
file where we also use async i/o: in Windows
3590
we must use the same wait mechanism as for
3593
retval = os_aio_windows_handle(ULINT_UNDEFINED,
3605
err = 1; /* Fall through the next if */
3609
/* aio was queued successfully! */
3614
/* The slot is released before the error is handed to
os_file_handle_error */
os_aio_array_free_slot(array, slot);
3616
retry = os_file_handle_error(name,
3617
type == OS_FILE_READ
3618
? "aio read" : "aio write");
3628
/**************************************************************************
3629
This function is only used in Windows asynchronous i/o.
3630
Waits for an aio operation to complete. This function is used to wait
3631
for completed requests. The aio array of pending requests is divided
3632
into segments. The thread specifies which segment or slot it wants to wait
3633
for. NOTE: this function will also take care of freeing the aio slot,
3634
therefore no other thread is allowed to do the freeing! */
3637
os_aio_windows_handle(
3638
/*==================*/
3639
/* out: TRUE if the aio operation succeeded */
3640
ulint segment, /* in: the number of the segment in the aio
3641
arrays to wait for; segment 0 is the ibuf
3642
i/o thread, segment 1 the log i/o thread,
3643
then follow the non-ibuf read threads, and as
3644
the last are the non-ibuf write threads; if
3645
this is ULINT_UNDEFINED, then it means that
3646
sync aio is used, and this parameter is
3648
ulint pos, /* this parameter is used only in sync aio:
3649
wait for the aio slot at this position */
3650
fil_node_t**message1, /* out: the messages passed with the aio
3651
request; note that also in the case where
3652
the aio operation failed, these output
3653
parameters are valid and can be used to
3654
restart the operation, for example */
3656
ulint* type) /* out: OS_FILE_WRITE or ..._READ */
3658
/* Remember the caller's segment number: segment is overwritten below
with the local segment number */
ulint orig_seg = segment;
3659
os_aio_array_t* array;
3660
os_aio_slot_t* slot;
3667
/* ULINT_UNDEFINED selects the sync array; any other value is mapped to
its array and local segment */
if (segment == ULINT_UNDEFINED) {
3668
array = os_aio_sync_array;
3671
segment = os_aio_get_array_and_local_segment(&array, segment);
3674
/* NOTE! We only access constant fields in os_aio_array. Therefore
3675
we do not have to acquire the protecting mutex yet */
3677
ut_ad(os_aio_validate());
3678
ut_ad(segment < array->n_segments);
3680
/* Number of slots in each segment of the array */
n = array->n_slots / array->n_segments;
3682
/* Sync aio waits on the event of the single slot at position pos */
if (array == os_aio_sync_array) {
3683
os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
3686
srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
3687
i = os_event_wait_multiple(n,
3688
(array->native_events)
3692
os_mutex_enter(array->mutex);
3694
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3696
ut_a(slot->reserved);
3698
if (orig_seg != ULINT_UNDEFINED) {
3699
srv_set_io_thread_op_info(orig_seg,
3700
"get windows aio return value");
3703
/* The last argument (bWait) is TRUE, so GetOverlappedResult returns only
when the operation has finished */
ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
3705
/* The messages are copied out before the result is examined, so they
are available to the caller on failure as well */
*message1 = slot->message1;
3706
*message2 = slot->message2;
3710
/* The whole requested length must have been transferred */
if (ret && len == slot->len) {
3713
# ifdef UNIV_DO_FLUSH
3714
if (slot->type == OS_FILE_WRITE
3715
&& !os_do_not_call_flush_at_each_write) {
3716
ut_a(TRUE == os_file_flush(slot->file));
3718
# endif /* UNIV_DO_FLUSH */
3720
os_file_handle_error(slot->name, "Windows aio");
3725
os_mutex_exit(array->mutex);
3727
/* Freeing the slot is the responsibility of this function (see the
header comment) */
os_aio_array_free_slot(array, slot);
3733
#ifdef POSIX_ASYNC_IO
3735
/**************************************************************************
3736
This function is only used in Posix asynchronous i/o. Waits for an aio
3737
operation to complete. The completion arrives as the signal
SIGRTMIN + 1 + array_no; the slot is taken from the value delivered with
the signal and is freed before the function finishes. */
3740
os_aio_posix_handle(
3741
/*================*/
3742
/* out: TRUE if the aio operation succeeded */
3743
ulint array_no, /* in: array number 0 - 3 */
3744
fil_node_t**message1, /* out: the messages passed with the aio
3745
request; note that also in the case where
3746
the aio operation failed, these output
3747
parameters are valid and can be used to
3748
restart the operation, for example */
3751
os_aio_array_t* array;
3752
os_aio_slot_t* slot;
3755
sigset_t proc_sigset;
3756
sigset_t thr_sigset;
3761
/* Build a signal set that holds only the signal used for this array */
sigemptyset(&sigset);
3762
sigaddset(&sigset, SIGRTMIN + 1 + array_no);
3764
pthread_sigmask(SIG_UNBLOCK, &sigset, NULL);
3767
/* Debug output: fetch the current process and thread signal masks and
print, for signals 32..39, whether each is a member of them */
sigprocmask(0, NULL, &proc_sigset);
3768
pthread_sigmask(0, NULL, &thr_sigset);
3770
for (i = 32 ; i < 40; i++) {
3771
fprintf(stderr, "%lu : %lu %lu\n", (ulint)i,
3772
(ulint) sigismember(&proc_sigset, i),
3773
(ulint) sigismember(&thr_sigset, i));
3777
/* Wait for the signal; info receives the data delivered with it */
ret = sigwaitinfo(&sigset, &info);
3779
if (sig != SIGRTMIN + 1 + array_no) {
3786
fputs("Handling POSIX aio\n", stderr);
3788
array = os_aio_get_array_from_no(array_no);
3790
os_mutex_enter(array->mutex);
3792
/* os_aio_array_reserve_slot stored the slot pointer in sigev_value */
slot = info.si_value.sival_ptr;
3794
ut_a(slot->reserved);
3796
*message1 = slot->message1;
3797
*message2 = slot->message2;
3799
# ifdef UNIV_DO_FLUSH
3800
if (slot->type == OS_FILE_WRITE
3801
&& !os_do_not_call_flush_at_each_write) {
3802
ut_a(TRUE == os_file_flush(slot->file));
3804
# endif /* UNIV_DO_FLUSH */
3806
os_mutex_exit(array->mutex);
3808
os_aio_array_free_slot(array, slot);
3814
/**************************************************************************
3815
Does simulated aio. This function should be called by an i/o-handler
3819
os_aio_simulated_handle(
3820
/*====================*/
3821
/* out: TRUE if the aio operation succeeded */
3822
ulint global_segment, /* in: the number of the segment in the aio
3823
arrays to wait for; segment 0 is the ibuf
3824
i/o thread, segment 1 the log i/o thread,
3825
then follow the non-ibuf read threads, and as
3826
the last are the non-ibuf write threads */
3827
fil_node_t**message1, /* out: the messages passed with the aio
3828
request; note that also in the case where
3829
the aio operation failed, these output
3830
parameters are valid and can be used to
3831
restart the operation, for example */
3833
ulint* type) /* out: OS_FILE_WRITE or ..._READ */
3835
os_aio_array_t* array;
3837
os_aio_slot_t* slot;
3838
os_aio_slot_t* slot2;
3839
/* The requests that are merged into one i/o; element 0 is the request
chosen first */
os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
3840
ulint n_consecutive;
3843
ulint lowest_offset;
3847
byte* combined_buf2;
3852
segment = os_aio_get_array_and_local_segment(&array, global_segment);
3855
/* NOTE! We only access constant fields in os_aio_array. Therefore
3856
we do not have to acquire the protecting mutex yet */
3858
srv_set_io_thread_op_info(global_segment,
3859
"looking for i/o requests (a)");
3860
ut_ad(os_aio_validate());
3861
ut_ad(segment < array->n_segments);
3863
/* Number of slots in each segment of the array */
n = array->n_slots / array->n_segments;
3865
/* Look through n slots after the segment * n'th slot */
3867
if (array == os_aio_read_array
3868
&& os_aio_recommend_sleep_for_read_threads) {
3870
/* Give other threads chance to add several i/os to the array
3873
goto recommended_sleep;
3876
os_mutex_enter(array->mutex);
3878
srv_set_io_thread_op_info(global_segment,
3879
"looking for i/o requests (b)");
3881
/* Check if there is a slot for which the i/o has already been
3884
for (i = 0; i < n; i++) {
3885
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3887
if (slot->reserved && slot->io_already_done) {
3889
if (os_aio_print_debug) {
3891
"InnoDB: i/o for slot %lu"
3892
" already done, returning\n",
3904
/* If there are at least 2 seconds old requests, then pick the oldest
3905
one to prevent starvation. If several requests have the same age,
3906
then pick the one at the lowest offset. */
3909
/* Start from the largest value so that any real offset compares lower */
lowest_offset = ULINT_MAX;
3911
for (i = 0; i < n; i++) {
3912
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3914
if (slot->reserved) {
3915
/* Age of the request in whole seconds, counted from the time the slot
was reserved */
age = (ulint)difftime(time(NULL),
3916
slot->reservation_time);
3918
if ((age >= 2 && age > biggest_age)
3919
|| (age >= 2 && age == biggest_age
3920
&& slot->offset < lowest_offset)) {
3922
/* Found an i/o request */
3923
consecutive_ios[0] = slot;
3928
lowest_offset = slot->offset;
3933
if (n_consecutive == 0) {
3934
/* There were no old requests. Look for an i/o request at the
3935
lowest offset in the array (we ignore the high 32 bits of the
3936
offset in these heuristics) */
3938
lowest_offset = ULINT_MAX;
3940
for (i = 0; i < n; i++) {
3941
slot = os_aio_array_get_nth_slot(array,
3944
if (slot->reserved && slot->offset < lowest_offset) {
3946
/* Found an i/o request */
3947
consecutive_ios[0] = slot;
3951
lowest_offset = slot->offset;
3956
if (n_consecutive == 0) {
3958
/* No i/o requested at the moment */
3963
slot = consecutive_ios[0];
3965
/* Check if there are several consecutive blocks to read or write */
3968
/* A slot continues the run when it is reserved, starts exactly where the
current slot ends, and has the same high offset word, type and file */
for (i = 0; i < n; i++) {
3969
slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
3971
if (slot2->reserved && slot2 != slot
3972
&& slot2->offset == slot->offset + slot->len
3973
/* check that sum does not wrap over */
3974
&& slot->offset + slot->len > slot->offset
3975
&& slot2->offset_high == slot->offset_high
3976
&& slot2->type == slot->type
3977
&& slot2->file == slot->file) {
3979
/* Found a consecutive i/o request */
3981
consecutive_ios[n_consecutive] = slot2;
3986
/* Keep looking for further consecutive requests until the merge limit
is reached */
if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
3988
goto consecutive_loop;
3995
srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
3997
/* We have now collected n_consecutive i/o requests in the array;
3998
allocate a single buffer which can hold all data, and perform the
4002
slot = consecutive_ios[0];
4004
for (i = 0; i < n_consecutive; i++) {
4005
total_len += consecutive_ios[i]->len;
4008
if (n_consecutive == 1) {
4009
/* We can use the buffer of the i/o request */
4010
combined_buf = slot->buf;
4011
combined_buf2 = NULL;
4013
/* One extra page is allocated so that the start of the buffer can be
aligned to UNIV_PAGE_SIZE below */
combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
4015
ut_a(combined_buf2);
4017
combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
4020
/* We release the array mutex for the time of the i/o: NOTE that
4021
this assumes that there is just one i/o-handler thread serving
4022
a single segment of slots! */
4024
os_mutex_exit(array->mutex);
4026
if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
4027
/* Copy the buffers to the combined buffer */
4030
for (i = 0; i < n_consecutive; i++) {
4032
ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
4033
consecutive_ios[i]->len);
4034
offs += consecutive_ios[i]->len;
4038
srv_set_io_thread_op_info(global_segment, "doing file i/o");
4040
if (os_aio_print_debug) {
4042
"InnoDB: doing i/o of type %lu at offset %lu %lu,"
4044
(ulong) slot->type, (ulong) slot->offset_high,
4045
(ulong) slot->offset, (ulong) total_len);
4048
/* Do the i/o with ordinary, synchronous i/o functions: */
4049
if (slot->type == OS_FILE_WRITE) {
4050
ret = os_file_write(slot->name, slot->file, combined_buf,
4051
slot->offset, slot->offset_high,
4054
ret = os_file_read(slot->file, combined_buf,
4055
slot->offset, slot->offset_high, total_len);
4059
srv_set_io_thread_op_info(global_segment, "file i/o done");
4063
"aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
4064
n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
4067
if (slot->type == OS_FILE_READ && n_consecutive > 1) {
4068
/* Copy the combined buffer to individual buffers */
4071
for (i = 0; i < n_consecutive; i++) {
4073
ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
4074
consecutive_ios[i]->len);
4075
offs += consecutive_ios[i]->len;
4079
/* Free the merge buffer if one was allocated; it was set to NULL above
when the buffer of the single request was used directly */
if (combined_buf2) {
4080
ut_free(combined_buf2);
4083
os_mutex_enter(array->mutex);
4085
/* Mark the i/os done in slots */
4087
for (i = 0; i < n_consecutive; i++) {
4088
consecutive_ios[i]->io_already_done = TRUE;
4091
/* We return the messages for the first slot now, and if there were
4092
several slots, the messages will be returned with subsequent calls
4097
ut_a(slot->reserved);
4099
*message1 = slot->message1;
4100
*message2 = slot->message2;
4104
os_mutex_exit(array->mutex);
4106
os_aio_array_free_slot(array, slot);
4111
srv_set_io_thread_op_info(global_segment, "resetting wait event");
4113
/* We wait here until there again can be i/os in the segment
4116
os_event_reset(os_aio_segment_wait_events[global_segment]);
4118
os_mutex_exit(array->mutex);
4121
srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
4123
os_event_wait(os_aio_segment_wait_events[global_segment]);
4125
if (os_aio_print_debug) {
4127
"InnoDB: i/o handler thread for i/o"
4128
" segment %lu wakes up\n",
4129
(ulong) global_segment);
4135
/**************************************************************************
4136
Validates the consistency of an aio array. Under the array mutex it
asserts that the array has at least one slot and one segment, that every
reserved slot has a positive length, and that array->n_reserved equals
the local count n_reserved. */
4139
os_aio_array_validate(
4140
/*==================*/
4141
/* out: TRUE if ok */
4142
os_aio_array_t* array) /* in: aio wait array */
4144
os_aio_slot_t* slot;
4145
ulint n_reserved = 0;
4150
os_mutex_enter(array->mutex);
4152
ut_a(array->n_slots > 0);
4153
ut_a(array->n_segments > 0);
4155
/* Go through every slot of the array */
for (i = 0; i < array->n_slots; i++) {
4156
slot = os_aio_array_get_nth_slot(array, i);
4158
if (slot->reserved) {
4160
/* A reserved slot must describe a non-empty transfer */
ut_a(slot->len > 0);
4164
/* The counter kept in the array must agree with the local count */
ut_a(array->n_reserved == n_reserved);
4166
os_mutex_exit(array->mutex);
4171
/**************************************************************************
4172
Validates the consistency of the aio system by running
os_aio_array_validate on the read, write, ibuf, log and sync arrays. */
4175
os_aio_validate(void)
4176
/*=================*/
4177
/* out: TRUE if ok */
4179
/* Each call asserts internally if its array is inconsistent */
os_aio_array_validate(os_aio_read_array);
4180
os_aio_array_validate(os_aio_write_array);
4181
os_aio_array_validate(os_aio_ibuf_array);
4182
os_aio_array_validate(os_aio_log_array);
4183
os_aio_array_validate(os_aio_sync_array);
4188
/**************************************************************************
4189
Prints info of the aio arrays. The output has one status line per i/o
thread, the number of reserved slots in the read, write, ibuf, log and
sync arrays, and i/o counters together with rates computed against the
values saved at the previous printout. */
4194
FILE* file) /* in: file where to print */
4196
os_aio_array_t* array;
4197
os_aio_slot_t* slot;
4199
time_t current_time;
4200
double time_elapsed;
4201
double avg_bytes_read;
4204
/* One status line for each i/o handler thread */
for (i = 0; i < srv_n_file_io_threads; i++) {
4205
fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
4206
srv_io_thread_op_info[i],
4207
srv_io_thread_function[i]);
4210
if (os_aio_segment_wait_events[i]->is_set) {
4211
fprintf(file, " ev set");
4215
fprintf(file, "\n");
4218
fputs("Pending normal aio reads:", file);
4220
/* The read array is reported first */
array = os_aio_read_array;
4224
os_mutex_enter(array->mutex);
4226
ut_a(array->n_slots > 0);
4227
ut_a(array->n_segments > 0);
4231
/* Go through every slot of the current array */
for (i = 0; i < array->n_slots; i++) {
4232
slot = os_aio_array_get_nth_slot(array, i);
4234
if (slot->reserved) {
4237
fprintf(stderr, "Reserved slot, messages %p %p\n",
4238
(void*) slot->message1,
4239
(void*) slot->message2);
4241
ut_a(slot->len > 0);
4245
ut_a(array->n_reserved == n_reserved);
4247
fprintf(file, " %lu", (ulong) n_reserved);
4249
os_mutex_exit(array->mutex);
4251
/* Move on to the next array, in the order read, write, ibuf, log, sync */
if (array == os_aio_read_array) {
4252
fputs(", aio writes:", file);
4254
array = os_aio_write_array;
4259
if (array == os_aio_write_array) {
4260
fputs(",\n ibuf aio reads:", file);
4261
array = os_aio_ibuf_array;
4266
if (array == os_aio_ibuf_array) {
4267
fputs(", log i/o's:", file);
4268
array = os_aio_log_array;
4273
if (array == os_aio_log_array) {
4274
fputs(", sync i/o's:", file);
4275
array = os_aio_sync_array;
4281
current_time = time(NULL);
4282
/* Seconds since the previous printout. NOTE(review): the added 0.001
presumably keeps the value non-zero for use as a divisor; the divisions
are not visible in this listing - confirm. */
time_elapsed = 0.001 + difftime(current_time, os_last_printout);
4285
"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
4286
"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
4287
(ulong) fil_n_pending_log_flushes,
4288
(ulong) fil_n_pending_tablespace_flushes,
4289
(ulong) os_n_file_reads, (ulong) os_n_file_writes,
4290
(ulong) os_n_fsyncs);
4292
if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
4294
"%lu pending preads, %lu pending pwrites\n",
4295
(ulong) os_file_n_pending_preads,
4296
(ulong) os_file_n_pending_pwrites);
4299
/* No reads since the previous printout: report an average of zero
instead of dividing by zero */
if (os_n_file_reads == os_n_file_reads_old) {
4300
avg_bytes_read = 0.0;
4302
avg_bytes_read = (double) os_bytes_read_since_printout
4303
/ (os_n_file_reads - os_n_file_reads_old);
4307
"%.2f reads/s, %lu avg bytes/read,"
4308
" %.2f writes/s, %.2f fsyncs/s\n",
4309
(os_n_file_reads - os_n_file_reads_old)
4311
(ulong)avg_bytes_read,
4312
(os_n_file_writes - os_n_file_writes_old)
4314
(os_n_fsyncs - os_n_fsyncs_old)
4317
/* Save the current counters as the baseline for the next printout */
os_n_file_reads_old = os_n_file_reads;
4318
os_n_file_writes_old = os_n_file_writes;
4319
os_n_fsyncs_old = os_n_fsyncs;
4320
os_bytes_read_since_printout = 0;
4322
os_last_printout = current_time;
4325
/**************************************************************************
4326
Refreshes the statistics used to print per-second averages. The current
read, write and fsync counters are saved as the "old" values, the count of
bytes read since the last printout is cleared, and the printout time is
set to the current time. */
4329
os_aio_refresh_stats(void)
4330
/*======================*/
4332
/* Save the current counters as the new baseline */
os_n_file_reads_old = os_n_file_reads;
4333
os_n_file_writes_old = os_n_file_writes;
4334
os_n_fsyncs_old = os_n_fsyncs;
4335
os_bytes_read_since_printout = 0;
4337
/* Elapsed time is measured from this moment on */
os_last_printout = time(NULL);
4341
/**************************************************************************
4342
Checks that all slots in the system have been freed, that is, there are
4343
no pending io operations. The n_reserved counts of the read, write, ibuf,
log and sync arrays are added up in n_res, each read under the mutex of
its own array. */
4346
os_aio_all_slots_free(void)
4347
/*=======================*/
4348
/* out: TRUE if all free */
4350
os_aio_array_t* array;
4353
/* Read array */
array = os_aio_read_array;
4355
os_mutex_enter(array->mutex);
4357
n_res += array->n_reserved;
4359
os_mutex_exit(array->mutex);
4361
/* Write array */
array = os_aio_write_array;
4363
os_mutex_enter(array->mutex);
4365
n_res += array->n_reserved;
4367
os_mutex_exit(array->mutex);
4369
/* Ibuf array */
array = os_aio_ibuf_array;
4371
os_mutex_enter(array->mutex);
4373
n_res += array->n_reserved;
4375
os_mutex_exit(array->mutex);
4377
/* Log array */
array = os_aio_log_array;
4379
os_mutex_enter(array->mutex);
4381
n_res += array->n_reserved;
4383
os_mutex_exit(array->mutex);
4385
/* Sync array */
array = os_aio_sync_array;
4387
os_mutex_enter(array->mutex);
4389
n_res += array->n_reserved;
4391
os_mutex_exit(array->mutex);
4400
#endif /* UNIV_DEBUG */