1
/******************************************************
2
The interface to the operating system file i/o primitives
6
Created 10/21/1995 Heikki Tuuri
7
*******************************************************/
11
#include "os0thread.h"
14
#include "srv0start.h"
18
#if defined(UNIV_HOTBACKUP) && defined(__WIN__)
19
/* Add includes for the _stat() call to compile on Windows */
20
#include <sys/types.h>
23
#endif /* UNIV_HOTBACKUP */
26
/* We assume in this case that the OS has standard Posix aio (at least SunOS
27
2.6, HP-UX 11i and AIX 4.3 have it) */
31
/* This specifies the file permissions InnoDB uses when it creates files in
32
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
36
ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
38
ulint os_innodb_umask = 0;
42
/* If the following is set to TRUE, we do not call os_file_flush in every
43
os_file_write. We can set this TRUE when the doublewrite buffer is used. */
44
ibool os_do_not_call_flush_at_each_write = FALSE;
46
/* We do not call os_file_flush in every os_file_write. */
47
#endif /* UNIV_DO_FLUSH */
49
/* We use these mutexes to protect lseek + file i/o operation, if the
50
OS does not provide an atomic pread or pwrite, or similar */
51
#define OS_FILE_N_SEEK_MUTEXES 16
52
os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
54
/* In simulated aio, merge at most this many consecutive i/os */
55
#define OS_AIO_MERGE_N_CONSECUTIVE 64
57
/* If this flag is TRUE, then we will use the native aio of the
58
OS (provided we compiled Innobase with it in), otherwise we will
59
use simulated aio we build below with threads */
61
ibool os_aio_use_native_aio = FALSE;
63
ibool os_aio_print_debug = FALSE;
65
/* The aio array slot structure */
66
typedef struct os_aio_slot_struct os_aio_slot_t;
68
struct os_aio_slot_struct{
69
ibool is_read; /* TRUE if a read operation */
70
ulint pos; /* index of the slot in the aio
72
ibool reserved; /* TRUE if this slot is reserved */
73
time_t reservation_time;/* time when reserved */
74
ulint len; /* length of the block to read or
76
byte* buf; /* buffer used in i/o */
77
ulint type; /* OS_FILE_READ or OS_FILE_WRITE */
78
ulint offset; /* 32 low bits of file offset in
80
ulint offset_high; /* 32 high bits of file offset */
81
os_file_t file; /* file where to read or write */
82
const char* name; /* file name or path */
83
ibool io_already_done;/* used only in simulated aio:
84
TRUE if the physical i/o already
85
made and only the slot message
86
needs to be passed to the caller
87
of os_aio_simulated_handle */
88
fil_node_t* message1; /* message which is given by the */
89
void* message2; /* the requester of an aio operation
90
and which can be used to identify
91
which pending aio operation was
94
os_event_t event; /* event object we need in the
96
OVERLAPPED control; /* Windows control block for the
98
#elif defined(POSIX_ASYNC_IO)
99
struct aiocb control; /* Posix control block for aio
104
/* The aio array structure */
105
typedef struct os_aio_array_struct os_aio_array_t;
107
struct os_aio_array_struct{
108
os_mutex_t mutex; /* the mutex protecting the aio array */
109
os_event_t not_full; /* The event which is set to the signaled
110
state when there is space in the aio
111
outside the ibuf segment */
112
os_event_t is_empty; /* The event which is set to the signaled
113
state when there are no pending i/os
115
ulint n_slots; /* Total number of slots in the aio array.
116
This must be divisible by n_threads. */
117
ulint n_segments;/* Number of segments in the aio array of
118
pending aio requests. A thread can wait
119
separately for any one of the segments. */
120
ulint n_reserved;/* Number of reserved slots in the
121
aio array outside the ibuf segment */
122
os_aio_slot_t* slots; /* Pointer to the slots in the array */
124
os_native_event_t* native_events;
125
/* Pointer to an array of OS native event
126
handles where we copied the handles from
127
slots, in the same order. This can be used
128
in WaitForMultipleObjects; used only in
133
/* Array of events used in simulated aio */
134
os_event_t* os_aio_segment_wait_events = NULL;
136
/* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
137
are NULL when the module has not yet been initialized. */
138
static os_aio_array_t* os_aio_read_array = NULL;
139
static os_aio_array_t* os_aio_write_array = NULL;
140
static os_aio_array_t* os_aio_ibuf_array = NULL;
141
static os_aio_array_t* os_aio_log_array = NULL;
142
static os_aio_array_t* os_aio_sync_array = NULL;
144
static ulint os_aio_n_segments = ULINT_UNDEFINED;
146
/* If the following is TRUE, read i/o handler threads try to
147
wait until a batch of new read requests has been posted */
148
static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
150
ulint os_n_file_reads = 0;
151
ulint os_bytes_read_since_printout = 0;
152
ulint os_n_file_writes = 0;
153
ulint os_n_fsyncs = 0;
154
ulint os_n_file_reads_old = 0;
155
ulint os_n_file_writes_old = 0;
156
ulint os_n_fsyncs_old = 0;
157
time_t os_last_printout;
159
ibool os_has_said_disk_full = FALSE;
161
/* The mutex protecting the following counts of pending I/O operations */
162
static os_mutex_t os_file_count_mutex;
163
ulint os_file_n_pending_preads = 0;
164
ulint os_file_n_pending_pwrites = 0;
165
ulint os_n_pending_writes = 0;
166
ulint os_n_pending_reads = 0;
168
/***************************************************************************
169
Gets the operating system version. Currently works only on Windows. */
172
os_get_os_version(void)
173
/*===================*/
174
/* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
177
OSVERSIONINFO os_info;
179
os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
181
ut_a(GetVersionEx(&os_info));
183
if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
185
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
187
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
188
if (os_info.dwMajorVersion <= 4) {
204
/***************************************************************************
205
Retrieves the last error number if an error occurs in a file io function.
206
The number should be retrieved before any other OS calls (because they may
207
overwrite the error number). If the number is not known to this program,
208
the OS error number + 100 is returned. */
211
os_file_get_last_error(
212
/*===================*/
213
/* out: error number, or OS error
215
ibool report_all_errors) /* in: TRUE if we want an error message
216
printed of all errors */
222
err = (ulint) GetLastError();
224
if (report_all_errors
225
|| (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
227
ut_print_timestamp(stderr);
229
" InnoDB: Operating system error number %lu"
230
" in a file operation.\n", (ulong) err);
232
if (err == ERROR_PATH_NOT_FOUND) {
234
"InnoDB: The error means the system"
235
" cannot find the path specified.\n");
237
if (srv_is_being_started) {
239
"InnoDB: If you are installing InnoDB,"
240
" remember that you must create\n"
241
"InnoDB: directories yourself, InnoDB"
242
" does not create them.\n");
244
} else if (err == ERROR_ACCESS_DENIED) {
246
"InnoDB: The error means mysqld does not have"
247
" the access rights to\n"
248
"InnoDB: the directory. It may also be"
249
" you have created a subdirectory\n"
250
"InnoDB: of the same name as a data file.\n");
251
} else if (err == ERROR_SHARING_VIOLATION
252
|| err == ERROR_LOCK_VIOLATION) {
254
"InnoDB: The error means that another program"
255
" is using InnoDB's files.\n"
256
"InnoDB: This might be a backup or antivirus"
257
" software or another instance\n"
259
" Please close it to get rid of this error.\n");
262
"InnoDB: Some operating system error numbers"
263
" are described at\n"
265
"http://dev.mysql.com/doc/refman/5.1/en/"
266
"operating-system-error-codes.html\n");
272
if (err == ERROR_FILE_NOT_FOUND) {
273
return(OS_FILE_NOT_FOUND);
274
} else if (err == ERROR_DISK_FULL) {
275
return(OS_FILE_DISK_FULL);
276
} else if (err == ERROR_FILE_EXISTS) {
277
return(OS_FILE_ALREADY_EXISTS);
278
} else if (err == ERROR_SHARING_VIOLATION
279
|| err == ERROR_LOCK_VIOLATION) {
280
return(OS_FILE_SHARING_VIOLATION);
287
if (report_all_errors
288
|| (err != ENOSPC && err != EEXIST)) {
290
ut_print_timestamp(stderr);
292
" InnoDB: Operating system error number %lu"
293
" in a file operation.\n", (ulong) err);
297
"InnoDB: The error means the system"
298
" cannot find the path specified.\n");
300
if (srv_is_being_started) {
302
"InnoDB: If you are installing InnoDB,"
303
" remember that you must create\n"
304
"InnoDB: directories yourself, InnoDB"
305
" does not create them.\n");
307
} else if (err == EACCES) {
309
"InnoDB: The error means mysqld does not have"
310
" the access rights to\n"
311
"InnoDB: the directory.\n");
313
if (strerror((int)err) != NULL) {
315
"InnoDB: Error number %lu"
317
err, strerror((int)err));
321
"InnoDB: Some operating system"
322
" error numbers are described at\n"
324
"http://dev.mysql.com/doc/refman/5.1/en/"
325
"operating-system-error-codes.html\n");
332
return(OS_FILE_DISK_FULL);
333
#ifdef POSIX_ASYNC_IO
334
} else if (err == EAGAIN) {
335
return(OS_FILE_AIO_RESOURCES_RESERVED);
337
} else if (err == ENOENT) {
338
return(OS_FILE_NOT_FOUND);
339
} else if (err == EEXIST) {
340
return(OS_FILE_ALREADY_EXISTS);
341
} else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
342
return(OS_FILE_PATH_ERROR);
349
/********************************************************************
350
Does error handling when a file operation fails.
351
Conditionally exits (calling exit(3)) based on should_exit value and the
356
os_file_handle_error_cond_exit(
357
/*===========================*/
358
/* out: TRUE if we should retry the
360
const char* name, /* in: name of a file or NULL */
361
const char* operation, /* in: operation */
362
ibool should_exit) /* in: call exit(3) if unknown error
363
and this parameter is TRUE */
367
err = os_file_get_last_error(FALSE);
369
if (err == OS_FILE_DISK_FULL) {
370
/* We only print a warning about disk full once */
372
if (os_has_said_disk_full) {
378
ut_print_timestamp(stderr);
380
" InnoDB: Encountered a problem with"
384
ut_print_timestamp(stderr);
386
" InnoDB: Disk is full. Try to clean the disk"
387
" to free space.\n");
389
os_has_said_disk_full = TRUE;
394
} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
397
} else if (err == OS_FILE_ALREADY_EXISTS
398
|| err == OS_FILE_PATH_ERROR) {
401
} else if (err == OS_FILE_SHARING_VIOLATION) {
403
os_thread_sleep(10000000); /* 10 sec */
407
fprintf(stderr, "InnoDB: File name %s\n", name);
410
fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
414
fprintf(stderr, "InnoDB: Cannot continue operation.\n");
425
/********************************************************************
426
Does error handling when a file operation fails. */
429
os_file_handle_error(
430
/*=================*/
431
/* out: TRUE if we should retry the
433
const char* name, /* in: name of a file or NULL */
434
const char* operation)/* in: operation */
436
/* exit in case of unknown error */
437
return(os_file_handle_error_cond_exit(name, operation, TRUE));
440
/********************************************************************
441
Does error handling when a file operation fails. */
444
os_file_handle_error_no_exit(
445
/*=========================*/
446
/* out: TRUE if we should retry the
448
const char* name, /* in: name of a file or NULL */
449
const char* operation)/* in: operation */
451
/* don't exit in case of unknown error */
452
return(os_file_handle_error_cond_exit(name, operation, FALSE));
456
#define USE_FILE_LOCK
457
#if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__NETWARE__)
458
/* InnoDB Hot Backup does not lock the data files.
459
* On Windows, mandatory locking is used.
461
# undef USE_FILE_LOCK
464
/********************************************************************
465
Obtain an exclusive lock on a file. */
470
/* out: 0 on success */
471
int fd, /* in: file descriptor */
472
const char* name) /* in: file name */
476
lk.l_whence = SEEK_SET;
477
lk.l_start = lk.l_len = 0;
478
if (fcntl(fd, F_SETLK, &lk) == -1) {
480
"InnoDB: Unable to lock %s, error: %d\n", name, errno);
482
if (errno == EAGAIN || errno == EACCES) {
484
"InnoDB: Check that you do not already have"
485
" another mysqld process\n"
486
"InnoDB: using the same InnoDB data"
495
#endif /* USE_FILE_LOCK */
497
/********************************************************************
498
Creates the seek mutexes used in positioned reads and writes. */
501
os_io_init_simple(void)
502
/*===================*/
506
os_file_count_mutex = os_mutex_create(NULL);
508
for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
509
os_file_seek_mutexes[i] = os_mutex_create(NULL);
513
#if !defined(UNIV_HOTBACKUP) && !defined(__NETWARE__)
514
/*************************************************************************
515
Creates a temporary file that will be deleted on close.
516
This function is defined in ha_innodb.cc. */
519
innobase_mysql_tmpfile(void);
520
/*========================*/
521
/* out: temporary file descriptor, or < 0 on error */
522
#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
524
/***************************************************************************
525
Creates a temporary file. This function is like tmpfile(3), but
526
the temporary file is created in the MySQL temporary directory.
527
On Netware, this function is like tmpfile(3), because the C run-time
528
library of Netware does not expose the delete-on-close flag. */
531
os_file_create_tmpfile(void)
532
/*========================*/
533
/* out: temporary file handle, or NULL on error */
535
#ifdef UNIV_HOTBACKUP
541
FILE* file = tmpfile();
542
# else /* __NETWARE__ */
544
int fd = innobase_mysql_tmpfile();
547
file = fdopen(fd, "w+b");
549
# endif /* __NETWARE__ */
552
ut_print_timestamp(stderr);
554
" InnoDB: Error: unable to create temporary file;"
555
" errno: %d\n", errno);
560
# endif /* !__NETWARE__ */
564
#endif /* UNIV_HOTBACKUP */
567
/***************************************************************************
568
The os_file_opendir() function opens a directory stream corresponding to the
569
directory named by the dirname argument. The directory stream is positioned
570
at the first entry. In both Unix and Windows we automatically skip the '.'
571
and '..' items at the start of the directory listing. */
576
/* out: directory stream, NULL if
578
const char* dirname, /* in: directory name; it must not
579
contain a trailing '\' or '/' */
580
ibool error_is_fatal) /* in: TRUE if we should treat an
581
error as a fatal error; if we try to
582
open symlinks then we do not wish a
583
fatal error if it happens not to be
588
LPWIN32_FIND_DATA lpFindFileData;
589
char path[OS_FILE_MAX_PATH + 3];
591
ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
593
strcpy(path, dirname);
594
strcpy(path + strlen(path), "\\*");
596
/* Note that in Windows opening the 'directory stream' also retrieves
597
the first entry in the directory. Since it is '.', that is no problem,
598
as we will skip over the '.' and '..' entries anyway. */
600
lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
602
dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
604
ut_free(lpFindFileData);
606
if (dir == INVALID_HANDLE_VALUE) {
608
if (error_is_fatal) {
609
os_file_handle_error(dirname, "opendir");
617
dir = opendir(dirname);
619
if (dir == NULL && error_is_fatal) {
620
os_file_handle_error(dirname, "opendir");
627
/***************************************************************************
628
Closes a directory stream. */
633
/* out: 0 if success, -1 if failure */
634
os_file_dir_t dir) /* in: directory stream */
639
ret = FindClose(dir);
642
os_file_handle_error_no_exit(NULL, "closedir");
654
os_file_handle_error_no_exit(NULL, "closedir");
661
/***************************************************************************
662
This function returns information about the next file in the directory. We jump
663
over the '.' and '..' entries in the directory. */
666
os_file_readdir_next_file(
667
/*======================*/
668
/* out: 0 if ok, -1 if error, 1 if at the end
670
const char* dirname,/* in: directory name or path */
671
os_file_dir_t dir, /* in: directory stream */
672
os_file_stat_t* info) /* in/out: buffer where the info is returned */
675
LPWIN32_FIND_DATA lpFindFileData;
678
lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
680
ret = FindNextFile(dir, lpFindFileData);
683
ut_a(strlen((char *) lpFindFileData->cFileName)
686
if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
687
|| strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
692
strcpy(info->name, (char *) lpFindFileData->cFileName);
694
info->size = (ib_longlong)(lpFindFileData->nFileSizeLow)
695
+ (((ib_longlong)(lpFindFileData->nFileSizeHigh))
698
if (lpFindFileData->dwFileAttributes
699
& FILE_ATTRIBUTE_REPARSE_POINT) {
700
/* TODO: test Windows symlinks */
701
/* TODO: MySQL has apparently its own symlink
702
implementation in Windows, dbname.sym can
703
redirect a database directory:
704
http://dev.mysql.com/doc/refman/5.1/en/
705
windows-symbolic-links.html */
706
info->type = OS_FILE_TYPE_LINK;
707
} else if (lpFindFileData->dwFileAttributes
708
& FILE_ATTRIBUTE_DIRECTORY) {
709
info->type = OS_FILE_TYPE_DIR;
711
/* It is probably safest to assume that all other
712
file types are normal. Better to check them rather
713
than blindly skip them. */
715
info->type = OS_FILE_TYPE_FILE;
719
ut_free(lpFindFileData);
723
} else if (GetLastError() == ERROR_NO_MORE_FILES) {
727
os_file_handle_error_no_exit(dirname,
728
"readdir_next_file");
735
struct stat statinfo;
736
#ifdef HAVE_READDIR_R
737
char dirent_buf[sizeof(struct dirent)
738
+ _POSIX_PATH_MAX + 100];
739
/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
740
the max file name len; but in most standards, the
741
length is NAME_MAX; we add 100 to be even safer */
746
#ifdef HAVE_READDIR_R
747
ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
751
"InnoDB: cannot read directory %s, error %lu\n",
752
dirname, (ulong)ret);
758
/* End of directory */
763
ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
772
ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
774
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
779
strcpy(info->name, ent->d_name);
781
full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
783
sprintf(full_path, "%s/%s", dirname, ent->d_name);
785
ret = stat(full_path, &statinfo);
788
os_file_handle_error_no_exit(full_path, "stat");
795
info->size = (ib_longlong)statinfo.st_size;
797
if (S_ISDIR(statinfo.st_mode)) {
798
info->type = OS_FILE_TYPE_DIR;
799
} else if (S_ISLNK(statinfo.st_mode)) {
800
info->type = OS_FILE_TYPE_LINK;
801
} else if (S_ISREG(statinfo.st_mode)) {
802
info->type = OS_FILE_TYPE_FILE;
804
info->type = OS_FILE_TYPE_UNKNOWN;
813
/*********************************************************************
814
This function attempts to create a directory named pathname. The new directory
815
gets default permissions. On Unix the permissions are (0770 & ~umask). If the
816
directory exists already, nothing is done and the call succeeds, unless the
817
fail_if_exists argument is true. */
820
os_file_create_directory(
821
/*=====================*/
822
/* out: TRUE if call succeeds,
824
const char* pathname, /* in: directory name as
825
null-terminated string */
826
ibool fail_if_exists) /* in: if TRUE, pre-existing directory
827
is treated as an error. */
832
rcode = CreateDirectory((LPCTSTR) pathname, NULL);
834
|| (GetLastError() == ERROR_ALREADY_EXISTS
835
&& !fail_if_exists))) {
837
os_file_handle_error(pathname, "CreateDirectory");
846
rcode = mkdir(pathname, 0770);
848
if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
850
os_file_handle_error(pathname, "mkdir");
859
/********************************************************************
860
A simple function to open or create a file. */
863
os_file_create_simple(
864
/*==================*/
865
/* out, own: handle to the file, not defined
866
if error, error number can be retrieved with
867
os_file_get_last_error */
868
const char* name, /* in: name of the file or path as a
869
null-terminated string */
870
ulint create_mode,/* in: OS_FILE_OPEN if an existing file is
871
opened (if does not exist, error), or
872
OS_FILE_CREATE if a new file is created
873
(if exists, error), or
874
OS_FILE_CREATE_PATH if new file
875
(if exists, error) and subdirectories along
876
its path are created (if needed)*/
877
ulint access_type,/* in: OS_FILE_READ_ONLY or
878
OS_FILE_READ_WRITE */
879
ibool* success)/* out: TRUE if succeed, FALSE if error */
885
DWORD attributes = 0;
891
if (create_mode == OS_FILE_OPEN) {
892
create_flag = OPEN_EXISTING;
893
} else if (create_mode == OS_FILE_CREATE) {
894
create_flag = CREATE_NEW;
895
} else if (create_mode == OS_FILE_CREATE_PATH) {
896
/* create subdirs along the path if needed */
897
*success = os_file_create_subdirs_if_needed(name);
901
create_flag = CREATE_NEW;
902
create_mode = OS_FILE_CREATE;
908
if (access_type == OS_FILE_READ_ONLY) {
909
access = GENERIC_READ;
910
} else if (access_type == OS_FILE_READ_WRITE) {
911
access = GENERIC_READ | GENERIC_WRITE;
917
file = CreateFile((LPCTSTR) name,
919
FILE_SHARE_READ | FILE_SHARE_WRITE,
920
/* file can be read and written also
921
by other processes */
922
NULL, /* default security attributes */
925
NULL); /* no template file */
927
if (file == INVALID_HANDLE_VALUE) {
930
retry = os_file_handle_error(name,
931
create_mode == OS_FILE_OPEN ?
949
if (create_mode == OS_FILE_OPEN) {
950
if (access_type == OS_FILE_READ_ONLY) {
951
create_flag = O_RDONLY;
953
create_flag = O_RDWR;
955
} else if (create_mode == OS_FILE_CREATE) {
956
create_flag = O_RDWR | O_CREAT | O_EXCL;
957
} else if (create_mode == OS_FILE_CREATE_PATH) {
958
/* create subdirs along the path if needed */
959
*success = os_file_create_subdirs_if_needed(name);
963
create_flag = O_RDWR | O_CREAT | O_EXCL;
964
create_mode = OS_FILE_CREATE;
970
if (create_mode == OS_FILE_CREATE) {
971
file = open(name, create_flag, S_IRUSR | S_IWUSR
972
| S_IRGRP | S_IWGRP);
974
file = open(name, create_flag);
980
retry = os_file_handle_error(name,
981
create_mode == OS_FILE_OPEN ?
987
} else if (access_type == OS_FILE_READ_WRITE
988
&& os_file_lock(file, name)) {
1001
/********************************************************************
1002
A simple function to open or create a file. */
1005
os_file_create_simple_no_error_handling(
1006
/*====================================*/
1007
/* out, own: handle to the file, not defined
1008
if error, error number can be retrieved with
1009
os_file_get_last_error */
1010
const char* name, /* in: name of the file or path as a
1011
null-terminated string */
1012
ulint create_mode,/* in: OS_FILE_OPEN if an existing file
1013
is opened (if does not exist, error), or
1014
OS_FILE_CREATE if a new file is created
1015
(if exists, error) */
1016
ulint access_type,/* in: OS_FILE_READ_ONLY,
1017
OS_FILE_READ_WRITE, or
1018
OS_FILE_READ_ALLOW_DELETE; the last option is
1019
used by a backup program reading the file */
1020
ibool* success)/* out: TRUE if succeed, FALSE if error */
1026
DWORD attributes = 0;
1027
DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
1031
if (create_mode == OS_FILE_OPEN) {
1032
create_flag = OPEN_EXISTING;
1033
} else if (create_mode == OS_FILE_CREATE) {
1034
create_flag = CREATE_NEW;
1040
if (access_type == OS_FILE_READ_ONLY) {
1041
access = GENERIC_READ;
1042
} else if (access_type == OS_FILE_READ_WRITE) {
1043
access = GENERIC_READ | GENERIC_WRITE;
1044
} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1045
access = GENERIC_READ;
1046
share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
1047
| FILE_SHARE_WRITE; /* A backup program has to give
1048
mysqld the maximum freedom to
1049
do what it likes with the
1056
file = CreateFile((LPCTSTR) name,
1059
NULL, /* default security attributes */
1062
NULL); /* no template file */
1064
if (file == INVALID_HANDLE_VALUE) {
1077
if (create_mode == OS_FILE_OPEN) {
1078
if (access_type == OS_FILE_READ_ONLY) {
1079
create_flag = O_RDONLY;
1081
create_flag = O_RDWR;
1083
} else if (create_mode == OS_FILE_CREATE) {
1084
create_flag = O_RDWR | O_CREAT | O_EXCL;
1090
if (create_mode == OS_FILE_CREATE) {
1091
file = open(name, create_flag, S_IRUSR | S_IWUSR
1092
| S_IRGRP | S_IWGRP);
1094
file = open(name, create_flag);
1099
#ifdef USE_FILE_LOCK
1100
} else if (access_type == OS_FILE_READ_WRITE
1101
&& os_file_lock(file, name)) {
1111
#endif /* __WIN__ */
1114
/********************************************************************
1115
Tries to disable OS caching on an opened file descriptor. */
1118
os_file_set_nocache(
1119
/*================*/
1120
int fd, /* in: file descriptor to alter */
1121
const char* file_name, /* in: used in the diagnostic message */
1122
const char* operation_name) /* in: used in the diagnostic message,
1123
we call os_file_set_nocache()
1124
immediately after opening or creating
1125
a file, so this is either "open" or
1128
/* some versions of Solaris may not have DIRECTIO_ON */
1129
#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1130
if (directio(fd, DIRECTIO_ON) == -1) {
1132
errno_save = (int)errno;
1133
ut_print_timestamp(stderr);
1135
" InnoDB: Failed to set DIRECTIO_ON "
1136
"on file %s: %s: %s, continuing anyway\n",
1137
file_name, operation_name, strerror(errno_save));
1139
#elif defined(O_DIRECT)
1140
if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1142
errno_save = (int)errno;
1143
ut_print_timestamp(stderr);
1145
" InnoDB: Failed to set O_DIRECT "
1146
"on file %s: %s: %s, continuing anyway\n",
1147
file_name, operation_name, strerror(errno_save));
1148
if (errno_save == EINVAL) {
1149
ut_print_timestamp(stderr);
1151
" InnoDB: O_DIRECT is known to result in "
1152
"'Invalid argument' on Linux on tmpfs, "
1153
"see MySQL Bug#26662\n");
1159
/********************************************************************
1160
Opens an existing file or creates a new one. */
1165
/* out, own: handle to the file, not defined
1166
if error, error number can be retrieved with
1167
os_file_get_last_error */
1168
const char* name, /* in: name of the file or path as a
1169
null-terminated string */
1170
ulint create_mode,/* in: OS_FILE_OPEN if an existing file
1171
is opened (if does not exist, error), or
1172
OS_FILE_CREATE if a new file is created
1174
OS_FILE_OVERWRITE if a new file is created
1175
or an old overwritten;
1176
OS_FILE_OPEN_RAW, if a raw device or disk
1177
partition should be opened */
1178
ulint purpose,/* in: OS_FILE_AIO, if asynchronous,
1179
non-buffered i/o is desired,
1180
OS_FILE_NORMAL, if any normal file;
1181
NOTE that it also depends on type, os_aio_..
1182
and srv_.. variables whether we really use
1183
async i/o or unbuffered i/o: look in the
1184
function source code for the exact rules */
1185
ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */
1186
ibool* success)/* out: TRUE if succeed, FALSE if error */
1190
DWORD share_mode = FILE_SHARE_READ;
1197
if (create_mode == OS_FILE_OPEN_RAW) {
1198
create_flag = OPEN_EXISTING;
1199
share_mode = FILE_SHARE_WRITE;
1200
} else if (create_mode == OS_FILE_OPEN
1201
|| create_mode == OS_FILE_OPEN_RETRY) {
1202
create_flag = OPEN_EXISTING;
1203
} else if (create_mode == OS_FILE_CREATE) {
1204
create_flag = CREATE_NEW;
1205
} else if (create_mode == OS_FILE_OVERWRITE) {
1206
create_flag = CREATE_ALWAYS;
1212
if (purpose == OS_FILE_AIO) {
1213
/* If specified, use asynchronous (overlapped) io and no
1214
buffering of writes in the OS */
1217
if (os_aio_use_native_aio) {
1218
attributes = attributes | FILE_FLAG_OVERLAPPED;
1221
#ifdef UNIV_NON_BUFFERED_IO
1222
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1223
/* Do not use unbuffered i/o to log files because
1224
value 2 denotes that we do not flush the log at every
1225
commit, but only once per second */
1226
} else if (srv_win_file_flush_method
1227
== SRV_WIN_IO_UNBUFFERED) {
1228
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1231
} else if (purpose == OS_FILE_NORMAL) {
1233
#ifdef UNIV_NON_BUFFERED_IO
1234
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1235
/* Do not use unbuffered i/o to log files because
1236
value 2 denotes that we do not flush the log at every
1237
commit, but only once per second */
1238
} else if (srv_win_file_flush_method
1239
== SRV_WIN_IO_UNBUFFERED) {
1240
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1248
file = CreateFile((LPCTSTR) name,
1249
GENERIC_READ | GENERIC_WRITE, /* read and write
1251
share_mode, /* File can be read also by other
1252
processes; we must give the read
1253
permission because of ibbackup. We do
1254
not give the write permission to
1255
others because if one would succeed to
1256
start 2 instances of mysqld on the
1257
SAME files, that could cause severe
1258
database corruption! When opening
1259
raw disk partitions, Microsoft manuals
1260
say that we must give also the write
1262
NULL, /* default security attributes */
1265
NULL); /* no template file */
1267
if (file == INVALID_HANDLE_VALUE) {
1270
retry = os_file_handle_error(name,
1271
create_mode == OS_FILE_CREATE ?
1285
const char* mode_str = NULL;
1286
const char* type_str = NULL;
1287
const char* purpose_str = NULL;
1292
if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
1293
|| create_mode == OS_FILE_OPEN_RETRY) {
1295
create_flag = O_RDWR;
1296
} else if (create_mode == OS_FILE_CREATE) {
1297
mode_str = "CREATE";
1298
create_flag = O_RDWR | O_CREAT | O_EXCL;
1299
} else if (create_mode == OS_FILE_OVERWRITE) {
1300
mode_str = "OVERWRITE";
1301
create_flag = O_RDWR | O_CREAT | O_TRUNC;
1307
if (type == OS_LOG_FILE) {
1309
} else if (type == OS_DATA_FILE) {
1315
if (purpose == OS_FILE_AIO) {
1316
purpose_str = "AIO";
1317
} else if (purpose == OS_FILE_NORMAL) {
1318
purpose_str = "NORMAL";
1324
fprintf(stderr, "Opening file %s, mode %s, type %s, purpose %s\n",
1325
name, mode_str, type_str, purpose_str);
1328
/* We let O_SYNC only affect log files; note that we map O_DSYNC to
1329
O_SYNC because the datasync options seemed to corrupt files in 2001
1330
in both Linux and Solaris */
1331
if (type == OS_LOG_FILE
1332
&& srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1335
fprintf(stderr, "Using O_SYNC for file %s\n", name);
1338
create_flag = create_flag | O_SYNC;
1342
file = open(name, create_flag, os_innodb_umask);
1347
retry = os_file_handle_error(name,
1348
create_mode == OS_FILE_CREATE ?
1353
return(file /* -1 */);
1360
/* We disable OS caching (O_DIRECT) only on data files */
1361
if (type != OS_LOG_FILE
1362
&& srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
1364
os_file_set_nocache(file, name, mode_str);
1367
#ifdef USE_FILE_LOCK
1368
if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
1370
if (create_mode == OS_FILE_OPEN_RETRY) {
1372
ut_print_timestamp(stderr);
1373
fputs(" InnoDB: Retrying to lock"
1374
" the first data file\n",
1376
for (i = 0; i < 100; i++) {
1377
os_thread_sleep(1000000);
1378
if (!os_file_lock(file, name)) {
1383
ut_print_timestamp(stderr);
1384
fputs(" InnoDB: Unable to open the first data file\n",
1392
#endif /* USE_FILE_LOCK */
1395
#endif /* __WIN__ */
1398
/***************************************************************************
1399
Deletes a file if it exists. The file has to be closed before calling this. */
1402
os_file_delete_if_exists(
1403
/*=====================*/
1404
/* out: TRUE if success */
1405
const char* name) /* in: file path as a null-terminated string */
1411
/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1414
ret = DeleteFile((LPCTSTR)name);
1420
if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1421
/* the file does not exist, this is not an error */
1428
if (count > 100 && 0 == (count % 10)) {
1430
"InnoDB: Warning: cannot delete file %s\n"
1431
"InnoDB: Are you running ibbackup"
1432
" to back up the file?\n", name);
1434
os_file_get_last_error(TRUE); /* print error information */
1437
os_thread_sleep(1000000); /* sleep for a second */
1448
ret = unlink((const char*)name);
1450
if (ret != 0 && errno != ENOENT) {
1451
os_file_handle_error_no_exit(name, "delete");
1460
/***************************************************************************
1461
Deletes a file. The file has to be closed before calling this. */
1466
/* out: TRUE if success */
1467
const char* name) /* in: file path as a null-terminated string */
1473
/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1476
ret = DeleteFile((LPCTSTR)name);
1482
if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1483
/* If the file does not exist, we classify this as a 'mild'
1491
if (count > 100 && 0 == (count % 10)) {
1493
"InnoDB: Warning: cannot delete file %s\n"
1494
"InnoDB: Are you running ibbackup"
1495
" to back up the file?\n", name);
1497
os_file_get_last_error(TRUE); /* print error information */
1500
os_thread_sleep(1000000); /* sleep for a second */
1511
ret = unlink((const char*)name);
1514
os_file_handle_error_no_exit(name, "delete");
1523
/***************************************************************************
1524
Renames a file (can also move it to another directory). It is safest that the
1525
file is closed before calling this function. */
1530
/* out: TRUE if success */
1531
const char* oldpath,/* in: old file path as a null-terminated
1533
const char* newpath)/* in: new file path */
1538
ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
1544
os_file_handle_error_no_exit(oldpath, "rename");
1550
ret = rename((const char*)oldpath, (const char*)newpath);
1553
os_file_handle_error_no_exit(oldpath, "rename");
1562
/***************************************************************************
1563
Closes a file handle. In case of error, error number can be retrieved with
1564
os_file_get_last_error. */
1569
/* out: TRUE if success */
1570
os_file_t file) /* in, own: handle to a file */
1577
ret = CloseHandle(file);
1583
os_file_handle_error(NULL, "close");
1592
os_file_handle_error(NULL, "close");
1601
/***************************************************************************
1602
Closes a file handle. */
1605
os_file_close_no_error_handling(
1606
/*============================*/
1607
/* out: TRUE if success */
1608
os_file_t file) /* in, own: handle to a file */
1615
ret = CloseHandle(file);
1636
/***************************************************************************
1637
Gets a file size. */
1642
/* out: TRUE if success */
1643
os_file_t file, /* in: handle to a file */
1644
ulint* size, /* out: least significant 32 bits of file
1646
ulint* size_high)/* out: most significant 32 bits of size */
1652
low = GetFileSize(file, &high);
1654
if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
1665
offs = lseek(file, 0, SEEK_END);
1667
if (offs == ((off_t)-1)) {
1672
if (sizeof(off_t) > 4) {
1673
*size = (ulint)(offs & 0xFFFFFFFFUL);
1674
*size_high = (ulint)(offs >> 32);
1676
*size = (ulint) offs;
1684
/***************************************************************************
1685
Gets file size as a 64-bit integer ib_longlong. */
1688
os_file_get_size_as_iblonglong(
1689
/*===========================*/
1690
/* out: size in bytes, -1 if error */
1691
os_file_t file) /* in: handle to a file */
1697
success = os_file_get_size(file, &size, &size_high);
1704
return((((ib_longlong)size_high) << 32) + (ib_longlong)size);
1707
/***************************************************************************
1708
Write the specified number of zeros to a newly created file. */
1713
/* out: TRUE if success */
1714
const char* name, /* in: name of the file or path as a
1715
null-terminated string */
1716
os_file_t file, /* in: handle to a file */
1717
ulint size, /* in: least significant 32 bits of file
1719
ulint size_high)/* in: most significant 32 bits of size */
1721
ib_longlong current_size;
1722
ib_longlong desired_size;
1728
ut_a(size == (size & 0xFFFFFFFF));
1731
desired_size = (ib_longlong)size + (((ib_longlong)size_high) << 32);
1733
/* Write up to 1 megabyte at a time. */
1734
buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
1736
buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
1738
/* Align the buffer for possible raw i/o */
1739
buf = ut_align(buf2, UNIV_PAGE_SIZE);
1741
/* Write buffer full of zeros */
1742
memset(buf, 0, buf_size);
1744
if (desired_size >= (ib_longlong)(100 * 1024 * 1024)) {
1746
fprintf(stderr, "InnoDB: Progress in MB:");
1749
while (current_size < desired_size) {
1752
if (desired_size - current_size < (ib_longlong) buf_size) {
1753
n_bytes = (ulint) (desired_size - current_size);
1758
ret = os_file_write(name, file, buf,
1759
(ulint)(current_size & 0xFFFFFFFF),
1760
(ulint)(current_size >> 32),
1764
goto error_handling;
1767
/* Print about progress for each 100 MB written */
1768
if ((ib_longlong) (current_size + n_bytes) / (ib_longlong)(100 * 1024 * 1024)
1769
!= current_size / (ib_longlong)(100 * 1024 * 1024)) {
1771
fprintf(stderr, " %lu00",
1772
(ulong) ((current_size + n_bytes)
1773
/ (ib_longlong)(100 * 1024 * 1024)));
1776
current_size += n_bytes;
1779
if (desired_size >= (ib_longlong)(100 * 1024 * 1024)) {
1781
fprintf(stderr, "\n");
1786
ret = os_file_flush(file);
1796
/***************************************************************************
1797
Truncates a file at its current position. */
1802
/* out: TRUE if success */
1803
FILE* file) /* in: file to be truncated */
1806
HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
1807
return(SetEndOfFile(h));
1809
return(!ftruncate(fileno(file), ftell(file)));
1810
#endif /* __WIN__ */
1814
/***************************************************************************
1815
Wrapper to fsync(2) that retries the call on some errors.
1816
Returns the value 0 if successful; otherwise the value -1 is returned and
1817
the global variable errno is set to indicate the error. */
1823
/* out: 0 if success, -1 otherwise */
1824
os_file_t file) /* in: handle to a file */
1837
if (ret == -1 && errno == ENOLCK) {
1839
if (failures % 100 == 0) {
1841
ut_print_timestamp(stderr);
1843
" InnoDB: fsync(): "
1844
"No locks available; retrying\n");
1847
os_thread_sleep(200000 /* 0.2 sec */);
1860
#endif /* !__WIN__ */
1862
/***************************************************************************
1863
Flushes the write buffers of a given file to the disk. */
1868
/* out: TRUE if success */
1869
os_file_t file) /* in, own: handle to a file */
1878
ret = FlushFileBuffers(file);
1884
/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
1885
actually a raw device, we choose to ignore that error if we are using
1888
if (srv_start_raw_disk_in_use && GetLastError()
1889
== ERROR_INVALID_FUNCTION) {
1893
os_file_handle_error(NULL, "flush");
1895
/* It is a fatal error if a file flush does not succeed, because then
1896
the database can get corrupt on disk */
1903
#if defined(HAVE_DARWIN_THREADS)
1904
# ifndef F_FULLFSYNC
1905
/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
1906
# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
1907
# elif F_FULLFSYNC != 51
1908
# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
1910
/* Apple has disabled fsync() for internal disk drives in OS X. That
1911
caused corruption for a user when he tested a power outage. Let us in
1912
OS X use a nonstandard flush method recommended by an Apple
1915
if (!srv_have_fullfsync) {
1916
/* If we are not on an operating system that supports this,
1917
then fall back to a plain fsync. */
1919
ret = os_file_fsync(file);
1921
ret = fcntl(file, F_FULLFSYNC, NULL);
1924
/* If we are not on a file system that supports this,
1925
then fall back to a plain fsync. */
1926
ret = os_file_fsync(file);
1930
ret = os_file_fsync(file);
1937
/* Since Linux returns EINVAL if the 'file' is actually a raw device,
1938
we choose to ignore that error if we are using raw disks */
1940
if (srv_start_raw_disk_in_use && errno == EINVAL) {
1945
ut_print_timestamp(stderr);
1948
" InnoDB: Error: the OS said file flush did not succeed\n");
1950
os_file_handle_error(NULL, "flush");
1952
/* It is a fatal error if a file flush does not succeed, because then
1953
the database can get corrupt on disk */
1961
/***********************************************************************
1962
Does a synchronous read operation in Posix. */
1967
/* out: number of bytes read, -1 if error */
1968
os_file_t file, /* in: handle to a file */
1969
void* buf, /* in: buffer where to read */
1970
ulint n, /* in: number of bytes to read */
1971
ulint offset, /* in: least significant 32 bits of file
1972
offset from where to read */
1973
ulint offset_high) /* in: most significant 32 bits of
1979
ut_a((offset & 0xFFFFFFFFUL) == offset);
1981
/* If off_t is > 4 bytes in size, then we assume we can pass a
1984
if (sizeof(off_t) > 4) {
1985
offs = (off_t)offset + (((off_t)offset_high) << 32);
1988
offs = (off_t)offset;
1990
if (offset_high > 0) {
1992
"InnoDB: Error: file read at offset > 4 GB\n");
1998
#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
1999
os_mutex_enter(os_file_count_mutex);
2000
os_file_n_pending_preads++;
2001
os_n_pending_reads++;
2002
os_mutex_exit(os_file_count_mutex);
2004
n_bytes = pread(file, buf, (ssize_t)n, offs);
2006
os_mutex_enter(os_file_count_mutex);
2007
os_file_n_pending_preads--;
2008
os_n_pending_reads--;
2009
os_mutex_exit(os_file_count_mutex);
2018
os_mutex_enter(os_file_count_mutex);
2019
os_n_pending_reads++;
2020
os_mutex_exit(os_file_count_mutex);
2022
/* Protect the seek / read operation with a mutex */
2023
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2025
os_mutex_enter(os_file_seek_mutexes[i]);
2027
ret_offset = lseek(file, offs, SEEK_SET);
2029
if (ret_offset < 0) {
2032
ret = read(file, buf, (ssize_t)n);
2035
os_mutex_exit(os_file_seek_mutexes[i]);
2037
os_mutex_enter(os_file_count_mutex);
2038
os_n_pending_reads--;
2039
os_mutex_exit(os_file_count_mutex);
2046
/***********************************************************************
2047
Does a synchronous write operation in Posix. */
2052
/* out: number of bytes written, -1 if error */
2053
os_file_t file, /* in: handle to a file */
2054
const void* buf, /* in: buffer from where to write */
2055
ulint n, /* in: number of bytes to write */
2056
ulint offset, /* in: least significant 32 bits of file
2057
offset where to write */
2058
ulint offset_high) /* in: most significant 32 bits of
2064
ut_a((offset & 0xFFFFFFFFUL) == offset);
2066
/* If off_t is > 4 bytes in size, then we assume we can pass a
2069
if (sizeof(off_t) > 4) {
2070
offs = (off_t)offset + (((off_t)offset_high) << 32);
2072
offs = (off_t)offset;
2074
if (offset_high > 0) {
2076
"InnoDB: Error: file write"
2077
" at offset > 4 GB\n");
2083
#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2084
os_mutex_enter(os_file_count_mutex);
2085
os_file_n_pending_pwrites++;
2086
os_n_pending_writes++;
2087
os_mutex_exit(os_file_count_mutex);
2089
ret = pwrite(file, buf, (ssize_t)n, offs);
2091
os_mutex_enter(os_file_count_mutex);
2092
os_file_n_pending_pwrites--;
2093
os_n_pending_writes--;
2094
os_mutex_exit(os_file_count_mutex);
2096
# ifdef UNIV_DO_FLUSH
2097
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2098
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2099
&& !os_do_not_call_flush_at_each_write) {
2101
/* Always do fsync to reduce the probability that when
2102
the OS crashes, a database page is only partially
2103
physically written to disk. */
2105
ut_a(TRUE == os_file_flush(file));
2107
# endif /* UNIV_DO_FLUSH */
2115
os_mutex_enter(os_file_count_mutex);
2116
os_n_pending_writes++;
2117
os_mutex_exit(os_file_count_mutex);
2119
/* Protect the seek / write operation with a mutex */
2120
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2122
os_mutex_enter(os_file_seek_mutexes[i]);
2124
ret_offset = lseek(file, offs, SEEK_SET);
2126
if (ret_offset < 0) {
2132
ret = write(file, buf, (ssize_t)n);
2134
# ifdef UNIV_DO_FLUSH
2135
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2136
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2137
&& !os_do_not_call_flush_at_each_write) {
2139
/* Always do fsync to reduce the probability that when
2140
the OS crashes, a database page is only partially
2141
physically written to disk. */
2143
ut_a(TRUE == os_file_flush(file));
2145
# endif /* UNIV_DO_FLUSH */
2148
os_mutex_exit(os_file_seek_mutexes[i]);
2150
os_mutex_enter(os_file_count_mutex);
2151
os_n_pending_writes--;
2152
os_mutex_exit(os_file_count_mutex);
2160
/***********************************************************************
2161
Requests a synchronous positioned read operation. */
2166
/* out: TRUE if request was
2167
successful, FALSE if fail */
2168
os_file_t file, /* in: handle to a file */
2169
void* buf, /* in: buffer where to read */
2170
ulint offset, /* in: least significant 32 bits of file
2171
offset where to read */
2172
ulint offset_high, /* in: most significant 32 bits of
2174
ulint n) /* in: number of bytes to read */
2185
ut_a((offset & 0xFFFFFFFFUL) == offset);
2188
os_bytes_read_since_printout += n;
2195
low = (DWORD) offset;
2196
high = (DWORD) offset_high;
2198
os_mutex_enter(os_file_count_mutex);
2199
os_n_pending_reads++;
2200
os_mutex_exit(os_file_count_mutex);
2202
/* Protect the seek / read operation with a mutex */
2203
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2205
os_mutex_enter(os_file_seek_mutexes[i]);
2207
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2209
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2211
os_mutex_exit(os_file_seek_mutexes[i]);
2213
os_mutex_enter(os_file_count_mutex);
2214
os_n_pending_reads--;
2215
os_mutex_exit(os_file_count_mutex);
2217
goto error_handling;
2220
ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2222
os_mutex_exit(os_file_seek_mutexes[i]);
2224
os_mutex_enter(os_file_count_mutex);
2225
os_n_pending_reads--;
2226
os_mutex_exit(os_file_count_mutex);
2228
if (ret && len == n) {
2235
os_bytes_read_since_printout += n;
2238
ret = os_file_pread(file, buf, n, offset, offset_high);
2240
if ((ulint)ret == n) {
2246
"InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
2247
"InnoDB: Was only able to read %ld.\n",
2248
(ulong)n, (ulong)offset_high,
2249
(ulong)offset, (long)ret);
2254
retry = os_file_handle_error(NULL, "read");
2261
"InnoDB: Fatal error: cannot read from file."
2262
" OS error number %lu.\n",
2264
(ulong) GetLastError()
2276
/***********************************************************************
2277
Requests a synchronous positioned read operation. This function does not do
2278
any error handling. In case of error it returns FALSE. */
2281
os_file_read_no_error_handling(
2282
/*===========================*/
2283
/* out: TRUE if request was
2284
successful, FALSE if fail */
2285
os_file_t file, /* in: handle to a file */
2286
void* buf, /* in: buffer where to read */
2287
ulint offset, /* in: least significant 32 bits of file
2288
offset where to read */
2289
ulint offset_high, /* in: most significant 32 bits of
2291
ulint n) /* in: number of bytes to read */
2302
ut_a((offset & 0xFFFFFFFFUL) == offset);
2305
os_bytes_read_since_printout += n;
2312
low = (DWORD) offset;
2313
high = (DWORD) offset_high;
2315
os_mutex_enter(os_file_count_mutex);
2316
os_n_pending_reads++;
2317
os_mutex_exit(os_file_count_mutex);
2319
/* Protect the seek / read operation with a mutex */
2320
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2322
os_mutex_enter(os_file_seek_mutexes[i]);
2324
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2326
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2328
os_mutex_exit(os_file_seek_mutexes[i]);
2330
os_mutex_enter(os_file_count_mutex);
2331
os_n_pending_reads--;
2332
os_mutex_exit(os_file_count_mutex);
2334
goto error_handling;
2337
ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2339
os_mutex_exit(os_file_seek_mutexes[i]);
2341
os_mutex_enter(os_file_count_mutex);
2342
os_n_pending_reads--;
2343
os_mutex_exit(os_file_count_mutex);
2345
if (ret && len == n) {
2352
os_bytes_read_since_printout += n;
2355
ret = os_file_pread(file, buf, n, offset, offset_high);
2357
if ((ulint)ret == n) {
2365
retry = os_file_handle_error_no_exit(NULL, "read");
2374
/***********************************************************************
2375
Rewind file to its start, read at most size - 1 bytes from it to str, and
2376
NUL-terminate str. All errors are silently ignored. This function is
2377
mostly meant to be used with temporary files. */
2380
os_file_read_string(
2381
/*================*/
2382
FILE* file, /* in: file to read from */
2383
char* str, /* in: buffer where to read */
2384
ulint size) /* in: size of buffer */
2393
flen = fread(str, 1, size - 1, file);
2397
/***********************************************************************
2398
Requests a synchronous write operation. */
2403
/* out: TRUE if request was
2404
successful, FALSE if fail */
2405
const char* name, /* in: name of the file or path as a
2406
null-terminated string */
2407
os_file_t file, /* in: handle to a file */
2408
const void* buf, /* in: buffer from which to write */
2409
ulint offset, /* in: least significant 32 bits of file
2410
offset where to write */
2411
ulint offset_high, /* in: most significant 32 bits of
2413
ulint n) /* in: number of bytes to write */
2422
ulint n_retries = 0;
2425
ut_a((offset & 0xFFFFFFFF) == offset);
2433
low = (DWORD) offset;
2434
high = (DWORD) offset_high;
2436
os_mutex_enter(os_file_count_mutex);
2437
os_n_pending_writes++;
2438
os_mutex_exit(os_file_count_mutex);
2440
/* Protect the seek / write operation with a mutex */
2441
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2443
os_mutex_enter(os_file_seek_mutexes[i]);
2445
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2447
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2449
os_mutex_exit(os_file_seek_mutexes[i]);
2451
os_mutex_enter(os_file_count_mutex);
2452
os_n_pending_writes--;
2453
os_mutex_exit(os_file_count_mutex);
2455
ut_print_timestamp(stderr);
2458
" InnoDB: Error: File pointer positioning to"
2459
" file %s failed at\n"
2460
"InnoDB: offset %lu %lu. Operating system"
2461
" error number %lu.\n"
2462
"InnoDB: Some operating system error numbers"
2463
" are described at\n"
2465
"http://dev.mysql.com/doc/refman/5.1/en/"
2466
"operating-system-error-codes.html\n",
2467
name, (ulong) offset_high, (ulong) offset,
2468
(ulong) GetLastError());
2473
ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2475
/* Always do fsync to reduce the probability that when the OS crashes,
2476
a database page is only partially physically written to disk. */
2478
# ifdef UNIV_DO_FLUSH
2479
if (!os_do_not_call_flush_at_each_write) {
2480
ut_a(TRUE == os_file_flush(file));
2482
# endif /* UNIV_DO_FLUSH */
2484
os_mutex_exit(os_file_seek_mutexes[i]);
2486
os_mutex_enter(os_file_count_mutex);
2487
os_n_pending_writes--;
2488
os_mutex_exit(os_file_count_mutex);
2490
if (ret && len == n) {
2495
/* If some background file system backup tool is running, then, at
2496
least in Windows 2000, we may get here a specific error. Let us
2497
retry the operation 100 times, with 1 second waits. */
2499
if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2501
os_thread_sleep(1000000);
2508
if (!os_has_said_disk_full) {
2510
err = (ulint)GetLastError();
2512
ut_print_timestamp(stderr);
2515
" InnoDB: Error: Write to file %s failed"
2516
" at offset %lu %lu.\n"
2517
"InnoDB: %lu bytes should have been written,"
2518
" only %lu were written.\n"
2519
"InnoDB: Operating system error number %lu.\n"
2520
"InnoDB: Check that your OS and file system"
2521
" support files of this size.\n"
2522
"InnoDB: Check also that the disk is not full"
2523
" or a disk quota exceeded.\n",
2524
name, (ulong) offset_high, (ulong) offset,
2525
(ulong) n, (ulong) len, (ulong) err);
2527
if (strerror((int)err) != NULL) {
2529
"InnoDB: Error number %lu means '%s'.\n",
2530
(ulong) err, strerror((int)err));
2534
"InnoDB: Some operating system error numbers"
2535
" are described at\n"
2537
"http://dev.mysql.com/doc/refman/5.1/en/"
2538
"operating-system-error-codes.html\n");
2540
os_has_said_disk_full = TRUE;
2547
ret = os_file_pwrite(file, buf, n, offset, offset_high);
2549
if ((ulint)ret == n) {
2554
if (!os_has_said_disk_full) {
2556
ut_print_timestamp(stderr);
2559
" InnoDB: Error: Write to file %s failed"
2560
" at offset %lu %lu.\n"
2561
"InnoDB: %lu bytes should have been written,"
2562
" only %ld were written.\n"
2563
"InnoDB: Operating system error number %lu.\n"
2564
"InnoDB: Check that your OS and file system"
2565
" support files of this size.\n"
2566
"InnoDB: Check also that the disk is not full"
2567
" or a disk quota exceeded.\n",
2568
name, offset_high, offset, n, (long int)ret,
2570
if (strerror(errno) != NULL) {
2572
"InnoDB: Error number %lu means '%s'.\n",
2573
(ulint)errno, strerror(errno));
2577
"InnoDB: Some operating system error numbers"
2578
" are described at\n"
2580
"http://dev.mysql.com/doc/refman/5.1/en/"
2581
"operating-system-error-codes.html\n");
2583
os_has_said_disk_full = TRUE;
2590
/***********************************************************************
2591
Check the existence and type of the given file. */
2596
/* out: TRUE if call succeeded */
2597
const char* path, /* in: pathname of the file */
2598
ibool* exists, /* out: TRUE if file exists */
2599
os_file_type_t* type) /* out: type of the file (if it exists) */
2603
struct _stat statinfo;
2605
ret = _stat(path, &statinfo);
2606
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2607
/* file does not exist */
2611
/* file exists, but stat call failed */
2613
os_file_handle_error_no_exit(path, "stat");
2618
if (_S_IFDIR & statinfo.st_mode) {
2619
*type = OS_FILE_TYPE_DIR;
2620
} else if (_S_IFREG & statinfo.st_mode) {
2621
*type = OS_FILE_TYPE_FILE;
2623
*type = OS_FILE_TYPE_UNKNOWN;
2631
struct stat statinfo;
2633
ret = stat(path, &statinfo);
2634
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2635
/* file does not exist */
2639
/* file exists, but stat call failed */
2641
os_file_handle_error_no_exit(path, "stat");
2646
if (S_ISDIR(statinfo.st_mode)) {
2647
*type = OS_FILE_TYPE_DIR;
2648
} else if (S_ISLNK(statinfo.st_mode)) {
2649
*type = OS_FILE_TYPE_LINK;
2650
} else if (S_ISREG(statinfo.st_mode)) {
2651
*type = OS_FILE_TYPE_FILE;
2653
*type = OS_FILE_TYPE_UNKNOWN;
2662
/***********************************************************************
2663
This function returns information about the specified file */
2668
/* out: TRUE if stat
2669
information found */
2670
const char* path, /* in: pathname of the file */
2671
os_file_stat_t* stat_info) /* information of a file in a
2676
struct _stat statinfo;
2678
ret = _stat(path, &statinfo);
2679
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2680
/* file does not exist */
2684
/* file exists, but stat call failed */
2686
os_file_handle_error_no_exit(path, "stat");
2690
if (_S_IFDIR & statinfo.st_mode) {
2691
stat_info->type = OS_FILE_TYPE_DIR;
2692
} else if (_S_IFREG & statinfo.st_mode) {
2693
stat_info->type = OS_FILE_TYPE_FILE;
2695
stat_info->type = OS_FILE_TYPE_UNKNOWN;
2698
stat_info->ctime = statinfo.st_ctime;
2699
stat_info->atime = statinfo.st_atime;
2700
stat_info->mtime = statinfo.st_mtime;
2701
stat_info->size = statinfo.st_size;
2706
struct stat statinfo;
2708
ret = stat(path, &statinfo);
2710
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2711
/* file does not exist */
2715
/* file exists, but stat call failed */
2717
os_file_handle_error_no_exit(path, "stat");
2722
if (S_ISDIR(statinfo.st_mode)) {
2723
stat_info->type = OS_FILE_TYPE_DIR;
2724
} else if (S_ISLNK(statinfo.st_mode)) {
2725
stat_info->type = OS_FILE_TYPE_LINK;
2726
} else if (S_ISREG(statinfo.st_mode)) {
2727
stat_info->type = OS_FILE_TYPE_FILE;
2729
stat_info->type = OS_FILE_TYPE_UNKNOWN;
2732
stat_info->ctime = statinfo.st_ctime;
2733
stat_info->atime = statinfo.st_atime;
2734
stat_info->mtime = statinfo.st_mtime;
2735
stat_info->size = statinfo.st_size;
2741
/* path name separator character */
2743
# define OS_FILE_PATH_SEPARATOR '\\'
2745
# define OS_FILE_PATH_SEPARATOR '/'
2748
/********************************************************************
2749
The function os_file_dirname returns a directory component of a
2750
null-terminated pathname string. In the usual case, dirname returns
2751
the string up to, but not including, the final '/', and basename
2752
is the component following the final '/'. Trailing '/' characters
2753
are not counted as part of the pathname.
2755
If path does not contain a slash, dirname returns the string ".".
2757
Concatenating the string returned by dirname, a "/", and the basename
2758
yields a complete pathname.
2760
The return value is a copy of the directory component of the pathname.
2761
The copy is allocated from the heap. It is the caller's responsibility
2762
to free it after it is no longer needed.
2764
The following list of examples (taken from SUSv2) shows the strings
2765
returned by dirname and basename for different paths:
2767
path dirname basename
2768
"/usr/lib" "/usr" "lib"
2779
/* out, own: directory component of the
2781
const char* path) /* in: pathname */
2783
/* Find the offset of the last slash */
2784
const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
2786
/* No slash in the path, return "." */
2788
return(mem_strdup("."));
2791
/* Ok, there is a slash */
2793
if (last_slash == path) {
2794
/* last slash is the first char of the path */
2796
return(mem_strdup("/"));
2799
/* Non-trivial directory component */
2801
return(mem_strdupl(path, last_slash - path));
2804
/********************************************************************
2805
Creates all missing subdirectories along the given path. */
2808
os_file_create_subdirs_if_needed(
2809
/*=============================*/
2810
/* out: TRUE if call succeeded
2812
const char* path) /* in: path name */
2815
ibool success, subdir_exists;
2816
os_file_type_t type;
2818
subdir = os_file_dirname(path);
2819
if (strlen(subdir) == 1
2820
&& (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
2821
/* subdir is root or cwd, nothing to do */
2827
/* Test if subdir exists */
2828
success = os_file_status(subdir, &subdir_exists, &type);
2829
if (success && !subdir_exists) {
2830
/* subdir does not exist, create it */
2831
success = os_file_create_subdirs_if_needed(subdir);
2837
success = os_file_create_directory(subdir, FALSE);
2845
/********************************************************************
2846
Returns a pointer to the nth slot in the aio array. */
2849
os_aio_array_get_nth_slot(
2850
/*======================*/
2851
/* out: pointer to slot */
2852
os_aio_array_t* array, /* in: aio array */
2853
ulint index) /* in: index of the slot */
2855
ut_a(index < array->n_slots);
2857
return((array->slots) + index);
2860
/****************************************************************************
2861
Creates an aio wait array. */
2864
os_aio_array_create(
2865
/*================*/
2866
/* out, own: aio array */
2867
ulint n, /* in: maximum number of pending aio operations
2868
allowed; n must be divisible by n_segments */
2869
ulint n_segments) /* in: number of segments in the aio array */
2871
os_aio_array_t* array;
2873
os_aio_slot_t* slot;
2878
ut_a(n_segments > 0);
2880
array = ut_malloc(sizeof(os_aio_array_t));
2882
array->mutex = os_mutex_create(NULL);
2883
array->not_full = os_event_create(NULL);
2884
array->is_empty = os_event_create(NULL);
2886
os_event_set(array->is_empty);
2889
array->n_segments = n_segments;
2890
array->n_reserved = 0;
2891
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
2893
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
2895
for (i = 0; i < n; i++) {
2896
slot = os_aio_array_get_nth_slot(array, i);
2899
slot->reserved = FALSE;
2901
slot->event = os_event_create(NULL);
2903
over = &(slot->control);
2905
over->hEvent = slot->event->handle;
2907
*((array->native_events) + i) = over->hEvent;
2914
/****************************************************************************
2915
Initializes the asynchronous io system. Calls also os_io_init_simple.
2916
Creates a separate aio array for
2917
non-ibuf read and write, a third aio array for the ibuf i/o, with just one
2918
segment, two aio arrays for log reads and writes with one segment, and a
2919
synchronous aio array of the specified size. The combined number of segments
2920
in the four first aio arrays is the parameter n_segments given to the
2921
function. The caller must create an i/o handler thread for each segment in
2922
the four first arrays, but not for the sync aio array. */
2927
ulint n, /* in: maximum number of pending aio operations
2928
allowed; n must be divisible by n_segments */
2929
ulint n_segments, /* in: combined number of segments in the four
2930
first aio arrays; must be >= 4 */
2931
ulint n_slots_sync) /* in: number of slots in the sync aio array */
2937
#ifdef POSIX_ASYNC_IO
2940
ut_ad(n % n_segments == 0);
2941
ut_ad(n_segments >= 4);
2943
os_io_init_simple();
2945
for (i = 0; i < n_segments; i++) {
2946
srv_set_io_thread_op_info(i, "not started yet");
2949
n_per_seg = n / n_segments;
2950
n_write_segs = (n_segments - 2) / 2;
2951
n_read_segs = n_segments - 2 - n_write_segs;
2953
/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
2955
os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
2957
srv_io_thread_function[0] = "insert buffer thread";
2959
os_aio_log_array = os_aio_array_create(n_per_seg, 1);
2961
srv_io_thread_function[1] = "log thread";
2963
os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
2965
for (i = 2; i < 2 + n_read_segs; i++) {
2966
ut_a(i < SRV_MAX_N_IO_THREADS);
2967
srv_io_thread_function[i] = "read thread";
2970
os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
2972
for (i = 2 + n_read_segs; i < n_segments; i++) {
2973
ut_a(i < SRV_MAX_N_IO_THREADS);
2974
srv_io_thread_function[i] = "write thread";
2977
os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
2979
os_aio_n_segments = n_segments;
2983
os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
2985
for (i = 0; i < n_segments; i++) {
2986
os_aio_segment_wait_events[i] = os_event_create(NULL);
2989
os_last_printout = time(NULL);
2991
#ifdef POSIX_ASYNC_IO
2992
/* Block aio signals from the current thread and its children:
2993
for this to work, the current thread must be the first created
2994
in the database, so that all its children will inherit its
2997
/* TODO: to work MySQL needs the SIGALRM signal; the following
2998
will not work yet! */
2999
sigemptyset(&sigset);
3000
sigaddset(&sigset, SIGRTMIN + 1 + 0);
3001
sigaddset(&sigset, SIGRTMIN + 1 + 1);
3002
sigaddset(&sigset, SIGRTMIN + 1 + 2);
3003
sigaddset(&sigset, SIGRTMIN + 1 + 3);
3005
pthread_sigmask(SIG_BLOCK, &sigset, NULL); */
3010
/****************************************************************************
3011
Wakes up all async i/o threads in the array in Windows async i/o at
3015
os_aio_array_wake_win_aio_at_shutdown(
3016
/*==================================*/
3017
os_aio_array_t* array) /* in: aio array */
3021
for (i = 0; i < array->n_slots; i++) {
3023
os_event_set((array->slots + i)->event);
3028
/****************************************************************************
3029
Wakes up all async i/o threads so that they know to exit themselves in
3033
os_aio_wake_all_threads_at_shutdown(void)
3034
/*=====================================*/
3039
/* This code wakes up all ai/o threads in Windows native aio */
3040
os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
3041
os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
3042
os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
3043
os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
3045
/* This loop wakes up all simulated ai/o threads */
3047
for (i = 0; i < os_aio_n_segments; i++) {
3049
os_event_set(os_aio_segment_wait_events[i]);
3053
/****************************************************************************
3054
Waits until there are no pending writes in os_aio_write_array. There can
3055
be other, synchronous, pending writes. Takes no parameters; the wait is
implemented purely as a wait on the write array's is_empty event. */
3058
os_aio_wait_until_no_pending_writes(void)
3059
/*=====================================*/
3061
/* Block on the is_empty event of the write array;
os_aio_array_free_slot() sets that event when the array's n_reserved
count reaches 0 and os_aio_array_reserve_slot() resets it when the
count becomes 1 */
os_event_wait(os_aio_write_array->is_empty);
3064
/**************************************************************************
3065
Calculates segment number for a slot. For the read and write arrays the
number is derived from slot->pos divided by the number of slots per
segment in that array, offset by the segments that precede the array in
the global numbering. */
3068
os_aio_get_segment_no_from_slot(
3069
/*============================*/
3070
/* out: segment number (which is the number
3071
used by, for example, i/o-handler threads) */
3072
os_aio_array_t* array, /* in: aio wait array */
3073
os_aio_slot_t* slot) /* in: slot in this array */
3078
/* The ibuf and log arrays are tested first; the read and write arrays
need a computation because they can hold several segments */
if (array == os_aio_ibuf_array) {
3081
} else if (array == os_aio_log_array) {
3084
} else if (array == os_aio_read_array) {
3085
/* Number of slots in one segment of the read array */
seg_len = os_aio_read_array->n_slots
3086
/ os_aio_read_array->n_segments;
3088
/* Read segments are numbered starting from 2 */
segment = 2 + slot->pos / seg_len;
3090
/* Assert that the only remaining possibility is the write array */
ut_a(array == os_aio_write_array);
3091
/* Number of slots in one segment of the write array */
seg_len = os_aio_write_array->n_slots
3092
/ os_aio_write_array->n_segments;
3094
/* Write segments are numbered after all the read segments */
segment = os_aio_read_array->n_segments + 2
3095
+ slot->pos / seg_len;
3101
/**************************************************************************
3102
Calculates local segment number and aio array from global segment number.
Global segment 0 maps to the ibuf array, 1 to the log array, the next
os_aio_read_array->n_segments numbers to the read array and the rest to
the write array. */
3105
os_aio_get_array_and_local_segment(
3106
/*===============================*/
3107
/* out: local segment number within
3109
os_aio_array_t** array, /* out: aio wait array */
3110
ulint global_segment)/* in: global segment number */
3114
/* The global segment number must be below the total segment count */
ut_a(global_segment < os_aio_n_segments);
3116
if (global_segment == 0) {
3117
*array = os_aio_ibuf_array;
3120
} else if (global_segment == 1) {
3121
*array = os_aio_log_array;
3124
} else if (global_segment < os_aio_read_array->n_segments + 2) {
3125
*array = os_aio_read_array;
3127
/* Local read segment: strip the two leading ibuf and log segments */
segment = global_segment - 2;
3129
*array = os_aio_write_array;
3131
/* Local write segment: strip the ibuf, log and all read segments */
segment = global_segment - (os_aio_read_array->n_segments + 2);
3137
/***********************************************************************
3138
Gets an integer value designating a specified aio array. This is used
3139
to give numbers to signals in Posix aio. */
3141
#if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO)
3144
os_aio_get_array_no(
3145
/*================*/
3146
os_aio_array_t* array) /* in: aio array */
3148
if (array == os_aio_ibuf_array) {
3152
} else if (array == os_aio_log_array) {
3156
} else if (array == os_aio_read_array) {
3159
} else if (array == os_aio_write_array) {
3169
/***********************************************************************
3170
Gets the aio array for its number. */
3173
os_aio_get_array_from_no(
3174
/*=====================*/
3175
/* out: aio array */
3176
ulint n) /* in: array number */
3179
return(os_aio_ibuf_array);
3180
} else if (n == 1) {
3182
return(os_aio_log_array);
3183
} else if (n == 2) {
3185
return(os_aio_read_array);
3186
} else if (n == 3) {
3188
return(os_aio_write_array);
3195
#endif /* if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO) */
3197
/***********************************************************************
3198
Requests a slot in the aio array. If no slot is available, waits until
3199
not_full-event becomes signaled. */
3202
os_aio_array_reserve_slot(
3203
/*======================*/
3204
/* out: pointer to slot */
3205
ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
3206
os_aio_array_t* array, /* in: aio array */
3207
fil_node_t* message1,/* in: message to be passed along with
3208
the aio operation */
3209
void* message2,/* in: message to be passed along with
3210
the aio operation */
3211
os_file_t file, /* in: file handle */
3212
const char* name, /* in: name of the file or path as a
3213
null-terminated string */
3214
void* buf, /* in: buffer where to read or from which
3216
ulint offset, /* in: least significant 32 bits of file
3218
ulint offset_high, /* in: most significant 32 bits of
3220
ulint len) /* in: length of the block to read or write */
3222
os_aio_slot_t* slot;
3224
OVERLAPPED* control;
3226
#elif defined(POSIX_ASYNC_IO)
3228
struct aiocb* control;
3232
os_mutex_enter(array->mutex);
3234
if (array->n_reserved == array->n_slots) {
3235
os_mutex_exit(array->mutex);
3237
if (!os_aio_use_native_aio) {
3238
/* If the handler threads are suspended, wake them
3239
so that we get more slots */
3241
os_aio_simulated_wake_handler_threads();
3244
os_event_wait(array->not_full);
3250
slot = os_aio_array_get_nth_slot(array, i);
3252
if (slot->reserved == FALSE) {
3257
array->n_reserved++;
3259
if (array->n_reserved == 1) {
3260
os_event_reset(array->is_empty);
3263
if (array->n_reserved == array->n_slots) {
3264
os_event_reset(array->not_full);
3267
slot->reserved = TRUE;
3268
slot->reservation_time = time(NULL);
3269
slot->message1 = message1;
3270
slot->message2 = message2;
3276
slot->offset = offset;
3277
slot->offset_high = offset_high;
3278
slot->io_already_done = FALSE;
3281
control = &(slot->control);
3282
control->Offset = (DWORD)offset;
3283
control->OffsetHigh = (DWORD)offset_high;
3284
os_event_reset(slot->event);
3286
#elif defined(POSIX_ASYNC_IO)
3288
#if (UNIV_WORD_SIZE == 8)
3289
offset = offset + (offset_high << 32);
3291
ut_a(offset_high == 0);
3293
control = &(slot->control);
3294
control->aio_fildes = file;
3295
control->aio_buf = buf;
3296
control->aio_nbytes = len;
3297
control->aio_offset = offset;
3298
control->aio_reqprio = 0;
3299
control->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
3300
control->aio_sigevent.sigev_signo
3301
= SIGRTMIN + 1 + os_aio_get_array_no(array);
3302
/* TODO: How to choose the signal numbers? */
3304
fprintf(stderr, "AIO signal number %lu\n",
3305
(ulint) control->aio_sigevent.sigev_signo);
3307
control->aio_sigevent.sigev_value.sival_ptr = slot;
3309
os_mutex_exit(array->mutex);
3314
/***********************************************************************
3315
Frees a slot in the aio array: marks the slot unreserved, decrements the
array's reserved count and signals the not_full and is_empty events when
the count crosses the corresponding thresholds. The bookkeeping is done
while holding array->mutex. */
3318
os_aio_array_free_slot(
3319
/*===================*/
3320
os_aio_array_t* array, /* in: aio array */
3321
os_aio_slot_t* slot) /* in: pointer to slot */
3326
os_mutex_enter(array->mutex);
3328
/* The slot being freed is expected to be reserved */
ut_ad(slot->reserved);
3330
slot->reserved = FALSE;
3332
array->n_reserved--;
3334
/* The array was full before this slot was released: signal not_full,
which os_aio_array_reserve_slot() waits on when it finds no free slot */
if (array->n_reserved == array->n_slots - 1) {
3335
os_event_set(array->not_full);
3338
/* No reserved slots remain: signal is_empty, which
os_aio_wait_until_no_pending_writes() waits on for the write array */
if (array->n_reserved == 0) {
3339
os_event_set(array->is_empty);
3343
os_event_reset(slot->event);
3345
os_mutex_exit(array->mutex);
3348
/**************************************************************************
3349
Wakes up a simulated aio i/o-handler thread if it has something to do. */
3352
os_aio_simulated_wake_handler_thread(
3353
/*=================================*/
3354
ulint global_segment) /* in: the number of the segment in the aio
3357
os_aio_array_t* array;
3358
os_aio_slot_t* slot;
3363
ut_ad(!os_aio_use_native_aio);
3365
segment = os_aio_get_array_and_local_segment(&array, global_segment);
3367
n = array->n_slots / array->n_segments;
3369
/* Look through n slots after the segment * n'th slot */
3371
os_mutex_enter(array->mutex);
3373
for (i = 0; i < n; i++) {
3374
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3376
if (slot->reserved) {
3377
/* Found an i/o request */
3383
os_mutex_exit(array->mutex);
3386
os_event_set(os_aio_segment_wait_events[global_segment]);
3390
/**************************************************************************
3391
Wakes up simulated aio i/o-handler threads if they have something to do.
Clears the sleep recommendation for read threads and then calls
os_aio_simulated_wake_handler_thread() for every global segment. */
3394
os_aio_simulated_wake_handler_threads(void)
3395
/*=======================================*/
3399
if (os_aio_use_native_aio) {
3400
/* We do not use simulated aio: do nothing */
3405
/* Withdraw the recommendation set by
os_aio_simulated_put_read_threads_to_sleep() so that read handler
threads start processing requests again */
os_aio_recommend_sleep_for_read_threads = FALSE;
3407
/* Visit every global segment; the per-segment function decides
whether there is anything to wake the handler for */
for (i = 0; i < os_aio_n_segments; i++) {
3408
os_aio_simulated_wake_handler_thread(i);
3412
/**************************************************************************
3413
This function can be called if one wants to post a batch of reads and
3414
prefers an i/o-handler thread to handle them all at once later. You must
3415
call os_aio_simulated_wake_handler_threads later to ensure the threads
3416
are not left sleeping! The function sets the global sleep recommendation
for read threads and resets the wait event of every segment that belongs
to the read array. */
3419
os_aio_simulated_put_read_threads_to_sleep(void)
3420
/*============================================*/
3422
os_aio_array_t* array;
3425
/* os_aio_simulated_handle() checks this flag for the read array and
goes to its sleep path instead of scanning for requests */
os_aio_recommend_sleep_for_read_threads = TRUE;
3427
for (g = 0; g < os_aio_n_segments; g++) {
3428
/* Only the array is needed here; the returned local segment
number is ignored */
os_aio_get_array_and_local_segment(&array, g);
3430
if (array == os_aio_read_array) {
3432
/* Reset the event the handler of this read segment waits on */
os_event_reset(os_aio_segment_wait_events[g]);
3437
/***********************************************************************
3438
Requests an asynchronous i/o operation. */
3443
/* out: TRUE if request was queued
3444
successfully, FALSE if fail */
3445
ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
3446
ulint mode, /* in: OS_AIO_NORMAL, ..., possibly ORed
3447
to OS_AIO_SIMULATED_WAKE_LATER: the
3448
last flag advises this function not to wake
3449
i/o-handler threads, but the caller will
3450
do the waking explicitly later, in this
3451
way the caller can post several requests in
3452
a batch; NOTE that the batch must not be
3453
so big that it exhausts the slots in aio
3454
arrays! NOTE that a simulated batch
3455
may introduce hidden chances of deadlocks,
3456
because i/os are not actually handled until
3457
all have been posted: use with great
3459
const char* name, /* in: name of the file or path as a
3460
null-terminated string */
3461
os_file_t file, /* in: handle to a file */
3462
void* buf, /* in: buffer where to read or from which
3464
ulint offset, /* in: least significant 32 bits of file
3465
offset where to read or write */
3466
ulint offset_high, /* in: most significant 32 bits of
3468
ulint n, /* in: number of bytes to read or write */
3469
fil_node_t* message1,/* in: messages for the aio handler (these
3470
can be used to identify a completed aio
3471
operation); if mode is OS_AIO_SYNC, these
3475
os_aio_array_t* array;
3476
os_aio_slot_t* slot;
3480
DWORD len = (DWORD) n;
3481
struct fil_node_struct * dummy_mess1;
3492
ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
3493
ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
3494
ut_ad(os_aio_validate());
3496
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
3497
mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
3499
if (mode == OS_AIO_SYNC
3501
&& !os_aio_use_native_aio
3504
/* This is actually an ordinary synchronous read or write:
3505
no need to use an i/o-handler thread. NOTE that if we use
3506
Windows async i/o, Windows does not allow us to use
3507
ordinary synchronous os_file_read etc. on the same file,
3508
therefore we have built a special mechanism for synchronous
3509
wait in the Windows case. */
3511
if (type == OS_FILE_READ) {
3512
return(os_file_read(file, buf, offset,
3516
ut_a(type == OS_FILE_WRITE);
3518
return(os_file_write(name, file, buf, offset, offset_high, n));
3522
if (mode == OS_AIO_NORMAL) {
3523
if (type == OS_FILE_READ) {
3524
array = os_aio_read_array;
3526
array = os_aio_write_array;
3528
} else if (mode == OS_AIO_IBUF) {
3529
ut_ad(type == OS_FILE_READ);
3530
/* Reduce probability of deadlock bugs in connection with ibuf:
3531
do not let the ibuf i/o handler sleep */
3535
array = os_aio_ibuf_array;
3536
} else if (mode == OS_AIO_LOG) {
3538
array = os_aio_log_array;
3539
} else if (mode == OS_AIO_SYNC) {
3540
array = os_aio_sync_array;
3542
array = NULL; /* Eliminate compiler warning */
3546
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
3547
name, buf, offset, offset_high, n);
3548
if (type == OS_FILE_READ) {
3549
if (os_aio_use_native_aio) {
3552
os_bytes_read_since_printout += len;
3554
ret = ReadFile(file, buf, (DWORD)n, &len,
3556
#elif defined(POSIX_ASYNC_IO)
3557
slot->control.aio_lio_opcode = LIO_READ;
3558
err = (ulint) aio_read(&(slot->control));
3559
fprintf(stderr, "Starting POSIX aio read %lu\n", err);
3563
os_aio_simulated_wake_handler_thread(
3564
os_aio_get_segment_no_from_slot(
3568
} else if (type == OS_FILE_WRITE) {
3569
if (os_aio_use_native_aio) {
3572
ret = WriteFile(file, buf, (DWORD)n, &len,
3574
#elif defined(POSIX_ASYNC_IO)
3575
slot->control.aio_lio_opcode = LIO_WRITE;
3576
err = (ulint) aio_write(&(slot->control));
3577
fprintf(stderr, "Starting POSIX aio write %lu\n", err);
3581
os_aio_simulated_wake_handler_thread(
3582
os_aio_get_segment_no_from_slot(
3591
if (os_aio_use_native_aio) {
3592
if ((ret && len == n)
3593
|| (!ret && GetLastError() == ERROR_IO_PENDING)) {
3594
/* aio was queued successfully! */
3596
if (mode == OS_AIO_SYNC) {
3597
/* We want a synchronous i/o operation on a
3598
file where we also use async i/o: in Windows
3599
we must use the same wait mechanism as for
3602
retval = os_aio_windows_handle(ULINT_UNDEFINED,
3614
err = 1; /* Fall through the next if */
3618
/* aio was queued successfully! */
3623
os_aio_array_free_slot(array, slot);
3625
retry = os_file_handle_error(name,
3626
type == OS_FILE_READ
3627
? "aio read" : "aio write");
3637
/**************************************************************************
3638
This function is only used in Windows asynchronous i/o.
3639
Waits for an aio operation to complete. This function is used to wait
3640
for completed requests. The aio array of pending requests is divided
3641
into segments. The thread specifies which segment or slot it wants to wait
3642
for. NOTE: this function will also take care of freeing the aio slot,
3643
therefore no other thread is allowed to do the freeing! */
3646
os_aio_windows_handle(
3647
/*==================*/
3648
/* out: TRUE if the aio operation succeeded */
3649
ulint segment, /* in: the number of the segment in the aio
3650
arrays to wait for; segment 0 is the ibuf
3651
i/o thread, segment 1 the log i/o thread,
3652
then follow the non-ibuf read threads, and as
3653
the last are the non-ibuf write threads; if
3654
this is ULINT_UNDEFINED, then it means that
3655
sync aio is used, and this parameter is
3657
ulint pos, /* this parameter is used only in sync aio:
3658
wait for the aio slot at this position */
3659
fil_node_t**message1, /* out: the messages passed with the aio
3660
request; note that also in the case where
3661
the aio operation failed, these output
3662
parameters are valid and can be used to
3663
restart the operation, for example */
3665
ulint* type) /* out: OS_FILE_WRITE or ..._READ */
3667
ulint orig_seg = segment;
3668
os_aio_array_t* array;
3669
os_aio_slot_t* slot;
3676
if (segment == ULINT_UNDEFINED) {
3677
array = os_aio_sync_array;
3680
segment = os_aio_get_array_and_local_segment(&array, segment);
3683
/* NOTE! We only access constant fields in os_aio_array. Therefore
3684
we do not have to acquire the protecting mutex yet */
3686
ut_ad(os_aio_validate());
3687
ut_ad(segment < array->n_segments);
3689
n = array->n_slots / array->n_segments;
3691
if (array == os_aio_sync_array) {
3692
os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
3695
srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
3696
i = os_event_wait_multiple(n,
3697
(array->native_events)
3701
os_mutex_enter(array->mutex);
3703
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3705
ut_a(slot->reserved);
3707
if (orig_seg != ULINT_UNDEFINED) {
3708
srv_set_io_thread_op_info(orig_seg,
3709
"get windows aio return value");
3712
ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
3714
*message1 = slot->message1;
3715
*message2 = slot->message2;
3719
if (ret && len == slot->len) {
3722
# ifdef UNIV_DO_FLUSH
3723
if (slot->type == OS_FILE_WRITE
3724
&& !os_do_not_call_flush_at_each_write) {
3725
ut_a(TRUE == os_file_flush(slot->file));
3727
# endif /* UNIV_DO_FLUSH */
3729
os_file_handle_error(slot->name, "Windows aio");
3734
os_mutex_exit(array->mutex);
3736
os_aio_array_free_slot(array, slot);
3742
#ifdef POSIX_ASYNC_IO
3744
/**************************************************************************
3745
This function is only used in Posix asynchronous i/o. Waits for an aio
3746
operation to complete. */
3749
os_aio_posix_handle(
3750
/*================*/
3751
/* out: TRUE if the aio operation succeeded */
3752
ulint array_no, /* in: array number 0 - 3 */
3753
fil_node_t**message1, /* out: the messages passed with the aio
3754
request; note that also in the case where
3755
the aio operation failed, these output
3756
parameters are valid and can be used to
3757
restart the operation, for example */
3760
os_aio_array_t* array;
3761
os_aio_slot_t* slot;
3764
sigset_t proc_sigset;
3765
sigset_t thr_sigset;
3770
sigemptyset(&sigset);
3771
sigaddset(&sigset, SIGRTMIN + 1 + array_no);
3773
pthread_sigmask(SIG_UNBLOCK, &sigset, NULL);
3776
sigprocmask(0, NULL, &proc_sigset);
3777
pthread_sigmask(0, NULL, &thr_sigset);
3779
for (i = 32 ; i < 40; i++) {
3780
fprintf(stderr, "%lu : %lu %lu\n", (ulint)i,
3781
(ulint) sigismember(&proc_sigset, i),
3782
(ulint) sigismember(&thr_sigset, i));
3786
ret = sigwaitinfo(&sigset, &info);
3788
if (sig != SIGRTMIN + 1 + array_no) {
3795
fputs("Handling POSIX aio\n", stderr);
3797
array = os_aio_get_array_from_no(array_no);
3799
os_mutex_enter(array->mutex);
3801
slot = info.si_value.sival_ptr;
3803
ut_a(slot->reserved);
3805
*message1 = slot->message1;
3806
*message2 = slot->message2;
3808
# ifdef UNIV_DO_FLUSH
3809
if (slot->type == OS_FILE_WRITE
3810
&& !os_do_not_call_flush_at_each_write) {
3811
ut_a(TRUE == os_file_flush(slot->file));
3813
# endif /* UNIV_DO_FLUSH */
3815
os_mutex_exit(array->mutex);
3817
os_aio_array_free_slot(array, slot);
3823
/**************************************************************************
3824
Do a 'last millisecond' check that the page end is sensible;
3825
reported page checksum errors from Linux seem to wipe over the page end.
The buffer is examined one UNIV_PAGE_SIZE page at a time; for each page
4 bytes near the page start are compared with 4 bytes near the page end,
and diagnostics are printed to stderr when they differ. */
3828
os_file_check_page_trailers(
3829
/*========================*/
3830
byte* combined_buf, /* in: combined write buffer */
3831
ulint total_len) /* in: size of combined_buf, in bytes
3832
(a multiple of UNIV_PAGE_SIZE) */
3836
/* len is the byte offset of the page currently being checked; only
whole pages that fit inside total_len are visited */
for (len = 0; len + UNIV_PAGE_SIZE <= total_len;
3837
len += UNIV_PAGE_SIZE) {
3838
byte* buf = combined_buf + len;
3841
/* Compare the 4 bytes at FIL_PAGE_LSN + 4 with the 4 bytes at
UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4; memcmp() returns
nonzero when they differ */
(memcmp(buf + (FIL_PAGE_LSN + 4),
3842
buf + (UNIV_PAGE_SIZE
3843
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
3844
ut_print_timestamp(stderr);
3846
" InnoDB: ERROR: The page to be written"
3848
"InnoDB: Writing a block of %lu bytes,"
3849
" currently at offset %lu\n",
3850
(ulong)total_len, (ulong)len);
3851
/* Print the contents of the suspicious page */
buf_page_print(buf);
3853
"InnoDB: ERROR: The page to be written"
3854
" seems corrupt!\n");
3859
/**************************************************************************
3860
Does simulated aio. This function should be called by an i/o-handler
3864
os_aio_simulated_handle(
3865
/*====================*/
3866
/* out: TRUE if the aio operation succeeded */
3867
ulint global_segment, /* in: the number of the segment in the aio
3868
arrays to wait for; segment 0 is the ibuf
3869
i/o thread, segment 1 the log i/o thread,
3870
then follow the non-ibuf read threads, and as
3871
the last are the non-ibuf write threads */
3872
fil_node_t**message1, /* out: the messages passed with the aio
3873
request; note that also in the case where
3874
the aio operation failed, these output
3875
parameters are valid and can be used to
3876
restart the operation, for example */
3878
ulint* type) /* out: OS_FILE_WRITE or ..._READ */
3880
os_aio_array_t* array;
3882
os_aio_slot_t* slot;
3883
os_aio_slot_t* slot2;
3884
os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
3885
ulint n_consecutive;
3888
ulint lowest_offset;
3892
byte* combined_buf2;
3897
segment = os_aio_get_array_and_local_segment(&array, global_segment);
3900
/* NOTE! We only access constant fields in os_aio_array. Therefore
3901
we do not have to acquire the protecting mutex yet */
3903
srv_set_io_thread_op_info(global_segment,
3904
"looking for i/o requests (a)");
3905
ut_ad(os_aio_validate());
3906
ut_ad(segment < array->n_segments);
3908
n = array->n_slots / array->n_segments;
3910
/* Look through n slots after the segment * n'th slot */
3912
if (array == os_aio_read_array
3913
&& os_aio_recommend_sleep_for_read_threads) {
3915
/* Give other threads chance to add several i/os to the array
3918
goto recommended_sleep;
3921
os_mutex_enter(array->mutex);
3923
srv_set_io_thread_op_info(global_segment,
3924
"looking for i/o requests (b)");
3926
/* Check if there is a slot for which the i/o has already been
3929
for (i = 0; i < n; i++) {
3930
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3932
if (slot->reserved && slot->io_already_done) {
3934
if (os_aio_print_debug) {
3936
"InnoDB: i/o for slot %lu"
3937
" already done, returning\n",
3949
/* If there are requests at least 2 seconds old, then pick the oldest
3950
one to prevent starvation. If several requests have the same age,
3951
then pick the one at the lowest offset. */
3954
lowest_offset = ULINT_MAX;
3956
for (i = 0; i < n; i++) {
3957
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3959
if (slot->reserved) {
3960
age = (ulint)difftime(time(NULL),
3961
slot->reservation_time);
3963
if ((age >= 2 && age > biggest_age)
3964
|| (age >= 2 && age == biggest_age
3965
&& slot->offset < lowest_offset)) {
3967
/* Found an i/o request */
3968
consecutive_ios[0] = slot;
3973
lowest_offset = slot->offset;
3978
if (n_consecutive == 0) {
3979
/* There were no old requests. Look for an i/o request at the
3980
lowest offset in the array (we ignore the high 32 bits of the
3981
offset in these heuristics) */
3983
lowest_offset = ULINT_MAX;
3985
for (i = 0; i < n; i++) {
3986
slot = os_aio_array_get_nth_slot(array,
3989
if (slot->reserved && slot->offset < lowest_offset) {
3991
/* Found an i/o request */
3992
consecutive_ios[0] = slot;
3996
lowest_offset = slot->offset;
4001
if (n_consecutive == 0) {
4003
/* No i/o requested at the moment */
4008
slot = consecutive_ios[0];
4010
/* Check if there are several consecutive blocks to read or write */
4013
for (i = 0; i < n; i++) {
4014
slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
4016
if (slot2->reserved && slot2 != slot
4017
&& slot2->offset == slot->offset + slot->len
4018
/* check that sum does not wrap over */
4019
&& slot->offset + slot->len > slot->offset
4020
&& slot2->offset_high == slot->offset_high
4021
&& slot2->type == slot->type
4022
&& slot2->file == slot->file) {
4024
/* Found a consecutive i/o request */
4026
consecutive_ios[n_consecutive] = slot2;
4031
if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
4033
goto consecutive_loop;
4040
srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
4042
/* We have now collected n_consecutive i/o requests in the array;
4043
allocate a single buffer which can hold all data, and perform the
4047
slot = consecutive_ios[0];
4049
for (i = 0; i < n_consecutive; i++) {
4050
total_len += consecutive_ios[i]->len;
4053
if (n_consecutive == 1) {
4054
/* We can use the buffer of the i/o request */
4055
combined_buf = slot->buf;
4056
combined_buf2 = NULL;
4058
combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
4060
ut_a(combined_buf2);
4062
combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
4065
/* We release the array mutex for the time of the i/o: NOTE that
4066
this assumes that there is just one i/o-handler thread serving
4067
a single segment of slots! */
4069
os_mutex_exit(array->mutex);
4071
if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
4072
/* Copy the buffers to the combined buffer */
4075
for (i = 0; i < n_consecutive; i++) {
4077
ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
4078
consecutive_ios[i]->len);
4079
offs += consecutive_ios[i]->len;
4083
srv_set_io_thread_op_info(global_segment, "doing file i/o");
4085
if (os_aio_print_debug) {
4087
"InnoDB: doing i/o of type %lu at offset %lu %lu,"
4089
(ulong) slot->type, (ulong) slot->offset_high,
4090
(ulong) slot->offset, (ulong) total_len);
4093
/* Do the i/o with ordinary, synchronous i/o functions: */
4094
if (slot->type == OS_FILE_WRITE) {
4095
if (array == os_aio_write_array) {
4096
if ((total_len % UNIV_PAGE_SIZE != 0)
4097
|| (slot->offset % UNIV_PAGE_SIZE != 0)) {
4099
"InnoDB: Error: trying a displaced"
4100
" write to %s %lu %lu, len %lu\n",
4101
slot->name, (ulong) slot->offset_high,
4102
(ulong) slot->offset,
4107
os_file_check_page_trailers(combined_buf, total_len);
4110
ret = os_file_write(slot->name, slot->file, combined_buf,
4111
slot->offset, slot->offset_high,
4114
if (array == os_aio_write_array) {
4115
os_file_check_page_trailers(combined_buf, total_len);
4118
ret = os_file_read(slot->file, combined_buf,
4119
slot->offset, slot->offset_high, total_len);
4123
srv_set_io_thread_op_info(global_segment, "file i/o done");
4127
"aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
4128
n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
4131
if (slot->type == OS_FILE_READ && n_consecutive > 1) {
4132
/* Copy the combined buffer to individual buffers */
4135
for (i = 0; i < n_consecutive; i++) {
4137
ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
4138
consecutive_ios[i]->len);
4139
offs += consecutive_ios[i]->len;
4143
if (combined_buf2) {
4144
ut_free(combined_buf2);
4147
os_mutex_enter(array->mutex);
4149
/* Mark the i/os done in slots */
4151
for (i = 0; i < n_consecutive; i++) {
4152
consecutive_ios[i]->io_already_done = TRUE;
4155
/* We return the messages for the first slot now, and if there were
4156
several slots, the messages will be returned with subsequent calls
4161
ut_a(slot->reserved);
4163
*message1 = slot->message1;
4164
*message2 = slot->message2;
4168
os_mutex_exit(array->mutex);
4170
os_aio_array_free_slot(array, slot);
4175
srv_set_io_thread_op_info(global_segment, "resetting wait event");
4177
/* We wait here until there again can be i/os in the segment
4180
os_event_reset(os_aio_segment_wait_events[global_segment]);
4182
os_mutex_exit(array->mutex);
4185
srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
4187
os_event_wait(os_aio_segment_wait_events[global_segment]);
4189
if (os_aio_print_debug) {
4191
"InnoDB: i/o handler thread for i/o"
4192
" segment %lu wakes up\n",
4193
(ulong) global_segment);
4199
/**************************************************************************
4200
Validates the consistency of an aio array. While holding array->mutex it
asserts that the array has at least one slot and one segment, that every
reserved slot has a positive length, and that array->n_reserved equals
the locally kept n_reserved count. */
4203
os_aio_array_validate(
4204
/*==================*/
4205
/* out: TRUE if ok */
4206
os_aio_array_t* array) /* in: aio wait array */
4208
os_aio_slot_t* slot;
4209
ulint n_reserved = 0;
4214
os_mutex_enter(array->mutex);
4216
ut_a(array->n_slots > 0);
4217
ut_a(array->n_segments > 0);
4219
/* Walk over every slot of the array */
for (i = 0; i < array->n_slots; i++) {
4220
slot = os_aio_array_get_nth_slot(array, i);
4222
if (slot->reserved) {
4224
/* A reserved slot must describe a non-empty i/o */
ut_a(slot->len > 0);
4228
/* The array's own counter must agree with the local count */
ut_a(array->n_reserved == n_reserved);
4230
os_mutex_exit(array->mutex);
4235
/**************************************************************************
4236
Validates the consistency of the aio system by running
os_aio_array_validate() on each of the five aio arrays: read, write,
ibuf, log and sync. */
4239
os_aio_validate(void)
4240
/*=================*/
4241
/* out: TRUE if ok */
4243
/* Each call asserts internally on an inconsistent array; the return
values of the individual calls are not inspected here */
os_aio_array_validate(os_aio_read_array);
4244
os_aio_array_validate(os_aio_write_array);
4245
os_aio_array_validate(os_aio_ibuf_array);
4246
os_aio_array_validate(os_aio_log_array);
4247
os_aio_array_validate(os_aio_sync_array);
4252
/**************************************************************************
4253
Prints info of the aio arrays. */
4258
FILE* file) /* in: file where to print */
4260
os_aio_array_t* array;
4261
os_aio_slot_t* slot;
4263
time_t current_time;
4264
double time_elapsed;
4265
double avg_bytes_read;
4268
for (i = 0; i < srv_n_file_io_threads; i++) {
4269
fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
4270
srv_io_thread_op_info[i],
4271
srv_io_thread_function[i]);
4274
if (os_aio_segment_wait_events[i]->is_set) {
4275
fprintf(file, " ev set");
4279
fprintf(file, "\n");
4282
fputs("Pending normal aio reads:", file);
4284
array = os_aio_read_array;
4288
os_mutex_enter(array->mutex);
4290
ut_a(array->n_slots > 0);
4291
ut_a(array->n_segments > 0);
4295
for (i = 0; i < array->n_slots; i++) {
4296
slot = os_aio_array_get_nth_slot(array, i);
4298
if (slot->reserved) {
4301
fprintf(stderr, "Reserved slot, messages %p %p\n",
4302
(void*) slot->message1,
4303
(void*) slot->message2);
4305
ut_a(slot->len > 0);
4309
ut_a(array->n_reserved == n_reserved);
4311
fprintf(file, " %lu", (ulong) n_reserved);
4313
os_mutex_exit(array->mutex);
4315
if (array == os_aio_read_array) {
4316
fputs(", aio writes:", file);
4318
array = os_aio_write_array;
4323
if (array == os_aio_write_array) {
4324
fputs(",\n ibuf aio reads:", file);
4325
array = os_aio_ibuf_array;
4330
if (array == os_aio_ibuf_array) {
4331
fputs(", log i/o's:", file);
4332
array = os_aio_log_array;
4337
if (array == os_aio_log_array) {
4338
fputs(", sync i/o's:", file);
4339
array = os_aio_sync_array;
4345
current_time = time(NULL);
4346
time_elapsed = 0.001 + difftime(current_time, os_last_printout);
4349
"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
4350
"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
4351
(ulong) fil_n_pending_log_flushes,
4352
(ulong) fil_n_pending_tablespace_flushes,
4353
(ulong) os_n_file_reads, (ulong) os_n_file_writes,
4354
(ulong) os_n_fsyncs);
4356
if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
4358
"%lu pending preads, %lu pending pwrites\n",
4359
(ulong) os_file_n_pending_preads,
4360
(ulong) os_file_n_pending_pwrites);
4363
if (os_n_file_reads == os_n_file_reads_old) {
4364
avg_bytes_read = 0.0;
4366
avg_bytes_read = (double) os_bytes_read_since_printout
4367
/ (os_n_file_reads - os_n_file_reads_old);
4371
"%.2f reads/s, %lu avg bytes/read,"
4372
" %.2f writes/s, %.2f fsyncs/s\n",
4373
(os_n_file_reads - os_n_file_reads_old)
4375
(ulong)avg_bytes_read,
4376
(os_n_file_writes - os_n_file_writes_old)
4378
(os_n_fsyncs - os_n_fsyncs_old)
4381
os_n_file_reads_old = os_n_file_reads;
4382
os_n_file_writes_old = os_n_file_writes;
4383
os_n_fsyncs_old = os_n_fsyncs;
4384
os_bytes_read_since_printout = 0;
4386
os_last_printout = current_time;
4389
/**************************************************************************
4390
Refreshes the statistics used to print per-second averages: the "old"
counters are set to the current counter values, the count of bytes read
since the last printout is zeroed, and the last printout time is set to
the current time. */
4393
os_aio_refresh_stats(void)
4394
/*======================*/
4396
/* Snapshot the running totals; os_aio_print() reports the differences
between the running totals and these snapshots */
os_n_file_reads_old = os_n_file_reads;
4397
os_n_file_writes_old = os_n_file_writes;
4398
os_n_fsyncs_old = os_n_fsyncs;
4399
os_bytes_read_since_printout = 0;
4401
/* Start a new measurement interval from the current time */
os_last_printout = time(NULL);
4405
/**************************************************************************
4406
Checks that all slots in the system have been freed, that is, there are
4407
no pending io operations. The n_reserved counts of the read, write, ibuf,
log and sync arrays are summed into n_res, each one read while holding
the mutex of its own array. */
4410
os_aio_all_slots_free(void)
4411
/*=======================*/
4412
/* out: TRUE if all free */
4414
os_aio_array_t* array;
4417
/* Normal read array */
array = os_aio_read_array;
4419
os_mutex_enter(array->mutex);
4421
n_res += array->n_reserved;
4423
os_mutex_exit(array->mutex);
4425
/* Normal write array */
array = os_aio_write_array;
4427
os_mutex_enter(array->mutex);
4429
n_res += array->n_reserved;
4431
os_mutex_exit(array->mutex);
4433
/* Insert buffer array */
array = os_aio_ibuf_array;
4435
os_mutex_enter(array->mutex);
4437
n_res += array->n_reserved;
4439
os_mutex_exit(array->mutex);
4441
/* Log array */
array = os_aio_log_array;
4443
os_mutex_enter(array->mutex);
4445
n_res += array->n_reserved;
4447
os_mutex_exit(array->mutex);
4449
/* Synchronous aio array */
array = os_aio_sync_array;
4451
os_mutex_enter(array->mutex);
4453
n_res += array->n_reserved;
4455
os_mutex_exit(array->mutex);
4464
#endif /* UNIV_DEBUG */