~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/******************************************************
2
The interface to the operating system file i/o primitives
3
4
(c) 1995 Innobase Oy
5
6
Created 10/21/1995 Heikki Tuuri
7
*******************************************************/
8
9
#include "os0file.h"
10
#include "os0sync.h"
11
#include "os0thread.h"
12
#include "ut0mem.h"
13
#include "srv0srv.h"
14
#include "srv0start.h"
15
#include "fil0fil.h"
16
#include "buf0buf.h"
17
18
#if defined(UNIV_HOTBACKUP) && defined(__WIN__)
19
/* Add includes for the _stat() call to compile on Windows */
20
#include <sys/types.h>
21
#include <sys/stat.h>
22
#include <errno.h>
23
#endif /* UNIV_HOTBACKUP */
24
25
#ifdef POSIX_ASYNC_IO
26
/* We assume in this case that the OS has standard Posix aio (at least SunOS
27
2.6, HP-UX 11i and AIX 4.3 have) */
28
29
#endif
30
31
/* This specifies the file permissions InnoDB uses when it creates files in
32
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
33
my_umask */
34
35
#ifndef __WIN__
36
ulint	os_innodb_umask		= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
37
#else
38
ulint	os_innodb_umask		= 0;
39
#endif
40
41
#ifdef UNIV_DO_FLUSH
42
/* If the following is set to TRUE, we do not call os_file_flush in every
43
os_file_write. We can set this TRUE when the doublewrite buffer is used. */
44
ibool	os_do_not_call_flush_at_each_write	= FALSE;
45
#else
46
/* We do not call os_file_flush in every os_file_write. */
47
#endif /* UNIV_DO_FLUSH */
48
49
/* We use these mutexes to protect lseek + file i/o operation, if the
50
OS does not provide an atomic pread or pwrite, or similar */
51
#define OS_FILE_N_SEEK_MUTEXES	16
52
os_mutex_t	os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
53
54
/* In simulated aio, merge at most this many consecutive i/os */
55
#define OS_AIO_MERGE_N_CONSECUTIVE	64
56
57
/* If this flag is TRUE, then we will use the native aio of the
58
OS (provided we compiled Innobase with it in), otherwise we will
59
use simulated aio we build below with threads */
60
61
ibool	os_aio_use_native_aio	= FALSE;
62
63
ibool	os_aio_print_debug	= FALSE;
64
65
/* The aio array slot structure */
66
typedef struct os_aio_slot_struct	os_aio_slot_t;
67
68
struct os_aio_slot_struct{
69
	ibool		is_read;	/* TRUE if a read operation */
70
	ulint		pos;		/* index of the slot in the aio
71
					array */
72
	ibool		reserved;	/* TRUE if this slot is reserved */
73
	time_t		reservation_time;/* time when reserved */
74
	ulint		len;		/* length of the block to read or
75
					write */
76
	byte*		buf;		/* buffer used in i/o */
77
	ulint		type;		/* OS_FILE_READ or OS_FILE_WRITE */
78
	ulint		offset;		/* 32 low bits of file offset in
79
					bytes */
80
	ulint		offset_high;	/* 32 high bits of file offset */
81
	os_file_t	file;		/* file where to read or write */
82
	const char*	name;		/* file name or path */
83
	ibool		io_already_done;/* used only in simulated aio:
84
					TRUE if the physical i/o already
85
					made and only the slot message
86
					needs to be passed to the caller
87
					of os_aio_simulated_handle */
88
	fil_node_t*	message1;	/* message which is given by the */
89
	void*		message2;	/* the requester of an aio operation
90
					and which can be used to identify
91
					which pending aio operation was
92
					completed */
93
#ifdef WIN_ASYNC_IO
94
	os_event_t	event;		/* event object we need in the
95
					OVERLAPPED struct */
96
	OVERLAPPED	control;	/* Windows control block for the
97
					aio request */
98
#elif defined(POSIX_ASYNC_IO)
99
	struct aiocb	control;	/* Posix control block for aio
100
					request */
101
#endif
102
};
103
104
/* The aio array structure */
105
typedef struct os_aio_array_struct	os_aio_array_t;
106
107
struct os_aio_array_struct{
108
	os_mutex_t	mutex;	  /* the mutex protecting the aio array */
109
	os_event_t	not_full; /* The event which is set to the signaled
110
				  state when there is space in the aio
111
				  outside the ibuf segment */
112
	os_event_t	is_empty; /* The event which is set to the signaled
113
				  state when there are no pending i/os
114
				  in this array */
115
	ulint		n_slots;  /* Total number of slots in the aio array.
116
				  This must be divisible by n_threads. */
117
	ulint		n_segments;/* Number of segments in the aio array of
118
				  pending aio requests. A thread can wait
119
				  separately for any one of the segments. */
120
	ulint		n_reserved;/* Number of reserved slots in the
121
				  aio array outside the ibuf segment */
122
	os_aio_slot_t*	slots;	  /* Pointer to the slots in the array */
123
#ifdef __WIN__
124
	os_native_event_t* native_events;
125
				  /* Pointer to an array of OS native event
126
				  handles where we copied the handles from
127
				  slots, in the same order. This can be used
128
				  in WaitForMultipleObjects; used only in
129
				  Windows */
130
#endif
131
};
132
133
/* Array of events used in simulated aio */
134
os_event_t*	os_aio_segment_wait_events	= NULL;
135
136
/* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
137
are NULL when the module has not yet been initialized. */
138
static os_aio_array_t*	os_aio_read_array	= NULL;
139
static os_aio_array_t*	os_aio_write_array	= NULL;
140
static os_aio_array_t*	os_aio_ibuf_array	= NULL;
141
static os_aio_array_t*	os_aio_log_array	= NULL;
142
static os_aio_array_t*	os_aio_sync_array	= NULL;
143
144
static ulint	os_aio_n_segments	= ULINT_UNDEFINED;
145
146
/* If the following is TRUE, read i/o handler threads try to
147
wait until a batch of new read requests have been posted */
148
static ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
149
150
ulint	os_n_file_reads		= 0;
151
ulint	os_bytes_read_since_printout = 0;
152
ulint	os_n_file_writes	= 0;
153
ulint	os_n_fsyncs		= 0;
154
ulint	os_n_file_reads_old	= 0;
155
ulint	os_n_file_writes_old	= 0;
156
ulint	os_n_fsyncs_old		= 0;
157
time_t	os_last_printout;
158
159
ibool	os_has_said_disk_full	= FALSE;
160
161
/* The mutex protecting the following counts of pending I/O operations */
162
static os_mutex_t os_file_count_mutex;
163
ulint	os_file_n_pending_preads  = 0;
164
ulint	os_file_n_pending_pwrites = 0;
165
ulint	os_n_pending_writes = 0;
166
ulint	os_n_pending_reads = 0;
167
168
/***************************************************************************
169
Gets the operating system version. Currently works only on Windows. */
170
171
ulint
172
os_get_os_version(void)
173
/*===================*/
174
		  /* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
175
{
176
#ifdef __WIN__
177
	OSVERSIONINFO	  os_info;
178
179
	os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
180
181
	ut_a(GetVersionEx(&os_info));
182
183
	if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
184
		return(OS_WIN31);
185
	} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
186
		return(OS_WIN95);
187
	} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
188
		if (os_info.dwMajorVersion <= 4) {
189
			return(OS_WINNT);
190
		} else {
191
			return(OS_WIN2000);
192
		}
193
	} else {
194
		ut_error;
195
		return(0);
196
	}
197
#else
198
	ut_error;
199
200
	return(0);
201
#endif
202
}
203
204
/***************************************************************************
205
Retrieves the last error number if an error occurs in a file io function.
206
The number should be retrieved before any other OS calls (because they may
207
overwrite the error number). If the number is not known to this program,
208
the OS error number + 100 is returned. */
209
210
ulint
211
os_file_get_last_error(
212
/*===================*/
213
					/* out: error number, or OS error
214
					number + 100 */
215
	ibool	report_all_errors)	/* in: TRUE if we want an error message
216
					printed of all errors */
217
{
218
	ulint	err;
219
220
#ifdef __WIN__
221
222
	err = (ulint) GetLastError();
223
224
	if (report_all_errors
225
	    || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
226
227
		ut_print_timestamp(stderr);
228
		fprintf(stderr,
229
			"  InnoDB: Operating system error number %lu"
230
			" in a file operation.\n", (ulong) err);
231
232
		if (err == ERROR_PATH_NOT_FOUND) {
233
			fprintf(stderr,
234
				"InnoDB: The error means the system"
235
				" cannot find the path specified.\n");
236
237
			if (srv_is_being_started) {
238
				fprintf(stderr,
239
					"InnoDB: If you are installing InnoDB,"
240
					" remember that you must create\n"
241
					"InnoDB: directories yourself, InnoDB"
242
					" does not create them.\n");
243
			}
244
		} else if (err == ERROR_ACCESS_DENIED) {
245
			fprintf(stderr,
246
				"InnoDB: The error means mysqld does not have"
247
				" the access rights to\n"
248
				"InnoDB: the directory. It may also be"
249
				" you have created a subdirectory\n"
250
				"InnoDB: of the same name as a data file.\n");
251
		} else if (err == ERROR_SHARING_VIOLATION
252
			   || err == ERROR_LOCK_VIOLATION) {
253
			fprintf(stderr,
254
				"InnoDB: The error means that another program"
255
				" is using InnoDB's files.\n"
256
				"InnoDB: This might be a backup or antivirus"
257
				" software or another instance\n"
258
				"InnoDB: of MySQL."
259
				" Please close it to get rid of this error.\n");
260
		} else {
261
			fprintf(stderr,
262
				"InnoDB: Some operating system error numbers"
263
				" are described at\n"
264
				"InnoDB: "
265
				"http://dev.mysql.com/doc/refman/5.1/en/"
266
				"operating-system-error-codes.html\n");
267
		}
268
	}
269
270
	fflush(stderr);
271
272
	if (err == ERROR_FILE_NOT_FOUND) {
273
		return(OS_FILE_NOT_FOUND);
274
	} else if (err == ERROR_DISK_FULL) {
275
		return(OS_FILE_DISK_FULL);
276
	} else if (err == ERROR_FILE_EXISTS) {
277
		return(OS_FILE_ALREADY_EXISTS);
278
	} else if (err == ERROR_SHARING_VIOLATION
279
		   || err == ERROR_LOCK_VIOLATION) {
280
		return(OS_FILE_SHARING_VIOLATION);
281
	} else {
282
		return(100 + err);
283
	}
284
#else
285
	err = (ulint) errno;
286
287
	if (report_all_errors
288
	    || (err != ENOSPC && err != EEXIST)) {
289
290
		ut_print_timestamp(stderr);
291
		fprintf(stderr,
292
			"  InnoDB: Operating system error number %lu"
293
			" in a file operation.\n", (ulong) err);
294
295
		if (err == ENOENT) {
296
			fprintf(stderr,
297
				"InnoDB: The error means the system"
298
				" cannot find the path specified.\n");
299
300
			if (srv_is_being_started) {
301
				fprintf(stderr,
302
					"InnoDB: If you are installing InnoDB,"
303
					" remember that you must create\n"
304
					"InnoDB: directories yourself, InnoDB"
305
					" does not create them.\n");
306
			}
307
		} else if (err == EACCES) {
308
			fprintf(stderr,
309
				"InnoDB: The error means mysqld does not have"
310
				" the access rights to\n"
311
				"InnoDB: the directory.\n");
312
		} else {
313
			if (strerror((int)err) != NULL) {
314
				fprintf(stderr,
315
					"InnoDB: Error number %lu"
316
					" means '%s'.\n",
317
					err, strerror((int)err));
318
			}
319
320
			fprintf(stderr,
321
				"InnoDB: Some operating system"
322
				" error numbers are described at\n"
323
				"InnoDB: "
324
				"http://dev.mysql.com/doc/refman/5.1/en/"
325
				"operating-system-error-codes.html\n");
326
		}
327
	}
328
329
	fflush(stderr);
330
331
	if (err == ENOSPC) {
332
		return(OS_FILE_DISK_FULL);
333
#ifdef POSIX_ASYNC_IO
334
	} else if (err == EAGAIN) {
335
		return(OS_FILE_AIO_RESOURCES_RESERVED);
336
#endif
337
	} else if (err == ENOENT) {
338
		return(OS_FILE_NOT_FOUND);
339
	} else if (err == EEXIST) {
340
		return(OS_FILE_ALREADY_EXISTS);
341
	} else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
342
		return(OS_FILE_PATH_ERROR);
343
	} else {
344
		return(100 + err);
345
	}
346
#endif
347
}
348
349
/********************************************************************
350
Does error handling when a file operation fails.
351
Conditionally exits (calling exit(3)) based on should_exit value and the
352
error type */
353
354
static
355
ibool
356
os_file_handle_error_cond_exit(
357
/*===========================*/
358
					/* out: TRUE if we should retry the
359
					operation */
360
	const char*	name,		/* in: name of a file or NULL */
361
	const char*	operation,	/* in: operation */
362
	ibool		should_exit)	/* in: call exit(3) if unknown error
363
					and this parameter is TRUE */
364
{
365
	ulint	err;
366
367
	err = os_file_get_last_error(FALSE);
368
369
	if (err == OS_FILE_DISK_FULL) {
370
		/* We only print a warning about disk full once */
371
372
		if (os_has_said_disk_full) {
373
374
			return(FALSE);
375
		}
376
377
		if (name) {
378
			ut_print_timestamp(stderr);
379
			fprintf(stderr,
380
				"  InnoDB: Encountered a problem with"
381
				" file %s\n", name);
382
		}
383
384
		ut_print_timestamp(stderr);
385
		fprintf(stderr,
386
			"  InnoDB: Disk is full. Try to clean the disk"
387
			" to free space.\n");
388
389
		os_has_said_disk_full = TRUE;
390
391
		fflush(stderr);
392
393
		return(FALSE);
394
	} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
395
396
		return(TRUE);
397
	} else if (err == OS_FILE_ALREADY_EXISTS
398
		   || err == OS_FILE_PATH_ERROR) {
399
400
		return(FALSE);
401
	} else if (err == OS_FILE_SHARING_VIOLATION) {
402
403
		os_thread_sleep(10000000);  /* 10 sec */
404
		return(TRUE);
405
	} else {
406
		if (name) {
407
			fprintf(stderr, "InnoDB: File name %s\n", name);
408
		}
409
410
		fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
411
			operation);
412
413
		if (should_exit) {
414
			fprintf(stderr, "InnoDB: Cannot continue operation.\n");
415
416
			fflush(stderr);
417
418
			exit(1);
419
		}
420
	}
421
422
	return(FALSE);
423
}
424
425
/********************************************************************
426
Does error handling when a file operation fails. */
427
static
428
ibool
429
os_file_handle_error(
430
/*=================*/
431
				/* out: TRUE if we should retry the
432
				operation */
433
	const char*	name,	/* in: name of a file or NULL */
434
	const char*	operation)/* in: operation */
435
{
436
	/* exit in case of unknown error */
437
	return(os_file_handle_error_cond_exit(name, operation, TRUE));
438
}
439
440
/********************************************************************
441
Does error handling when a file operation fails. */
442
static
443
ibool
444
os_file_handle_error_no_exit(
445
/*=========================*/
446
				/* out: TRUE if we should retry the
447
				operation */
448
	const char*	name,	/* in: name of a file or NULL */
449
	const char*	operation)/* in: operation */
450
{
451
	/* don't exit in case of unknown error */
452
	return(os_file_handle_error_cond_exit(name, operation, FALSE));
453
}
454
455
/* USE_FILE_LOCK is defined unless this is an InnoDB Hot Backup, Windows
 * or Netware build.
 * InnoDB Hot Backup does not lock the data files.
 * On Windows, mandatory locking is used.
 */
#undef USE_FILE_LOCK
#if !defined(UNIV_HOTBACKUP) && !defined(__WIN__) && !defined(__NETWARE__)
# define USE_FILE_LOCK
#endif
463
#ifdef USE_FILE_LOCK
464
/********************************************************************
Obtain an exclusive lock on a file. The lock is a non-blocking fcntl()
write lock covering the whole file; on failure a diagnostic message is
written to stderr. */
static
int
os_file_lock(
/*=========*/
				/* out: 0 on success, -1 on failure */
	int		fd,	/* in: file descriptor */
	const char*	name)	/* in: file name, used only in the
				diagnostic message */
{
	struct flock	lk;
	int		lock_errno;

	lk.l_type = F_WRLCK;
	lk.l_whence = SEEK_SET;
	lk.l_start = lk.l_len = 0;	/* l_len == 0: lock to end of file */

	if (fcntl(fd, F_SETLK, &lk) != -1) {

		return(0);
	}

	/* Save errno at once: the fprintf() calls below are allowed to
	modify it, and the hint about a second mysqld process depends on
	the value that fcntl() set */
	lock_errno = errno;

	fprintf(stderr,
		"InnoDB: Unable to lock %s, error: %d\n", name, lock_errno);

	if (lock_errno == EAGAIN || lock_errno == EACCES) {
		fprintf(stderr,
			"InnoDB: Check that you do not already have"
			" another mysqld process\n"
			"InnoDB: using the same InnoDB data"
			" or log files.\n");
	}

	return(-1);
}
495
#endif /* USE_FILE_LOCK */
496
497
/********************************************************************
498
Creates the seek mutexes used in positioned reads and writes. */
499
500
void
501
os_io_init_simple(void)
502
/*===================*/
503
{
504
	ulint	i;
505
506
	os_file_count_mutex = os_mutex_create(NULL);
507
508
	for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
509
		os_file_seek_mutexes[i] = os_mutex_create(NULL);
510
	}
511
}
512
513
#if !defined(UNIV_HOTBACKUP) && !defined(__NETWARE__)
/*************************************************************************
Creates a temporary file that will be deleted on close.
This function is defined in ha_innodb.cc. */

int
innobase_mysql_tmpfile(void);
/*========================*/
			/* out: temporary file descriptor, or < 0 on error */
#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */

/***************************************************************************
Creates a temporary file.  This function is like tmpfile(3), but
the temporary file is created in the MySQL temporary directory.
On Netware, this function is like tmpfile(3), because the C run-time
library of Netware does not expose the delete-on-close flag.
In a UNIV_HOTBACKUP build the function only raises ut_error. */

FILE*
os_file_create_tmpfile(void)
/*========================*/
			/* out: temporary file handle, or NULL on error */
{
#ifdef UNIV_HOTBACKUP
	ut_error;

	return(NULL);
#else
# ifdef __NETWARE__
	FILE*	stream	= tmpfile();
# else /* __NETWARE__ */
	FILE*	stream	= NULL;
	int	tmp_fd	= innobase_mysql_tmpfile();

	/* Wrap the descriptor in a stdio stream opened for update */
	if (tmp_fd >= 0) {
		stream = fdopen(tmp_fd, "w+b");
	}
# endif /* __NETWARE__ */

	if (stream == NULL) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: unable to create temporary file;"
			" errno: %d\n", errno);
# ifndef __NETWARE__
		/* fdopen() failed on a valid descriptor: do not leak it */
		if (tmp_fd >= 0) {
			close(tmp_fd);
		}
# endif /* !__NETWARE__ */
	}

	return(stream);
#endif /* UNIV_HOTBACKUP */
}
566
567
/***************************************************************************
568
The os_file_opendir() function opens a directory stream corresponding to the
569
directory named by the dirname argument. The directory stream is positioned
570
at the first entry. In both Unix and Windows we automatically skip the '.'
571
and '..' items at the start of the directory listing. */
572
573
os_file_dir_t
574
os_file_opendir(
575
/*============*/
576
					/* out: directory stream, NULL if
577
					error */
578
	const char*	dirname,	/* in: directory name; it must not
579
					contain a trailing '\' or '/' */
580
	ibool		error_is_fatal)	/* in: TRUE if we should treat an
581
					error as a fatal error; if we try to
582
					open symlinks then we do not wish a
583
					fatal error if it happens not to be
584
					a directory */
585
{
586
	os_file_dir_t		dir;
587
#ifdef __WIN__
588
	LPWIN32_FIND_DATA	lpFindFileData;
589
	char			path[OS_FILE_MAX_PATH + 3];
590
591
	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
592
593
	strcpy(path, dirname);
594
	strcpy(path + strlen(path), "\\*");
595
596
	/* Note that in Windows opening the 'directory stream' also retrieves
597
	the first entry in the directory. Since it is '.', that is no problem,
598
	as we will skip over the '.' and '..' entries anyway. */
599
600
	lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
601
602
	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
603
604
	ut_free(lpFindFileData);
605
606
	if (dir == INVALID_HANDLE_VALUE) {
607
608
		if (error_is_fatal) {
609
			os_file_handle_error(dirname, "opendir");
610
		}
611
612
		return(NULL);
613
	}
614
615
	return(dir);
616
#else
617
	dir = opendir(dirname);
618
619
	if (dir == NULL && error_is_fatal) {
620
		os_file_handle_error(dirname, "opendir");
621
	}
622
623
	return(dir);
624
#endif
625
}
626
627
/***************************************************************************
628
Closes a directory stream. */
629
630
int
631
os_file_closedir(
632
/*=============*/
633
				/* out: 0 if success, -1 if failure */
634
	os_file_dir_t	dir)	/* in: directory stream */
635
{
636
#ifdef __WIN__
637
	BOOL		ret;
638
639
	ret = FindClose(dir);
640
641
	if (!ret) {
642
		os_file_handle_error_no_exit(NULL, "closedir");
643
644
		return(-1);
645
	}
646
647
	return(0);
648
#else
649
	int	ret;
650
651
	ret = closedir(dir);
652
653
	if (ret) {
654
		os_file_handle_error_no_exit(NULL, "closedir");
655
	}
656
657
	return(ret);
658
#endif
659
}
660
661
/***************************************************************************
662
This function returns information of the next file in the directory. We jump
663
over the '.' and '..' entries in the directory. */
664
665
int
666
os_file_readdir_next_file(
667
/*======================*/
668
				/* out: 0 if ok, -1 if error, 1 if at the end
669
				of the directory */
670
	const char*	dirname,/* in: directory name or path */
671
	os_file_dir_t	dir,	/* in: directory stream */
672
	os_file_stat_t*	info)	/* in/out: buffer where the info is returned */
673
{
674
#ifdef __WIN__
675
	LPWIN32_FIND_DATA	lpFindFileData;
676
	BOOL			ret;
677
678
	lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
679
next_file:
680
	ret = FindNextFile(dir, lpFindFileData);
681
682
	if (ret) {
683
		ut_a(strlen((char *) lpFindFileData->cFileName)
684
		     < OS_FILE_MAX_PATH);
685
686
		if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
687
		    || strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
688
689
			goto next_file;
690
		}
691
692
		strcpy(info->name, (char *) lpFindFileData->cFileName);
693
694
		info->size = (ib_longlong)(lpFindFileData->nFileSizeLow)
695
			+ (((ib_longlong)(lpFindFileData->nFileSizeHigh))
696
			   << 32);
697
698
		if (lpFindFileData->dwFileAttributes
699
		    & FILE_ATTRIBUTE_REPARSE_POINT) {
700
			/* TODO: test Windows symlinks */
701
			/* TODO: MySQL has apparently its own symlink
702
			implementation in Windows, dbname.sym can
703
			redirect a database directory:
704
			http://dev.mysql.com/doc/refman/5.1/en/
705
			windows-symbolic-links.html */
706
			info->type = OS_FILE_TYPE_LINK;
707
		} else if (lpFindFileData->dwFileAttributes
708
			   & FILE_ATTRIBUTE_DIRECTORY) {
709
			info->type = OS_FILE_TYPE_DIR;
710
		} else {
711
			/* It is probably safest to assume that all other
712
			file types are normal. Better to check them rather
713
			than blindly skip them. */
714
715
			info->type = OS_FILE_TYPE_FILE;
716
		}
717
	}
718
719
	ut_free(lpFindFileData);
720
721
	if (ret) {
722
		return(0);
723
	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
724
725
		return(1);
726
	} else {
727
		os_file_handle_error_no_exit(dirname,
728
					     "readdir_next_file");
729
		return(-1);
730
	}
731
#else
732
	struct dirent*	ent;
733
	char*		full_path;
734
	int		ret;
735
	struct stat	statinfo;
736
#ifdef HAVE_READDIR_R
737
	char		dirent_buf[sizeof(struct dirent)
738
				   + _POSIX_PATH_MAX + 100];
739
	/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
740
	the max file name len; but in most standards, the
741
	length is NAME_MAX; we add 100 to be even safer */
742
#endif
743
744
next_file:
745
746
#ifdef HAVE_READDIR_R
747
	ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
748
749
	if (ret != 0) {
750
		fprintf(stderr,
751
			"InnoDB: cannot read directory %s, error %lu\n",
752
			dirname, (ulong)ret);
753
754
		return(-1);
755
	}
756
757
	if (ent == NULL) {
758
		/* End of directory */
759
760
		return(1);
761
	}
762
763
	ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
764
#else
765
	ent = readdir(dir);
766
767
	if (ent == NULL) {
768
769
		return(1);
770
	}
771
#endif
772
	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
773
774
	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
775
776
		goto next_file;
777
	}
778
779
	strcpy(info->name, ent->d_name);
780
781
	full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
782
783
	sprintf(full_path, "%s/%s", dirname, ent->d_name);
784
785
	ret = stat(full_path, &statinfo);
786
787
	if (ret) {
788
		os_file_handle_error_no_exit(full_path, "stat");
789
790
		ut_free(full_path);
791
792
		return(-1);
793
	}
794
795
	info->size = (ib_longlong)statinfo.st_size;
796
797
	if (S_ISDIR(statinfo.st_mode)) {
798
		info->type = OS_FILE_TYPE_DIR;
799
	} else if (S_ISLNK(statinfo.st_mode)) {
800
		info->type = OS_FILE_TYPE_LINK;
801
	} else if (S_ISREG(statinfo.st_mode)) {
802
		info->type = OS_FILE_TYPE_FILE;
803
	} else {
804
		info->type = OS_FILE_TYPE_UNKNOWN;
805
	}
806
807
	ut_free(full_path);
808
809
	return(0);
810
#endif
811
}
812
813
/*********************************************************************
814
This function attempts to create a directory named pathname. The new directory
815
gets default permissions. On Unix the permissions are (0770 & ~umask). If the
816
directory exists already, nothing is done and the call succeeds, unless the
817
fail_if_exists arguments is true. */
818
819
ibool
820
os_file_create_directory(
821
/*=====================*/
822
					/* out: TRUE if call succeeds,
823
					FALSE on error */
824
	const char*	pathname,	/* in: directory name as
825
					null-terminated string */
826
	ibool		fail_if_exists)	/* in: if TRUE, pre-existing directory
827
					is treated as an error. */
828
{
829
#ifdef __WIN__
830
	BOOL	rcode;
831
832
	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
833
	if (!(rcode != 0
834
	      || (GetLastError() == ERROR_ALREADY_EXISTS
835
		  && !fail_if_exists))) {
836
		/* failure */
837
		os_file_handle_error(pathname, "CreateDirectory");
838
839
		return(FALSE);
840
	}
841
842
	return (TRUE);
843
#else
844
	int	rcode;
845
846
	rcode = mkdir(pathname, 0770);
847
848
	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
849
		/* failure */
850
		os_file_handle_error(pathname, "mkdir");
851
852
		return(FALSE);
853
	}
854
855
	return (TRUE);
856
#endif
857
}
858
859
/********************************************************************
860
A simple function to open or create a file. */
861
862
os_file_t
863
os_file_create_simple(
864
/*==================*/
865
				/* out, own: handle to the file, not defined
866
				if error, error number can be retrieved with
867
				os_file_get_last_error */
868
	const char*	name,	/* in: name of the file or path as a
869
				null-terminated string */
870
	ulint		create_mode,/* in: OS_FILE_OPEN if an existing file is
871
				opened (if does not exist, error), or
872
				OS_FILE_CREATE if a new file is created
873
				(if exists, error), or
874
				OS_FILE_CREATE_PATH if new file
875
				(if exists, error) and subdirectories along
876
				its path are created (if needed)*/
877
	ulint		access_type,/* in: OS_FILE_READ_ONLY or
878
				OS_FILE_READ_WRITE */
879
	ibool*		success)/* out: TRUE if succeed, FALSE if error */
880
{
881
#ifdef __WIN__
882
	os_file_t	file;
883
	DWORD		create_flag;
884
	DWORD		access;
885
	DWORD		attributes	= 0;
886
	ibool		retry;
887
888
try_again:
889
	ut_a(name);
890
891
	if (create_mode == OS_FILE_OPEN) {
892
		create_flag = OPEN_EXISTING;
893
	} else if (create_mode == OS_FILE_CREATE) {
894
		create_flag = CREATE_NEW;
895
	} else if (create_mode == OS_FILE_CREATE_PATH) {
896
		/* create subdirs along the path if needed  */
897
		*success = os_file_create_subdirs_if_needed(name);
898
		if (!*success) {
899
			ut_error;
900
		}
901
		create_flag = CREATE_NEW;
902
		create_mode = OS_FILE_CREATE;
903
	} else {
904
		create_flag = 0;
905
		ut_error;
906
	}
907
908
	if (access_type == OS_FILE_READ_ONLY) {
909
		access = GENERIC_READ;
910
	} else if (access_type == OS_FILE_READ_WRITE) {
911
		access = GENERIC_READ | GENERIC_WRITE;
912
	} else {
913
		access = 0;
914
		ut_error;
915
	}
916
917
	file = CreateFile((LPCTSTR) name,
918
			  access,
919
			  FILE_SHARE_READ | FILE_SHARE_WRITE,
920
			  /* file can be read and written also
921
			  by other processes */
922
			  NULL,	/* default security attributes */
923
			  create_flag,
924
			  attributes,
925
			  NULL);	/* no template file */
926
927
	if (file == INVALID_HANDLE_VALUE) {
928
		*success = FALSE;
929
930
		retry = os_file_handle_error(name,
931
					     create_mode == OS_FILE_OPEN ?
932
					     "open" : "create");
933
		if (retry) {
934
			goto try_again;
935
		}
936
	} else {
937
		*success = TRUE;
938
	}
939
940
	return(file);
941
#else /* __WIN__ */
942
	os_file_t	file;
943
	int		create_flag;
944
	ibool		retry;
945
946
try_again:
947
	ut_a(name);
948
949
	if (create_mode == OS_FILE_OPEN) {
950
		if (access_type == OS_FILE_READ_ONLY) {
951
			create_flag = O_RDONLY;
952
		} else {
953
			create_flag = O_RDWR;
954
		}
955
	} else if (create_mode == OS_FILE_CREATE) {
956
		create_flag = O_RDWR | O_CREAT | O_EXCL;
957
	} else if (create_mode == OS_FILE_CREATE_PATH) {
958
		/* create subdirs along the path if needed  */
959
		*success = os_file_create_subdirs_if_needed(name);
960
		if (!*success) {
961
			return (-1);
962
		}
963
		create_flag = O_RDWR | O_CREAT | O_EXCL;
964
		create_mode = OS_FILE_CREATE;
965
	} else {
966
		create_flag = 0;
967
		ut_error;
968
	}
969
970
	if (create_mode == OS_FILE_CREATE) {
971
		file = open(name, create_flag, S_IRUSR | S_IWUSR
972
			    | S_IRGRP | S_IWGRP);
973
	} else {
974
		file = open(name, create_flag);
975
	}
976
977
	if (file == -1) {
978
		*success = FALSE;
979
980
		retry = os_file_handle_error(name,
981
					     create_mode == OS_FILE_OPEN ?
982
					     "open" : "create");
983
		if (retry) {
984
			goto try_again;
985
		}
986
#ifdef USE_FILE_LOCK
987
	} else if (access_type == OS_FILE_READ_WRITE
988
		   && os_file_lock(file, name)) {
989
		*success = FALSE;
990
		close(file);
991
		file = -1;
992
#endif
993
	} else {
994
		*success = TRUE;
995
	}
996
997
	return(file);
998
#endif /* __WIN__ */
999
}
1000
1001
/********************************************************************
1002
A simple function to open or create a file. */
1003
1004
os_file_t
1005
os_file_create_simple_no_error_handling(
1006
/*====================================*/
1007
				/* out, own: handle to the file, not defined
1008
				if error, error number can be retrieved with
1009
				os_file_get_last_error */
1010
	const char*	name,	/* in: name of the file or path as a
1011
				null-terminated string */
1012
	ulint		create_mode,/* in: OS_FILE_OPEN if an existing file
1013
				is opened (if does not exist, error), or
1014
				OS_FILE_CREATE if a new file is created
1015
				(if exists, error) */
1016
	ulint		access_type,/* in: OS_FILE_READ_ONLY,
1017
				OS_FILE_READ_WRITE, or
1018
				OS_FILE_READ_ALLOW_DELETE; the last option is
1019
				used by a backup program reading the file */
1020
	ibool*		success)/* out: TRUE if succeed, FALSE if error */
1021
{
1022
#ifdef __WIN__
1023
	os_file_t	file;
1024
	DWORD		create_flag;
1025
	DWORD		access;
1026
	DWORD		attributes	= 0;
1027
	DWORD		share_mode	= FILE_SHARE_READ | FILE_SHARE_WRITE;
1028
1029
	ut_a(name);
1030
1031
	if (create_mode == OS_FILE_OPEN) {
1032
		create_flag = OPEN_EXISTING;
1033
	} else if (create_mode == OS_FILE_CREATE) {
1034
		create_flag = CREATE_NEW;
1035
	} else {
1036
		create_flag = 0;
1037
		ut_error;
1038
	}
1039
1040
	if (access_type == OS_FILE_READ_ONLY) {
1041
		access = GENERIC_READ;
1042
	} else if (access_type == OS_FILE_READ_WRITE) {
1043
		access = GENERIC_READ | GENERIC_WRITE;
1044
	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1045
		access = GENERIC_READ;
1046
		share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
1047
			| FILE_SHARE_WRITE;	/* A backup program has to give
1048
						mysqld the maximum freedom to
1049
						do what it likes with the
1050
						file */
1051
	} else {
1052
		access = 0;
1053
		ut_error;
1054
	}
1055
1056
	file = CreateFile((LPCTSTR) name,
1057
			  access,
1058
			  share_mode,
1059
			  NULL,	/* default security attributes */
1060
			  create_flag,
1061
			  attributes,
1062
			  NULL);	/* no template file */
1063
1064
	if (file == INVALID_HANDLE_VALUE) {
1065
		*success = FALSE;
1066
	} else {
1067
		*success = TRUE;
1068
	}
1069
1070
	return(file);
1071
#else /* __WIN__ */
1072
	os_file_t	file;
1073
	int		create_flag;
1074
1075
	ut_a(name);
1076
1077
	if (create_mode == OS_FILE_OPEN) {
1078
		if (access_type == OS_FILE_READ_ONLY) {
1079
			create_flag = O_RDONLY;
1080
		} else {
1081
			create_flag = O_RDWR;
1082
		}
1083
	} else if (create_mode == OS_FILE_CREATE) {
1084
		create_flag = O_RDWR | O_CREAT | O_EXCL;
1085
	} else {
1086
		create_flag = 0;
1087
		ut_error;
1088
	}
1089
1090
	if (create_mode == OS_FILE_CREATE) {
1091
		file = open(name, create_flag, S_IRUSR | S_IWUSR
1092
			    | S_IRGRP | S_IWGRP);
1093
	} else {
1094
		file = open(name, create_flag);
1095
	}
1096
1097
	if (file == -1) {
1098
		*success = FALSE;
1099
#ifdef USE_FILE_LOCK
1100
	} else if (access_type == OS_FILE_READ_WRITE
1101
		   && os_file_lock(file, name)) {
1102
		*success = FALSE;
1103
		close(file);
1104
		file = -1;
1105
#endif
1106
	} else {
1107
		*success = TRUE;
1108
	}
1109
1110
	return(file);
1111
#endif /* __WIN__ */
1112
}
1113
1114
/********************************************************************
Tries to disable OS caching (direct i/o) on an opened file descriptor.
A failure is only reported on stderr: the function returns nothing and
the descriptor is left as it was. If neither directio() nor O_DIRECT is
available at compile time, the function does nothing. */

void
os_file_set_nocache(
/*================*/
	int		fd,		/* in: file descriptor to alter */
	const char*	file_name,	/* in: file name, used only in the
					diagnostic message */
	const char*	operation_name)	/* in: name of the operation that
					produced fd, used only in the
					diagnostic message */
{
	/* some versions of Solaris may not have DIRECTIO_ON */
#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
	if (directio(fd, DIRECTIO_ON) == -1) {
		int	errno_save;

		/* Save errno before any library call can overwrite it */
		errno_save = (int)errno;
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Failed to set DIRECTIO_ON "
			"on file %s: %s: %s, continuing anyway\n",
			file_name, operation_name, strerror(errno_save));
	}
#elif defined(O_DIRECT)
	int	flags;

	/* F_SETFL replaces the complete set of file status flags, so
	O_DIRECT has to be added to the flags that are currently in effect;
	passing O_DIRECT alone would silently clear the other status flags
	of the descriptor */
	flags = fcntl(fd, F_GETFL);

	if (flags == -1 || fcntl(fd, F_SETFL, flags | O_DIRECT) == -1) {
		int	errno_save;

		/* Save errno before any library call can overwrite it */
		errno_save = (int)errno;
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Failed to set O_DIRECT "
			"on file %s: %s: %s, continuing anyway\n",
			file_name, operation_name, strerror(errno_save));
		if (errno_save == EINVAL) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: O_DIRECT is known to result in "
				"'Invalid argument' on Linux on tmpfs, "
				"see MySQL Bug#26662\n");
		}
	}
#endif
}
1158
1159
/********************************************************************
1160
Opens an existing file or creates a new. */
1161
1162
os_file_t
1163
os_file_create(
1164
/*===========*/
1165
				/* out, own: handle to the file, not defined
1166
				if error, error number can be retrieved with
1167
				os_file_get_last_error */
1168
	const char*	name,	/* in: name of the file or path as a
1169
				null-terminated string */
1170
	ulint		create_mode,/* in: OS_FILE_OPEN if an existing file
1171
				is opened (if does not exist, error), or
1172
				OS_FILE_CREATE if a new file is created
1173
				(if exists, error),
1174
				OS_FILE_OVERWRITE if a new file is created
1175
				or an old overwritten;
1176
				OS_FILE_OPEN_RAW, if a raw device or disk
1177
				partition should be opened */
1178
	ulint		purpose,/* in: OS_FILE_AIO, if asynchronous,
1179
				non-buffered i/o is desired,
1180
				OS_FILE_NORMAL, if any normal file;
1181
				NOTE that it also depends on type, os_aio_..
1182
				and srv_.. variables whether we really use
1183
				async i/o or unbuffered i/o: look in the
1184
				function source code for the exact rules */
1185
	ulint		type,	/* in: OS_DATA_FILE or OS_LOG_FILE */
1186
	ibool*		success)/* out: TRUE if succeed, FALSE if error */
1187
{
1188
#ifdef __WIN__
1189
	os_file_t	file;
1190
	DWORD		share_mode	= FILE_SHARE_READ;
1191
	DWORD		create_flag;
1192
	DWORD		attributes;
1193
	ibool		retry;
1194
try_again:
1195
	ut_a(name);
1196
1197
	if (create_mode == OS_FILE_OPEN_RAW) {
1198
		create_flag = OPEN_EXISTING;
1199
		share_mode = FILE_SHARE_WRITE;
1200
	} else if (create_mode == OS_FILE_OPEN
1201
		   || create_mode == OS_FILE_OPEN_RETRY) {
1202
		create_flag = OPEN_EXISTING;
1203
	} else if (create_mode == OS_FILE_CREATE) {
1204
		create_flag = CREATE_NEW;
1205
	} else if (create_mode == OS_FILE_OVERWRITE) {
1206
		create_flag = CREATE_ALWAYS;
1207
	} else {
1208
		create_flag = 0;
1209
		ut_error;
1210
	}
1211
1212
	if (purpose == OS_FILE_AIO) {
1213
		/* If specified, use asynchronous (overlapped) io and no
1214
		buffering of writes in the OS */
1215
		attributes = 0;
1216
#ifdef WIN_ASYNC_IO
1217
		if (os_aio_use_native_aio) {
1218
			attributes = attributes | FILE_FLAG_OVERLAPPED;
1219
		}
1220
#endif
1221
#ifdef UNIV_NON_BUFFERED_IO
1222
		if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1223
			/* Do not use unbuffered i/o to log files because
1224
			value 2 denotes that we do not flush the log at every
1225
			commit, but only once per second */
1226
		} else if (srv_win_file_flush_method
1227
			   == SRV_WIN_IO_UNBUFFERED) {
1228
			attributes = attributes | FILE_FLAG_NO_BUFFERING;
1229
		}
1230
#endif
1231
	} else if (purpose == OS_FILE_NORMAL) {
1232
		attributes = 0;
1233
#ifdef UNIV_NON_BUFFERED_IO
1234
		if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1235
			/* Do not use unbuffered i/o to log files because
1236
			value 2 denotes that we do not flush the log at every
1237
			commit, but only once per second */
1238
		} else if (srv_win_file_flush_method
1239
			   == SRV_WIN_IO_UNBUFFERED) {
1240
			attributes = attributes | FILE_FLAG_NO_BUFFERING;
1241
		}
1242
#endif
1243
	} else {
1244
		attributes = 0;
1245
		ut_error;
1246
	}
1247
1248
	file = CreateFile((LPCTSTR) name,
1249
			  GENERIC_READ | GENERIC_WRITE, /* read and write
1250
							access */
1251
			  share_mode,	/* File can be read also by other
1252
					processes; we must give the read
1253
					permission because of ibbackup. We do
1254
					not give the write permission to
1255
					others because if one would succeed to
1256
					start 2 instances of mysqld on the
1257
					SAME files, that could cause severe
1258
					database corruption! When opening
1259
					raw disk partitions, Microsoft manuals
1260
					say that we must give also the write
1261
					permission. */
1262
			  NULL,	/* default security attributes */
1263
			  create_flag,
1264
			  attributes,
1265
			  NULL);	/* no template file */
1266
1267
	if (file == INVALID_HANDLE_VALUE) {
1268
		*success = FALSE;
1269
1270
		retry = os_file_handle_error(name,
1271
					     create_mode == OS_FILE_CREATE ?
1272
					     "create" : "open");
1273
		if (retry) {
1274
			goto try_again;
1275
		}
1276
	} else {
1277
		*success = TRUE;
1278
	}
1279
1280
	return(file);
1281
#else /* __WIN__ */
1282
	os_file_t	file;
1283
	int		create_flag;
1284
	ibool		retry;
1285
	const char*	mode_str	= NULL;
1286
	const char*	type_str	= NULL;
1287
	const char*	purpose_str	= NULL;
1288
1289
try_again:
1290
	ut_a(name);
1291
1292
	if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
1293
	    || create_mode == OS_FILE_OPEN_RETRY) {
1294
		mode_str = "OPEN";
1295
		create_flag = O_RDWR;
1296
	} else if (create_mode == OS_FILE_CREATE) {
1297
		mode_str = "CREATE";
1298
		create_flag = O_RDWR | O_CREAT | O_EXCL;
1299
	} else if (create_mode == OS_FILE_OVERWRITE) {
1300
		mode_str = "OVERWRITE";
1301
		create_flag = O_RDWR | O_CREAT | O_TRUNC;
1302
	} else {
1303
		create_flag = 0;
1304
		ut_error;
1305
	}
1306
1307
	if (type == OS_LOG_FILE) {
1308
		type_str = "LOG";
1309
	} else if (type == OS_DATA_FILE) {
1310
		type_str = "DATA";
1311
	} else {
1312
		ut_error;
1313
	}
1314
1315
	if (purpose == OS_FILE_AIO) {
1316
		purpose_str = "AIO";
1317
	} else if (purpose == OS_FILE_NORMAL) {
1318
		purpose_str = "NORMAL";
1319
	} else {
1320
		ut_error;
1321
	}
1322
1323
#if 0
1324
	fprintf(stderr, "Opening file %s, mode %s, type %s, purpose %s\n",
1325
		name, mode_str, type_str, purpose_str);
1326
#endif
1327
#ifdef O_SYNC
1328
	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
1329
	O_SYNC because the datasync options seemed to corrupt files in 2001
1330
	in both Linux and Solaris */
1331
	if (type == OS_LOG_FILE
1332
	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1333
1334
# if 0
1335
		fprintf(stderr, "Using O_SYNC for file %s\n", name);
1336
# endif
1337
1338
		create_flag = create_flag | O_SYNC;
1339
	}
1340
#endif /* O_SYNC */
1341
1342
	file = open(name, create_flag, os_innodb_umask);
1343
1344
	if (file == -1) {
1345
		*success = FALSE;
1346
1347
		retry = os_file_handle_error(name,
1348
					     create_mode == OS_FILE_CREATE ?
1349
					     "create" : "open");
1350
		if (retry) {
1351
			goto try_again;
1352
		} else {
1353
			return(file /* -1 */);
1354
		}
1355
	}
1356
	/* else */
1357
1358
	*success = TRUE;
1359
1360
	/* We disable OS caching (O_DIRECT) only on data files */
1361
	if (type != OS_LOG_FILE
1362
	    && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
1363
		
1364
		os_file_set_nocache(file, name, mode_str);
1365
	}
1366
1367
#ifdef USE_FILE_LOCK
1368
	if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
1369
1370
		if (create_mode == OS_FILE_OPEN_RETRY) {
1371
			int i;
1372
			ut_print_timestamp(stderr);
1373
			fputs("  InnoDB: Retrying to lock"
1374
			      " the first data file\n",
1375
			      stderr);
1376
			for (i = 0; i < 100; i++) {
1377
				os_thread_sleep(1000000);
1378
				if (!os_file_lock(file, name)) {
1379
					*success = TRUE;
1380
					return(file);
1381
				}
1382
			}
1383
			ut_print_timestamp(stderr);
1384
			fputs("  InnoDB: Unable to open the first data file\n",
1385
			      stderr);
1386
		}
1387
1388
		*success = FALSE;
1389
		close(file);
1390
		file = -1;
1391
	}
1392
#endif /* USE_FILE_LOCK */
1393
1394
	return(file);
1395
#endif /* __WIN__ */
1396
}
1397
1398
/***************************************************************************
1399
Deletes a file if it exists. The file has to be closed before calling this. */
1400
1401
ibool
1402
os_file_delete_if_exists(
1403
/*=====================*/
1404
				/* out: TRUE if success */
1405
	const char*	name)	/* in: file path as a null-terminated string */
1406
{
1407
#ifdef __WIN__
1408
	BOOL	ret;
1409
	ulint	count	= 0;
1410
loop:
1411
	/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1412
	it */
1413
1414
	ret = DeleteFile((LPCTSTR)name);
1415
1416
	if (ret) {
1417
		return(TRUE);
1418
	}
1419
1420
	if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1421
		/* the file does not exist, this not an error */
1422
1423
		return(TRUE);
1424
	}
1425
1426
	count++;
1427
1428
	if (count > 100 && 0 == (count % 10)) {
1429
		fprintf(stderr,
1430
			"InnoDB: Warning: cannot delete file %s\n"
1431
			"InnoDB: Are you running ibbackup"
1432
			" to back up the file?\n", name);
1433
1434
		os_file_get_last_error(TRUE); /* print error information */
1435
	}
1436
1437
	os_thread_sleep(1000000);	/* sleep for a second */
1438
1439
	if (count > 2000) {
1440
1441
		return(FALSE);
1442
	}
1443
1444
	goto loop;
1445
#else
1446
	int	ret;
1447
1448
	ret = unlink((const char*)name);
1449
1450
	if (ret != 0 && errno != ENOENT) {
1451
		os_file_handle_error_no_exit(name, "delete");
1452
1453
		return(FALSE);
1454
	}
1455
1456
	return(TRUE);
1457
#endif
1458
}
1459
1460
/***************************************************************************
1461
Deletes a file. The file has to be closed before calling this. */
1462
1463
ibool
1464
os_file_delete(
1465
/*===========*/
1466
				/* out: TRUE if success */
1467
	const char*	name)	/* in: file path as a null-terminated string */
1468
{
1469
#ifdef __WIN__
1470
	BOOL	ret;
1471
	ulint	count	= 0;
1472
loop:
1473
	/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1474
	it */
1475
1476
	ret = DeleteFile((LPCTSTR)name);
1477
1478
	if (ret) {
1479
		return(TRUE);
1480
	}
1481
1482
	if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1483
		/* If the file does not exist, we classify this as a 'mild'
1484
		error and return */
1485
1486
		return(FALSE);
1487
	}
1488
1489
	count++;
1490
1491
	if (count > 100 && 0 == (count % 10)) {
1492
		fprintf(stderr,
1493
			"InnoDB: Warning: cannot delete file %s\n"
1494
			"InnoDB: Are you running ibbackup"
1495
			" to back up the file?\n", name);
1496
1497
		os_file_get_last_error(TRUE); /* print error information */
1498
	}
1499
1500
	os_thread_sleep(1000000);	/* sleep for a second */
1501
1502
	if (count > 2000) {
1503
1504
		return(FALSE);
1505
	}
1506
1507
	goto loop;
1508
#else
1509
	int	ret;
1510
1511
	ret = unlink((const char*)name);
1512
1513
	if (ret != 0) {
1514
		os_file_handle_error_no_exit(name, "delete");
1515
1516
		return(FALSE);
1517
	}
1518
1519
	return(TRUE);
1520
#endif
1521
}
1522
1523
/***************************************************************************
1524
Renames a file (can also move it to another directory). It is safest that the
1525
file is closed before calling this function. */
1526
1527
ibool
1528
os_file_rename(
1529
/*===========*/
1530
				/* out: TRUE if success */
1531
	const char*	oldpath,/* in: old file path as a null-terminated
1532
				string */
1533
	const char*	newpath)/* in: new file path */
1534
{
1535
#ifdef __WIN__
1536
	BOOL	ret;
1537
1538
	ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
1539
1540
	if (ret) {
1541
		return(TRUE);
1542
	}
1543
1544
	os_file_handle_error_no_exit(oldpath, "rename");
1545
1546
	return(FALSE);
1547
#else
1548
	int	ret;
1549
1550
	ret = rename((const char*)oldpath, (const char*)newpath);
1551
1552
	if (ret != 0) {
1553
		os_file_handle_error_no_exit(oldpath, "rename");
1554
1555
		return(FALSE);
1556
	}
1557
1558
	return(TRUE);
1559
#endif
1560
}
1561
1562
/***************************************************************************
1563
Closes a file handle. In case of error, error number can be retrieved with
1564
os_file_get_last_error. */
1565
1566
ibool
1567
os_file_close(
1568
/*==========*/
1569
				/* out: TRUE if success */
1570
	os_file_t	file)	/* in, own: handle to a file */
1571
{
1572
#ifdef __WIN__
1573
	BOOL	ret;
1574
1575
	ut_a(file);
1576
1577
	ret = CloseHandle(file);
1578
1579
	if (ret) {
1580
		return(TRUE);
1581
	}
1582
1583
	os_file_handle_error(NULL, "close");
1584
1585
	return(FALSE);
1586
#else
1587
	int	ret;
1588
1589
	ret = close(file);
1590
1591
	if (ret == -1) {
1592
		os_file_handle_error(NULL, "close");
1593
1594
		return(FALSE);
1595
	}
1596
1597
	return(TRUE);
1598
#endif
1599
}
1600
1601
/***************************************************************************
1602
Closes a file handle. */
1603
1604
ibool
1605
os_file_close_no_error_handling(
1606
/*============================*/
1607
				/* out: TRUE if success */
1608
	os_file_t	file)	/* in, own: handle to a file */
1609
{
1610
#ifdef __WIN__
1611
	BOOL	ret;
1612
1613
	ut_a(file);
1614
1615
	ret = CloseHandle(file);
1616
1617
	if (ret) {
1618
		return(TRUE);
1619
	}
1620
1621
	return(FALSE);
1622
#else
1623
	int	ret;
1624
1625
	ret = close(file);
1626
1627
	if (ret == -1) {
1628
1629
		return(FALSE);
1630
	}
1631
1632
	return(TRUE);
1633
#endif
1634
}
1635
1636
/***************************************************************************
1637
Gets a file size. */
1638
1639
ibool
1640
os_file_get_size(
1641
/*=============*/
1642
				/* out: TRUE if success */
1643
	os_file_t	file,	/* in: handle to a file */
1644
	ulint*		size,	/* out: least significant 32 bits of file
1645
				size */
1646
	ulint*		size_high)/* out: most significant 32 bits of size */
1647
{
1648
#ifdef __WIN__
1649
	DWORD	high;
1650
	DWORD	low;
1651
1652
	low = GetFileSize(file, &high);
1653
1654
	if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
1655
		return(FALSE);
1656
	}
1657
1658
	*size = low;
1659
	*size_high = high;
1660
1661
	return(TRUE);
1662
#else
1663
	off_t	offs;
1664
1665
	offs = lseek(file, 0, SEEK_END);
1666
1667
	if (offs == ((off_t)-1)) {
1668
1669
		return(FALSE);
1670
	}
1671
1672
	if (sizeof(off_t) > 4) {
1673
		*size = (ulint)(offs & 0xFFFFFFFFUL);
1674
		*size_high = (ulint)(offs >> 32);
1675
	} else {
1676
		*size = (ulint) offs;
1677
		*size_high = 0;
1678
	}
1679
1680
	return(TRUE);
1681
#endif
1682
}
1683
1684
/***************************************************************************
1685
Gets file size as a 64-bit integer ib_longlong. */
1686
1687
ib_longlong
1688
os_file_get_size_as_iblonglong(
1689
/*===========================*/
1690
				/* out: size in bytes, -1 if error */
1691
	os_file_t	file)	/* in: handle to a file */
1692
{
1693
	ulint	size;
1694
	ulint	size_high;
1695
	ibool	success;
1696
1697
	success = os_file_get_size(file, &size, &size_high);
1698
1699
	if (!success) {
1700
1701
		return(-1);
1702
	}
1703
1704
	return((((ib_longlong)size_high) << 32) + (ib_longlong)size);
1705
}
1706
1707
/***************************************************************************
1708
Write the specified number of zeros to a newly created file. */
1709
1710
ibool
1711
os_file_set_size(
1712
/*=============*/
1713
				/* out: TRUE if success */
1714
	const char*	name,	/* in: name of the file or path as a
1715
				null-terminated string */
1716
	os_file_t	file,	/* in: handle to a file */
1717
	ulint		size,	/* in: least significant 32 bits of file
1718
				size */
1719
	ulint		size_high)/* in: most significant 32 bits of size */
1720
{
1721
	ib_longlong	current_size;
1722
	ib_longlong	desired_size;
1723
	ibool		ret;
1724
	byte*		buf;
1725
	byte*		buf2;
1726
	ulint		buf_size;
1727
1728
	ut_a(size == (size & 0xFFFFFFFF));
1729
1730
	current_size = 0;
1731
	desired_size = (ib_longlong)size + (((ib_longlong)size_high) << 32);
1732
1733
	/* Write up to 1 megabyte at a time. */
1734
	buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
1735
		* UNIV_PAGE_SIZE;
1736
	buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
1737
1738
	/* Align the buffer for possible raw i/o */
1739
	buf = ut_align(buf2, UNIV_PAGE_SIZE);
1740
1741
	/* Write buffer full of zeros */
1742
	memset(buf, 0, buf_size);
1743
1744
	if (desired_size >= (ib_longlong)(100 * 1024 * 1024)) {
1745
1746
		fprintf(stderr, "InnoDB: Progress in MB:");
1747
	}
1748
1749
	while (current_size < desired_size) {
1750
		ulint	n_bytes;
1751
1752
		if (desired_size - current_size < (ib_longlong) buf_size) {
1753
			n_bytes = (ulint) (desired_size - current_size);
1754
		} else {
1755
			n_bytes = buf_size;
1756
		}
1757
1758
		ret = os_file_write(name, file, buf,
1759
				    (ulint)(current_size & 0xFFFFFFFF),
1760
				    (ulint)(current_size >> 32),
1761
				    n_bytes);
1762
		if (!ret) {
1763
			ut_free(buf2);
1764
			goto error_handling;
1765
		}
1766
1767
		/* Print about progress for each 100 MB written */
1768
		if ((ib_longlong) (current_size + n_bytes) / (ib_longlong)(100 * 1024 * 1024)
1769
		    != current_size / (ib_longlong)(100 * 1024 * 1024)) {
1770
1771
			fprintf(stderr, " %lu00",
1772
				(ulong) ((current_size + n_bytes)
1773
					 / (ib_longlong)(100 * 1024 * 1024)));
1774
		}
1775
1776
		current_size += n_bytes;
1777
	}
1778
1779
	if (desired_size >= (ib_longlong)(100 * 1024 * 1024)) {
1780
1781
		fprintf(stderr, "\n");
1782
	}
1783
1784
	ut_free(buf2);
1785
1786
	ret = os_file_flush(file);
1787
1788
	if (ret) {
1789
		return(TRUE);
1790
	}
1791
1792
error_handling:
1793
	return(FALSE);
1794
}
1795
1796
/***************************************************************************
1797
Truncates a file at its current position. */
1798
1799
ibool
1800
os_file_set_eof(
1801
/*============*/
1802
				/* out: TRUE if success */
1803
	FILE*		file)	/* in: file to be truncated */
1804
{
1805
#ifdef __WIN__
1806
	HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
1807
	return(SetEndOfFile(h));
1808
#else /* __WIN__ */
1809
	return(!ftruncate(fileno(file), ftell(file)));
1810
#endif /* __WIN__ */
1811
}
1812
1813
#ifndef __WIN__
1814
/***************************************************************************
1815
Wrapper to fsync(2) that retries the call on some errors.
1816
Returns the value 0 if successful; otherwise the value -1 is returned and
1817
the global variable errno is set to indicate the error. */
1818
1819
static
1820
int
1821
os_file_fsync(
1822
/*==========*/
1823
				/* out: 0 if success, -1 otherwise */
1824
	os_file_t	file)	/* in: handle to a file */
1825
{
1826
	int	ret;
1827
	int	failures;
1828
	ibool	retry;
1829
1830
	failures = 0;
1831
1832
	do {
1833
		ret = fsync(file);
1834
1835
		os_n_fsyncs++;
1836
1837
		if (ret == -1 && errno == ENOLCK) {
1838
1839
			if (failures % 100 == 0) {
1840
1841
				ut_print_timestamp(stderr);
1842
				fprintf(stderr,
1843
					"  InnoDB: fsync(): "
1844
					"No locks available; retrying\n");
1845
			}
1846
1847
			os_thread_sleep(200000 /* 0.2 sec */);
1848
1849
			failures++;
1850
1851
			retry = TRUE;
1852
		} else {
1853
1854
			retry = FALSE;
1855
		}
1856
	} while (retry);
1857
1858
	return(ret);
1859
}
1860
#endif /* !__WIN__ */
1861
1862
/***************************************************************************
1863
Flushes the write buffers of a given file to the disk. */
1864
1865
ibool
1866
os_file_flush(
1867
/*==========*/
1868
				/* out: TRUE if success */
1869
	os_file_t	file)	/* in, own: handle to a file */
1870
{
1871
#ifdef __WIN__
1872
	BOOL	ret;
1873
1874
	ut_a(file);
1875
1876
	os_n_fsyncs++;
1877
1878
	ret = FlushFileBuffers(file);
1879
1880
	if (ret) {
1881
		return(TRUE);
1882
	}
1883
1884
	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
1885
	actually a raw device, we choose to ignore that error if we are using
1886
	raw disks */
1887
1888
	if (srv_start_raw_disk_in_use && GetLastError()
1889
	    == ERROR_INVALID_FUNCTION) {
1890
		return(TRUE);
1891
	}
1892
1893
	os_file_handle_error(NULL, "flush");
1894
1895
	/* It is a fatal error if a file flush does not succeed, because then
1896
	the database can get corrupt on disk */
1897
	ut_error;
1898
1899
	return(FALSE);
1900
#else
1901
	int	ret;
1902
1903
#if defined(HAVE_DARWIN_THREADS)
1904
# ifndef F_FULLFSYNC
1905
	/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
1906
#  define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
1907
# elif F_FULLFSYNC != 51
1908
#  error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
1909
# endif
1910
	/* Apple has disabled fsync() for internal disk drives in OS X. That
1911
	caused corruption for a user when he tested a power outage. Let us in
1912
	OS X use a nonstandard flush method recommended by an Apple
1913
	engineer. */
1914
1915
	if (!srv_have_fullfsync) {
1916
		/* If we are not on an operating system that supports this,
1917
		then fall back to a plain fsync. */
1918
1919
		ret = os_file_fsync(file);
1920
	} else {
1921
		ret = fcntl(file, F_FULLFSYNC, NULL);
1922
1923
		if (ret) {
1924
			/* If we are not on a file system that supports this,
1925
			then fall back to a plain fsync. */
1926
			ret = os_file_fsync(file);
1927
		}
1928
	}
1929
#else
1930
	ret = os_file_fsync(file);
1931
#endif
1932
1933
	if (ret == 0) {
1934
		return(TRUE);
1935
	}
1936
1937
	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
1938
	we choose to ignore that error if we are using raw disks */
1939
1940
	if (srv_start_raw_disk_in_use && errno == EINVAL) {
1941
1942
		return(TRUE);
1943
	}
1944
1945
	ut_print_timestamp(stderr);
1946
1947
	fprintf(stderr,
1948
		"  InnoDB: Error: the OS said file flush did not succeed\n");
1949
1950
	os_file_handle_error(NULL, "flush");
1951
1952
	/* It is a fatal error if a file flush does not succeed, because then
1953
	the database can get corrupt on disk */
1954
	ut_error;
1955
1956
	return(FALSE);
1957
#endif
1958
}
1959
1960
#ifndef __WIN__
1961
/***********************************************************************
1962
Does a synchronous read operation in Posix. */
1963
static
1964
ssize_t
1965
os_file_pread(
1966
/*==========*/
1967
				/* out: number of bytes read, -1 if error */
1968
	os_file_t	file,	/* in: handle to a file */
1969
	void*		buf,	/* in: buffer where to read */
1970
	ulint		n,	/* in: number of bytes to read */
1971
	ulint		offset,	/* in: least significant 32 bits of file
1972
				offset from where to read */
1973
	ulint		offset_high) /* in: most significant 32 bits of
1974
				offset */
1975
{
1976
	off_t	offs;
1977
	ssize_t	n_bytes;
1978
1979
	ut_a((offset & 0xFFFFFFFFUL) == offset);
1980
1981
	/* If off_t is > 4 bytes in size, then we assume we can pass a
1982
	64-bit address */
1983
1984
	if (sizeof(off_t) > 4) {
1985
		offs = (off_t)offset + (((off_t)offset_high) << 32);
1986
1987
	} else {
1988
		offs = (off_t)offset;
1989
1990
		if (offset_high > 0) {
1991
			fprintf(stderr,
1992
				"InnoDB: Error: file read at offset > 4 GB\n");
1993
		}
1994
	}
1995
1996
	os_n_file_reads++;
1997
1998
#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
1999
	os_mutex_enter(os_file_count_mutex);
2000
	os_file_n_pending_preads++;
2001
	os_n_pending_reads++;
2002
	os_mutex_exit(os_file_count_mutex);
2003
2004
	n_bytes = pread(file, buf, (ssize_t)n, offs);
2005
2006
	os_mutex_enter(os_file_count_mutex);
2007
	os_file_n_pending_preads--;
2008
	os_n_pending_reads--;
2009
	os_mutex_exit(os_file_count_mutex);
2010
2011
	return(n_bytes);
2012
#else
2013
	{
2014
		off_t	ret_offset;
2015
		ssize_t	ret;
2016
		ulint	i;
2017
2018
		os_mutex_enter(os_file_count_mutex);
2019
		os_n_pending_reads++;
2020
		os_mutex_exit(os_file_count_mutex);
2021
2022
		/* Protect the seek / read operation with a mutex */
2023
		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2024
2025
		os_mutex_enter(os_file_seek_mutexes[i]);
2026
2027
		ret_offset = lseek(file, offs, SEEK_SET);
2028
2029
		if (ret_offset < 0) {
2030
			ret = -1;
2031
		} else {
2032
			ret = read(file, buf, (ssize_t)n);
2033
		}
2034
2035
		os_mutex_exit(os_file_seek_mutexes[i]);
2036
2037
		os_mutex_enter(os_file_count_mutex);
2038
		os_n_pending_reads--;
2039
		os_mutex_exit(os_file_count_mutex);
2040
2041
		return(ret);
2042
	}
2043
#endif
2044
}
2045
2046
/***********************************************************************
2047
Does a synchronous write operation in Posix. */
2048
static
2049
ssize_t
2050
os_file_pwrite(
2051
/*===========*/
2052
				/* out: number of bytes written, -1 if error */
2053
	os_file_t	file,	/* in: handle to a file */
2054
	const void*	buf,	/* in: buffer from where to write */
2055
	ulint		n,	/* in: number of bytes to write */
2056
	ulint		offset,	/* in: least significant 32 bits of file
2057
				offset where to write */
2058
	ulint		offset_high) /* in: most significant 32 bits of
2059
				offset */
2060
{
2061
	ssize_t	ret;
2062
	off_t	offs;
2063
2064
	ut_a((offset & 0xFFFFFFFFUL) == offset);
2065
2066
	/* If off_t is > 4 bytes in size, then we assume we can pass a
2067
	64-bit address */
2068
2069
	if (sizeof(off_t) > 4) {
2070
		offs = (off_t)offset + (((off_t)offset_high) << 32);
2071
	} else {
2072
		offs = (off_t)offset;
2073
2074
		if (offset_high > 0) {
2075
			fprintf(stderr,
2076
				"InnoDB: Error: file write"
2077
				" at offset > 4 GB\n");
2078
		}
2079
	}
2080
2081
	os_n_file_writes++;
2082
2083
#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2084
	os_mutex_enter(os_file_count_mutex);
2085
	os_file_n_pending_pwrites++;
2086
	os_n_pending_writes++;
2087
	os_mutex_exit(os_file_count_mutex);
2088
2089
	ret = pwrite(file, buf, (ssize_t)n, offs);
2090
2091
	os_mutex_enter(os_file_count_mutex);
2092
	os_file_n_pending_pwrites--;
2093
	os_n_pending_writes--;
2094
	os_mutex_exit(os_file_count_mutex);
2095
2096
# ifdef UNIV_DO_FLUSH
2097
	if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2098
	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2099
	    && !os_do_not_call_flush_at_each_write) {
2100
2101
		/* Always do fsync to reduce the probability that when
2102
		the OS crashes, a database page is only partially
2103
		physically written to disk. */
2104
2105
		ut_a(TRUE == os_file_flush(file));
2106
	}
2107
# endif /* UNIV_DO_FLUSH */
2108
2109
	return(ret);
2110
#else
2111
	{
2112
		off_t	ret_offset;
2113
		ulint	i;
2114
2115
		os_mutex_enter(os_file_count_mutex);
2116
		os_n_pending_writes++;
2117
		os_mutex_exit(os_file_count_mutex);
2118
2119
		/* Protect the seek / write operation with a mutex */
2120
		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2121
2122
		os_mutex_enter(os_file_seek_mutexes[i]);
2123
2124
		ret_offset = lseek(file, offs, SEEK_SET);
2125
2126
		if (ret_offset < 0) {
2127
			ret = -1;
2128
2129
			goto func_exit;
2130
		}
2131
2132
		ret = write(file, buf, (ssize_t)n);
2133
2134
# ifdef UNIV_DO_FLUSH
2135
		if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2136
		    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2137
		    && !os_do_not_call_flush_at_each_write) {
2138
2139
			/* Always do fsync to reduce the probability that when
2140
			the OS crashes, a database page is only partially
2141
			physically written to disk. */
2142
2143
			ut_a(TRUE == os_file_flush(file));
2144
		}
2145
# endif /* UNIV_DO_FLUSH */
2146
2147
func_exit:
2148
		os_mutex_exit(os_file_seek_mutexes[i]);
2149
2150
		os_mutex_enter(os_file_count_mutex);
2151
		os_n_pending_writes--;
2152
		os_mutex_exit(os_file_count_mutex);
2153
2154
		return(ret);
2155
	}
2156
#endif
2157
}
2158
#endif
2159
2160
/***********************************************************************
2161
Requests a synchronous positioned read operation. */
2162
2163
ibool
2164
os_file_read(
2165
/*=========*/
2166
				/* out: TRUE if request was
2167
				successful, FALSE if fail */
2168
	os_file_t	file,	/* in: handle to a file */
2169
	void*		buf,	/* in: buffer where to read */
2170
	ulint		offset,	/* in: least significant 32 bits of file
2171
				offset where to read */
2172
	ulint		offset_high, /* in: most significant 32 bits of
2173
				offset */
2174
	ulint		n)	/* in: number of bytes to read */
2175
{
2176
#ifdef __WIN__
2177
	BOOL		ret;
2178
	DWORD		len;
2179
	DWORD		ret2;
2180
	DWORD		low;
2181
	DWORD		high;
2182
	ibool		retry;
2183
	ulint		i;
2184
2185
	ut_a((offset & 0xFFFFFFFFUL) == offset);
2186
2187
	os_n_file_reads++;
2188
	os_bytes_read_since_printout += n;
2189
2190
try_again:
2191
	ut_ad(file);
2192
	ut_ad(buf);
2193
	ut_ad(n > 0);
2194
2195
	low = (DWORD) offset;
2196
	high = (DWORD) offset_high;
2197
2198
	os_mutex_enter(os_file_count_mutex);
2199
	os_n_pending_reads++;
2200
	os_mutex_exit(os_file_count_mutex);
2201
2202
	/* Protect the seek / read operation with a mutex */
2203
	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2204
2205
	os_mutex_enter(os_file_seek_mutexes[i]);
2206
2207
	ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2208
2209
	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2210
2211
		os_mutex_exit(os_file_seek_mutexes[i]);
2212
2213
		os_mutex_enter(os_file_count_mutex);
2214
		os_n_pending_reads--;
2215
		os_mutex_exit(os_file_count_mutex);
2216
2217
		goto error_handling;
2218
	}
2219
2220
	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2221
2222
	os_mutex_exit(os_file_seek_mutexes[i]);
2223
2224
	os_mutex_enter(os_file_count_mutex);
2225
	os_n_pending_reads--;
2226
	os_mutex_exit(os_file_count_mutex);
2227
2228
	if (ret && len == n) {
2229
		return(TRUE);
2230
	}
2231
#else
2232
	ibool	retry;
2233
	ssize_t	ret;
2234
2235
	os_bytes_read_since_printout += n;
2236
2237
try_again:
2238
	ret = os_file_pread(file, buf, n, offset, offset_high);
2239
2240
	if ((ulint)ret == n) {
2241
2242
		return(TRUE);
2243
	}
2244
2245
	fprintf(stderr,
2246
		"InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
2247
		"InnoDB: Was only able to read %ld.\n",
2248
		(ulong)n, (ulong)offset_high,
2249
		(ulong)offset, (long)ret);
2250
#endif
2251
#ifdef __WIN__
2252
error_handling:
2253
#endif
2254
	retry = os_file_handle_error(NULL, "read");
2255
2256
	if (retry) {
2257
		goto try_again;
2258
	}
2259
2260
	fprintf(stderr,
2261
		"InnoDB: Fatal error: cannot read from file."
2262
		" OS error number %lu.\n",
2263
#ifdef __WIN__
2264
		(ulong) GetLastError()
2265
#else
2266
		(ulong) errno
2267
#endif
2268
		);
2269
	fflush(stderr);
2270
2271
	ut_error;
2272
2273
	return(FALSE);
2274
}
2275
2276
/***********************************************************************
2277
Requests a synchronous positioned read operation. This function does not do
2278
any error handling. In case of error it returns FALSE. */
2279
2280
ibool
2281
os_file_read_no_error_handling(
2282
/*===========================*/
2283
				/* out: TRUE if request was
2284
				successful, FALSE if fail */
2285
	os_file_t	file,	/* in: handle to a file */
2286
	void*		buf,	/* in: buffer where to read */
2287
	ulint		offset,	/* in: least significant 32 bits of file
2288
				offset where to read */
2289
	ulint		offset_high, /* in: most significant 32 bits of
2290
				offset */
2291
	ulint		n)	/* in: number of bytes to read */
2292
{
2293
#ifdef __WIN__
2294
	BOOL		ret;
2295
	DWORD		len;
2296
	DWORD		ret2;
2297
	DWORD		low;
2298
	DWORD		high;
2299
	ibool		retry;
2300
	ulint		i;
2301
2302
	ut_a((offset & 0xFFFFFFFFUL) == offset);
2303
2304
	os_n_file_reads++;
2305
	os_bytes_read_since_printout += n;
2306
2307
try_again:
2308
	ut_ad(file);
2309
	ut_ad(buf);
2310
	ut_ad(n > 0);
2311
2312
	low = (DWORD) offset;
2313
	high = (DWORD) offset_high;
2314
2315
	os_mutex_enter(os_file_count_mutex);
2316
	os_n_pending_reads++;
2317
	os_mutex_exit(os_file_count_mutex);
2318
2319
	/* Protect the seek / read operation with a mutex */
2320
	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2321
2322
	os_mutex_enter(os_file_seek_mutexes[i]);
2323
2324
	ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2325
2326
	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2327
2328
		os_mutex_exit(os_file_seek_mutexes[i]);
2329
2330
		os_mutex_enter(os_file_count_mutex);
2331
		os_n_pending_reads--;
2332
		os_mutex_exit(os_file_count_mutex);
2333
2334
		goto error_handling;
2335
	}
2336
2337
	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2338
2339
	os_mutex_exit(os_file_seek_mutexes[i]);
2340
2341
	os_mutex_enter(os_file_count_mutex);
2342
	os_n_pending_reads--;
2343
	os_mutex_exit(os_file_count_mutex);
2344
2345
	if (ret && len == n) {
2346
		return(TRUE);
2347
	}
2348
#else
2349
	ibool	retry;
2350
	ssize_t	ret;
2351
2352
	os_bytes_read_since_printout += n;
2353
2354
try_again:
2355
	ret = os_file_pread(file, buf, n, offset, offset_high);
2356
2357
	if ((ulint)ret == n) {
2358
2359
		return(TRUE);
2360
	}
2361
#endif
2362
#ifdef __WIN__
2363
error_handling:
2364
#endif
2365
	retry = os_file_handle_error_no_exit(NULL, "read");
2366
2367
	if (retry) {
2368
		goto try_again;
2369
	}
2370
2371
	return(FALSE);
2372
}
2373
2374
/***********************************************************************
2375
Rewind file to its start, read at most size - 1 bytes from it to str, and
2376
NUL-terminate str. All errors are silently ignored. This function is
2377
mostly meant to be used with temporary files. */
2378
2379
void
2380
os_file_read_string(
2381
/*================*/
2382
	FILE*	file,	/* in: file to read from */
2383
	char*	str,	/* in: buffer where to read */
2384
	ulint	size)	/* in: size of buffer */
2385
{
2386
	size_t	flen;
2387
2388
	if (size == 0) {
2389
		return;
2390
	}
2391
2392
	rewind(file);
2393
	flen = fread(str, 1, size - 1, file);
2394
	str[flen] = '\0';
2395
}
2396
2397
/***********************************************************************
2398
Requests a synchronous write operation. */
2399
2400
ibool
2401
os_file_write(
2402
/*==========*/
2403
				/* out: TRUE if request was
2404
				successful, FALSE if fail */
2405
	const char*	name,	/* in: name of the file or path as a
2406
				null-terminated string */
2407
	os_file_t	file,	/* in: handle to a file */
2408
	const void*	buf,	/* in: buffer from which to write */
2409
	ulint		offset,	/* in: least significant 32 bits of file
2410
				offset where to write */
2411
	ulint		offset_high, /* in: most significant 32 bits of
2412
				offset */
2413
	ulint		n)	/* in: number of bytes to write */
2414
{
2415
#ifdef __WIN__
2416
	BOOL		ret;
2417
	DWORD		len;
2418
	DWORD		ret2;
2419
	DWORD		low;
2420
	DWORD		high;
2421
	ulint		i;
2422
	ulint		n_retries	= 0;
2423
	ulint		err;
2424
2425
	ut_a((offset & 0xFFFFFFFF) == offset);
2426
2427
	os_n_file_writes++;
2428
2429
	ut_ad(file);
2430
	ut_ad(buf);
2431
	ut_ad(n > 0);
2432
retry:
2433
	low = (DWORD) offset;
2434
	high = (DWORD) offset_high;
2435
2436
	os_mutex_enter(os_file_count_mutex);
2437
	os_n_pending_writes++;
2438
	os_mutex_exit(os_file_count_mutex);
2439
2440
	/* Protect the seek / write operation with a mutex */
2441
	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2442
2443
	os_mutex_enter(os_file_seek_mutexes[i]);
2444
2445
	ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2446
2447
	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2448
2449
		os_mutex_exit(os_file_seek_mutexes[i]);
2450
2451
		os_mutex_enter(os_file_count_mutex);
2452
		os_n_pending_writes--;
2453
		os_mutex_exit(os_file_count_mutex);
2454
2455
		ut_print_timestamp(stderr);
2456
2457
		fprintf(stderr,
2458
			"  InnoDB: Error: File pointer positioning to"
2459
			" file %s failed at\n"
2460
			"InnoDB: offset %lu %lu. Operating system"
2461
			" error number %lu.\n"
2462
			"InnoDB: Some operating system error numbers"
2463
			" are described at\n"
2464
			"InnoDB: "
2465
			"http://dev.mysql.com/doc/refman/5.1/en/"
2466
			"operating-system-error-codes.html\n",
2467
			name, (ulong) offset_high, (ulong) offset,
2468
			(ulong) GetLastError());
2469
2470
		return(FALSE);
2471
	}
2472
2473
	ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2474
2475
	/* Always do fsync to reduce the probability that when the OS crashes,
2476
	a database page is only partially physically written to disk. */
2477
2478
# ifdef UNIV_DO_FLUSH
2479
	if (!os_do_not_call_flush_at_each_write) {
2480
		ut_a(TRUE == os_file_flush(file));
2481
	}
2482
# endif /* UNIV_DO_FLUSH */
2483
2484
	os_mutex_exit(os_file_seek_mutexes[i]);
2485
2486
	os_mutex_enter(os_file_count_mutex);
2487
	os_n_pending_writes--;
2488
	os_mutex_exit(os_file_count_mutex);
2489
2490
	if (ret && len == n) {
2491
2492
		return(TRUE);
2493
	}
2494
2495
	/* If some background file system backup tool is running, then, at
2496
	least in Windows 2000, we may get here a specific error. Let us
2497
	retry the operation 100 times, with 1 second waits. */
2498
2499
	if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2500
2501
		os_thread_sleep(1000000);
2502
2503
		n_retries++;
2504
2505
		goto retry;
2506
	}
2507
2508
	if (!os_has_said_disk_full) {
2509
2510
		err = (ulint)GetLastError();
2511
2512
		ut_print_timestamp(stderr);
2513
2514
		fprintf(stderr,
2515
			"  InnoDB: Error: Write to file %s failed"
2516
			" at offset %lu %lu.\n"
2517
			"InnoDB: %lu bytes should have been written,"
2518
			" only %lu were written.\n"
2519
			"InnoDB: Operating system error number %lu.\n"
2520
			"InnoDB: Check that your OS and file system"
2521
			" support files of this size.\n"
2522
			"InnoDB: Check also that the disk is not full"
2523
			" or a disk quota exceeded.\n",
2524
			name, (ulong) offset_high, (ulong) offset,
2525
			(ulong) n, (ulong) len, (ulong) err);
2526
2527
		if (strerror((int)err) != NULL) {
2528
			fprintf(stderr,
2529
				"InnoDB: Error number %lu means '%s'.\n",
2530
				(ulong) err, strerror((int)err));
2531
		}
2532
2533
		fprintf(stderr,
2534
			"InnoDB: Some operating system error numbers"
2535
			" are described at\n"
2536
			"InnoDB: "
2537
			"http://dev.mysql.com/doc/refman/5.1/en/"
2538
			"operating-system-error-codes.html\n");
2539
2540
		os_has_said_disk_full = TRUE;
2541
	}
2542
2543
	return(FALSE);
2544
#else
2545
	ssize_t	ret;
2546
2547
	ret = os_file_pwrite(file, buf, n, offset, offset_high);
2548
2549
	if ((ulint)ret == n) {
2550
2551
		return(TRUE);
2552
	}
2553
2554
	if (!os_has_said_disk_full) {
2555
2556
		ut_print_timestamp(stderr);
2557
2558
		fprintf(stderr,
2559
			"  InnoDB: Error: Write to file %s failed"
2560
			" at offset %lu %lu.\n"
2561
			"InnoDB: %lu bytes should have been written,"
2562
			" only %ld were written.\n"
2563
			"InnoDB: Operating system error number %lu.\n"
2564
			"InnoDB: Check that your OS and file system"
2565
			" support files of this size.\n"
2566
			"InnoDB: Check also that the disk is not full"
2567
			" or a disk quota exceeded.\n",
2568
			name, offset_high, offset, n, (long int)ret,
2569
			(ulint)errno);
2570
		if (strerror(errno) != NULL) {
2571
			fprintf(stderr,
2572
				"InnoDB: Error number %lu means '%s'.\n",
2573
				(ulint)errno, strerror(errno));
2574
		}
2575
2576
		fprintf(stderr,
2577
			"InnoDB: Some operating system error numbers"
2578
			" are described at\n"
2579
			"InnoDB: "
2580
			"http://dev.mysql.com/doc/refman/5.1/en/"
2581
			"operating-system-error-codes.html\n");
2582
2583
		os_has_said_disk_full = TRUE;
2584
	}
2585
2586
	return(FALSE);
2587
#endif
2588
}
2589
2590
/***********************************************************************
2591
Check the existence and type of the given file. */
2592
2593
ibool
2594
os_file_status(
2595
/*===========*/
2596
				/* out: TRUE if call succeeded */
2597
	const char*	path,	/* in:	pathname of the file */
2598
	ibool*		exists,	/* out: TRUE if file exists */
2599
	os_file_type_t* type)	/* out: type of the file (if it exists) */
2600
{
2601
#ifdef __WIN__
2602
	int		ret;
2603
	struct _stat	statinfo;
2604
2605
	ret = _stat(path, &statinfo);
2606
	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2607
		/* file does not exist */
2608
		*exists = FALSE;
2609
		return(TRUE);
2610
	} else if (ret) {
2611
		/* file exists, but stat call failed */
2612
2613
		os_file_handle_error_no_exit(path, "stat");
2614
2615
		return(FALSE);
2616
	}
2617
2618
	if (_S_IFDIR & statinfo.st_mode) {
2619
		*type = OS_FILE_TYPE_DIR;
2620
	} else if (_S_IFREG & statinfo.st_mode) {
2621
		*type = OS_FILE_TYPE_FILE;
2622
	} else {
2623
		*type = OS_FILE_TYPE_UNKNOWN;
2624
	}
2625
2626
	*exists = TRUE;
2627
2628
	return(TRUE);
2629
#else
2630
	int		ret;
2631
	struct stat	statinfo;
2632
2633
	ret = stat(path, &statinfo);
2634
	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2635
		/* file does not exist */
2636
		*exists = FALSE;
2637
		return(TRUE);
2638
	} else if (ret) {
2639
		/* file exists, but stat call failed */
2640
2641
		os_file_handle_error_no_exit(path, "stat");
2642
2643
		return(FALSE);
2644
	}
2645
2646
	if (S_ISDIR(statinfo.st_mode)) {
2647
		*type = OS_FILE_TYPE_DIR;
2648
	} else if (S_ISLNK(statinfo.st_mode)) {
2649
		*type = OS_FILE_TYPE_LINK;
2650
	} else if (S_ISREG(statinfo.st_mode)) {
2651
		*type = OS_FILE_TYPE_FILE;
2652
	} else {
2653
		*type = OS_FILE_TYPE_UNKNOWN;
2654
	}
2655
2656
	*exists = TRUE;
2657
2658
	return(TRUE);
2659
#endif
2660
}
2661
2662
/***********************************************************************
2663
This function returns information about the specified file */
2664
2665
ibool
2666
os_file_get_status(
2667
/*===============*/
2668
					/* out: TRUE if stat
2669
					information found */
2670
	const char*	path,		/* in:	pathname of the file */
2671
	os_file_stat_t* stat_info)	/* information of a file in a
2672
					directory */
2673
{
2674
#ifdef __WIN__
2675
	int		ret;
2676
	struct _stat	statinfo;
2677
2678
	ret = _stat(path, &statinfo);
2679
	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2680
		/* file does not exist */
2681
2682
		return(FALSE);
2683
	} else if (ret) {
2684
		/* file exists, but stat call failed */
2685
2686
		os_file_handle_error_no_exit(path, "stat");
2687
2688
		return(FALSE);
2689
	}
2690
	if (_S_IFDIR & statinfo.st_mode) {
2691
		stat_info->type = OS_FILE_TYPE_DIR;
2692
	} else if (_S_IFREG & statinfo.st_mode) {
2693
		stat_info->type = OS_FILE_TYPE_FILE;
2694
	} else {
2695
		stat_info->type = OS_FILE_TYPE_UNKNOWN;
2696
	}
2697
2698
	stat_info->ctime = statinfo.st_ctime;
2699
	stat_info->atime = statinfo.st_atime;
2700
	stat_info->mtime = statinfo.st_mtime;
2701
	stat_info->size	 = statinfo.st_size;
2702
2703
	return(TRUE);
2704
#else
2705
	int		ret;
2706
	struct stat	statinfo;
2707
2708
	ret = stat(path, &statinfo);
2709
2710
	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2711
		/* file does not exist */
2712
2713
		return(FALSE);
2714
	} else if (ret) {
2715
		/* file exists, but stat call failed */
2716
2717
		os_file_handle_error_no_exit(path, "stat");
2718
2719
		return(FALSE);
2720
	}
2721
2722
	if (S_ISDIR(statinfo.st_mode)) {
2723
		stat_info->type = OS_FILE_TYPE_DIR;
2724
	} else if (S_ISLNK(statinfo.st_mode)) {
2725
		stat_info->type = OS_FILE_TYPE_LINK;
2726
	} else if (S_ISREG(statinfo.st_mode)) {
2727
		stat_info->type = OS_FILE_TYPE_FILE;
2728
	} else {
2729
		stat_info->type = OS_FILE_TYPE_UNKNOWN;
2730
	}
2731
2732
	stat_info->ctime = statinfo.st_ctime;
2733
	stat_info->atime = statinfo.st_atime;
2734
	stat_info->mtime = statinfo.st_mtime;
2735
	stat_info->size	 = statinfo.st_size;
2736
2737
	return(TRUE);
2738
#endif
2739
}
2740
2741
/* Character that separates the components of a path name: a backslash
when __WIN__ is defined, a forward slash otherwise */
#ifndef __WIN__
#  define OS_FILE_PATH_SEPARATOR	'/'
#else
#  define OS_FILE_PATH_SEPARATOR	'\\'
#endif
2747
2748
/********************************************************************
2749
The function os_file_dirname returns a directory component of a
2750
null-terminated pathname string.  In the usual case, dirname returns
2751
the string up to, but not including, the final '/', and basename
2752
is the component following the final '/'.  Trailing '/' charac­
2753
ters are not counted as part of the pathname.
2754
2755
If path does not contain a slash, dirname returns the string ".".
2756
2757
Concatenating the string returned by dirname, a "/", and the basename
2758
yields a complete pathname.
2759
2760
The return value is  a copy of the directory component of the pathname.
2761
The copy is allocated from heap. It is the caller responsibility
2762
to free it after it is no longer needed.
2763
2764
The following list of examples (taken from SUSv2) shows the strings
2765
returned by dirname and basename for different paths:
2766
2767
       path	      dirname	     basename
2768
       "/usr/lib"     "/usr"	     "lib"
2769
       "/usr/"	      "/"	     "usr"
2770
       "usr"	      "."	     "usr"
2771
       "/"	      "/"	     "/"
2772
       "."	      "."	     "."
2773
       ".."	      "."	     ".."
2774
*/
2775
2776
char*
2777
os_file_dirname(
2778
/*============*/
2779
				/* out, own: directory component of the
2780
				pathname */
2781
	const char*	path)	/* in: pathname */
2782
{
2783
	/* Find the offset of the last slash */
2784
	const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
2785
	if (!last_slash) {
2786
		/* No slash in the path, return "." */
2787
2788
		return(mem_strdup("."));
2789
	}
2790
2791
	/* Ok, there is a slash */
2792
2793
	if (last_slash == path) {
2794
		/* last slash is the first char of the path */
2795
2796
		return(mem_strdup("/"));
2797
	}
2798
2799
	/* Non-trivial directory component */
2800
2801
	return(mem_strdupl(path, last_slash - path));
2802
}
2803
2804
/********************************************************************
2805
Creates all missing subdirectories along the given path. */
2806
2807
ibool
2808
os_file_create_subdirs_if_needed(
2809
/*=============================*/
2810
				/* out: TRUE if call succeeded
2811
				   FALSE otherwise */
2812
	const char*	path)	/* in: path name */
2813
{
2814
	char*		subdir;
2815
	ibool		success, subdir_exists;
2816
	os_file_type_t	type;
2817
2818
	subdir = os_file_dirname(path);
2819
	if (strlen(subdir) == 1
2820
	    && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
2821
		/* subdir is root or cwd, nothing to do */
2822
		mem_free(subdir);
2823
2824
		return(TRUE);
2825
	}
2826
2827
	/* Test if subdir exists */
2828
	success = os_file_status(subdir, &subdir_exists, &type);
2829
	if (success && !subdir_exists) {
2830
		/* subdir does not exist, create it */
2831
		success = os_file_create_subdirs_if_needed(subdir);
2832
		if (!success) {
2833
			mem_free(subdir);
2834
2835
			return(FALSE);
2836
		}
2837
		success = os_file_create_directory(subdir, FALSE);
2838
	}
2839
2840
	mem_free(subdir);
2841
2842
	return(success);
2843
}
2844
2845
/********************************************************************
2846
Returns a pointer to the nth slot in the aio array. */
2847
static
2848
os_aio_slot_t*
2849
os_aio_array_get_nth_slot(
2850
/*======================*/
2851
					/* out: pointer to slot */
2852
	os_aio_array_t*		array,	/* in: aio array */
2853
	ulint			index)	/* in: index of the slot */
2854
{
2855
	ut_a(index < array->n_slots);
2856
2857
	return((array->slots) + index);
2858
}
2859
2860
/****************************************************************************
2861
Creates an aio wait array. */
2862
static
2863
os_aio_array_t*
2864
os_aio_array_create(
2865
/*================*/
2866
				/* out, own: aio array */
2867
	ulint	n,		/* in: maximum number of pending aio operations
2868
				allowed; n must be divisible by n_segments */
2869
	ulint	n_segments)	/* in: number of segments in the aio array */
2870
{
2871
	os_aio_array_t*	array;
2872
	ulint		i;
2873
	os_aio_slot_t*	slot;
2874
#ifdef WIN_ASYNC_IO
2875
	OVERLAPPED*	over;
2876
#endif
2877
	ut_a(n > 0);
2878
	ut_a(n_segments > 0);
2879
2880
	array = ut_malloc(sizeof(os_aio_array_t));
2881
2882
	array->mutex		= os_mutex_create(NULL);
2883
	array->not_full		= os_event_create(NULL);
2884
	array->is_empty		= os_event_create(NULL);
2885
2886
	os_event_set(array->is_empty);
2887
2888
	array->n_slots		= n;
2889
	array->n_segments	= n_segments;
2890
	array->n_reserved	= 0;
2891
	array->slots		= ut_malloc(n * sizeof(os_aio_slot_t));
2892
#ifdef __WIN__
2893
	array->native_events	= ut_malloc(n * sizeof(os_native_event_t));
2894
#endif
2895
	for (i = 0; i < n; i++) {
2896
		slot = os_aio_array_get_nth_slot(array, i);
2897
2898
		slot->pos = i;
2899
		slot->reserved = FALSE;
2900
#ifdef WIN_ASYNC_IO
2901
		slot->event = os_event_create(NULL);
2902
2903
		over = &(slot->control);
2904
2905
		over->hEvent = slot->event->handle;
2906
2907
		*((array->native_events) + i) = over->hEvent;
2908
#endif
2909
	}
2910
2911
	return(array);
2912
}
2913
2914
/****************************************************************************
2915
Initializes the asynchronous io system. Calls also os_io_init_simple.
2916
Creates a separate aio array for
2917
non-ibuf read and write, a third aio array for the ibuf i/o, with just one
2918
segment, two aio arrays for log reads and writes with one segment, and a
2919
synchronous aio array of the specified size. The combined number of segments
2920
in the three first aio arrays is the parameter n_segments given to the
2921
function. The caller must create an i/o handler thread for each segment in
2922
the four first arrays, but not for the sync aio array. */
2923
2924
void
2925
os_aio_init(
2926
/*========*/
2927
	ulint	n,		/* in: maximum number of pending aio operations
2928
				allowed; n must be divisible by n_segments */
2929
	ulint	n_segments,	/* in: combined number of segments in the four
2930
				first aio arrays; must be >= 4 */
2931
	ulint	n_slots_sync)	/* in: number of slots in the sync aio array */
2932
{
2933
	ulint	n_read_segs;
2934
	ulint	n_write_segs;
2935
	ulint	n_per_seg;
2936
	ulint	i;
2937
#ifdef POSIX_ASYNC_IO
2938
	sigset_t   sigset;
2939
#endif
2940
	ut_ad(n % n_segments == 0);
2941
	ut_ad(n_segments >= 4);
2942
2943
	os_io_init_simple();
2944
2945
	for (i = 0; i < n_segments; i++) {
2946
		srv_set_io_thread_op_info(i, "not started yet");
2947
	}
2948
2949
	n_per_seg = n / n_segments;
2950
	n_write_segs = (n_segments - 2) / 2;
2951
	n_read_segs = n_segments - 2 - n_write_segs;
2952
2953
	/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
2954
2955
	os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
2956
2957
	srv_io_thread_function[0] = "insert buffer thread";
2958
2959
	os_aio_log_array = os_aio_array_create(n_per_seg, 1);
2960
2961
	srv_io_thread_function[1] = "log thread";
2962
2963
	os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
2964
						n_read_segs);
2965
	for (i = 2; i < 2 + n_read_segs; i++) {
2966
		ut_a(i < SRV_MAX_N_IO_THREADS);
2967
		srv_io_thread_function[i] = "read thread";
2968
	}
2969
2970
	os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
2971
						 n_write_segs);
2972
	for (i = 2 + n_read_segs; i < n_segments; i++) {
2973
		ut_a(i < SRV_MAX_N_IO_THREADS);
2974
		srv_io_thread_function[i] = "write thread";
2975
	}
2976
2977
	os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
2978
2979
	os_aio_n_segments = n_segments;
2980
2981
	os_aio_validate();
2982
2983
	os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
2984
2985
	for (i = 0; i < n_segments; i++) {
2986
		os_aio_segment_wait_events[i] = os_event_create(NULL);
2987
	}
2988
2989
	os_last_printout = time(NULL);
2990
2991
#ifdef POSIX_ASYNC_IO
2992
	/* Block aio signals from the current thread and its children:
2993
	for this to work, the current thread must be the first created
2994
	in the database, so that all its children will inherit its
2995
	signal mask */
2996
2997
	/* TODO: to work MySQL needs the SIGALARM signal; the following
2998
	will not work yet! */
2999
	sigemptyset(&sigset);
3000
	sigaddset(&sigset, SIGRTMIN + 1 + 0);
3001
	sigaddset(&sigset, SIGRTMIN + 1 + 1);
3002
	sigaddset(&sigset, SIGRTMIN + 1 + 2);
3003
	sigaddset(&sigset, SIGRTMIN + 1 + 3);
3004
3005
	pthread_sigmask(SIG_BLOCK, &sigset, NULL); */
3006
#endif
3007
		}
3008
3009
#ifdef WIN_ASYNC_IO
/****************************************************************************
Sets the event of every slot in the array, which wakes up the Windows
async i/o threads waiting on those events at shutdown. */
static
void
os_aio_array_wake_win_aio_at_shutdown(
/*==================================*/
	os_aio_array_t*	array)	/* in: aio array */
{
	os_aio_slot_t*	slot	= array->slots;
	os_aio_slot_t*	end	= array->slots + array->n_slots;

	for (; slot != end; slot++) {

		os_event_set(slot->event);
	}
}
#endif
3027
3028
/****************************************************************************
3029
Wakes up all async i/o threads so that they know to exit themselves in
3030
shutdown. */
3031
3032
void
3033
os_aio_wake_all_threads_at_shutdown(void)
3034
/*=====================================*/
3035
{
3036
	ulint	i;
3037
3038
#ifdef WIN_ASYNC_IO
3039
	/* This code wakes up all ai/o threads in Windows native aio */
3040
	os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
3041
	os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
3042
	os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
3043
	os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
3044
#endif
3045
	/* This loop wakes up all simulated ai/o threads */
3046
3047
	for (i = 0; i < os_aio_n_segments; i++) {
3048
3049
		os_event_set(os_aio_segment_wait_events[i]);
3050
	}
3051
}
3052
3053
/****************************************************************************
3054
Waits until there are no pending writes in os_aio_write_array. There can
3055
be other, synchronous, pending writes. */
3056
3057
void
3058
os_aio_wait_until_no_pending_writes(void)
3059
/*=====================================*/
3060
{
3061
	os_event_wait(os_aio_write_array->is_empty);
3062
}
3063
3064
/**************************************************************************
3065
Calculates segment number for a slot. */
3066
static
3067
ulint
3068
os_aio_get_segment_no_from_slot(
3069
/*============================*/
3070
				/* out: segment number (which is the number
3071
				used by, for example, i/o-handler threads) */
3072
	os_aio_array_t*	array,	/* in: aio wait array */
3073
	os_aio_slot_t*	slot)	/* in: slot in this array */
3074
{
3075
	ulint	segment;
3076
	ulint	seg_len;
3077
3078
	if (array == os_aio_ibuf_array) {
3079
		segment = 0;
3080
3081
	} else if (array == os_aio_log_array) {
3082
		segment = 1;
3083
3084
	} else if (array == os_aio_read_array) {
3085
		seg_len = os_aio_read_array->n_slots
3086
			/ os_aio_read_array->n_segments;
3087
3088
		segment = 2 + slot->pos / seg_len;
3089
	} else {
3090
		ut_a(array == os_aio_write_array);
3091
		seg_len = os_aio_write_array->n_slots
3092
			/ os_aio_write_array->n_segments;
3093
3094
		segment = os_aio_read_array->n_segments + 2
3095
			+ slot->pos / seg_len;
3096
	}
3097
3098
	return(segment);
3099
}
3100
3101
/**************************************************************************
3102
Calculates local segment number and aio array from global segment number. */
3103
static
3104
ulint
3105
os_aio_get_array_and_local_segment(
3106
/*===============================*/
3107
					/* out: local segment number within
3108
					the aio array */
3109
	os_aio_array_t** array,		/* out: aio wait array */
3110
	ulint		 global_segment)/* in: global segment number */
3111
{
3112
	ulint	segment;
3113
3114
	ut_a(global_segment < os_aio_n_segments);
3115
3116
	if (global_segment == 0) {
3117
		*array = os_aio_ibuf_array;
3118
		segment = 0;
3119
3120
	} else if (global_segment == 1) {
3121
		*array = os_aio_log_array;
3122
		segment = 0;
3123
3124
	} else if (global_segment < os_aio_read_array->n_segments + 2) {
3125
		*array = os_aio_read_array;
3126
3127
		segment = global_segment - 2;
3128
	} else {
3129
		*array = os_aio_write_array;
3130
3131
		segment = global_segment - (os_aio_read_array->n_segments + 2);
3132
	}
3133
3134
	return(segment);
3135
}
3136
3137
/***********************************************************************
Gets an integer value designating a specified aio array. This is used
to give numbers to signals in Posix aio. */

#if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO)
static
ulint
os_aio_get_array_no(
/*================*/
				/* out: 0 for the ibuf array, 1 for the
				log array, 2 for the read array, 3 for the
				write array */
	os_aio_array_t*	array)	/* in: aio array */
{
	if (array == os_aio_ibuf_array) {

		return(0);
	}

	if (array == os_aio_log_array) {

		return(1);
	}

	if (array == os_aio_read_array) {

		return(2);
	}

	if (array == os_aio_write_array) {

		return(3);
	}

	/* Any other array is a programming error */
	ut_error;

	return(0);
}
3168
3169
/***********************************************************************
3170
Gets the aio array for its number. */
3171
static
3172
os_aio_array_t*
3173
os_aio_get_array_from_no(
3174
/*=====================*/
3175
			/* out: aio array */
3176
	ulint	n)	/* in: array number */
3177
{
3178
	if (n == 0) {
3179
		return(os_aio_ibuf_array);
3180
	} else if (n == 1) {
3181
3182
		return(os_aio_log_array);
3183
	} else if (n == 2) {
3184
3185
		return(os_aio_read_array);
3186
	} else if (n == 3) {
3187
3188
		return(os_aio_write_array);
3189
	} else {
3190
		ut_error;
3191
3192
		return(NULL);
3193
	}
3194
}
3195
#endif /* if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO) */
3196
3197
/***********************************************************************
3198
Requests for a slot in the aio array. If no slot is available, waits until
3199
not_full-event becomes signaled. */
3200
static
3201
os_aio_slot_t*
3202
os_aio_array_reserve_slot(
3203
/*======================*/
3204
				/* out: pointer to slot */
3205
	ulint		type,	/* in: OS_FILE_READ or OS_FILE_WRITE */
3206
	os_aio_array_t*	array,	/* in: aio array */
3207
	fil_node_t*	message1,/* in: message to be passed along with
3208
				the aio operation */
3209
	void*		message2,/* in: message to be passed along with
3210
				the aio operation */
3211
	os_file_t	file,	/* in: file handle */
3212
	const char*	name,	/* in: name of the file or path as a
3213
				null-terminated string */
3214
	void*		buf,	/* in: buffer where to read or from which
3215
				to write */
3216
	ulint		offset,	/* in: least significant 32 bits of file
3217
				offset */
3218
	ulint		offset_high, /* in: most significant 32 bits of
3219
				offset */
3220
	ulint		len)	/* in: length of the block to read or write */
3221
{
3222
	os_aio_slot_t*	slot;
3223
#ifdef WIN_ASYNC_IO
3224
	OVERLAPPED*	control;
3225
3226
#elif defined(POSIX_ASYNC_IO)
3227
3228
	struct aiocb*	control;
3229
#endif
3230
	ulint		i;
3231
loop:
3232
	os_mutex_enter(array->mutex);
3233
3234
	if (array->n_reserved == array->n_slots) {
3235
		os_mutex_exit(array->mutex);
3236
3237
		if (!os_aio_use_native_aio) {
3238
			/* If the handler threads are suspended, wake them
3239
			so that we get more slots */
3240
3241
			os_aio_simulated_wake_handler_threads();
3242
		}
3243
3244
		os_event_wait(array->not_full);
3245
3246
		goto loop;
3247
	}
3248
3249
	for (i = 0;; i++) {
3250
		slot = os_aio_array_get_nth_slot(array, i);
3251
3252
		if (slot->reserved == FALSE) {
3253
			break;
3254
		}
3255
	}
3256
3257
	array->n_reserved++;
3258
3259
	if (array->n_reserved == 1) {
3260
		os_event_reset(array->is_empty);
3261
	}
3262
3263
	if (array->n_reserved == array->n_slots) {
3264
		os_event_reset(array->not_full);
3265
	}
3266
3267
	slot->reserved = TRUE;
3268
	slot->reservation_time = time(NULL);
3269
	slot->message1 = message1;
3270
	slot->message2 = message2;
3271
	slot->file     = file;
3272
	slot->name     = name;
3273
	slot->len      = len;
3274
	slot->type     = type;
3275
	slot->buf      = buf;
3276
	slot->offset   = offset;
3277
	slot->offset_high = offset_high;
3278
	slot->io_already_done = FALSE;
3279
3280
#ifdef WIN_ASYNC_IO
3281
	control = &(slot->control);
3282
	control->Offset = (DWORD)offset;
3283
	control->OffsetHigh = (DWORD)offset_high;
3284
	os_event_reset(slot->event);
3285
3286
#elif defined(POSIX_ASYNC_IO)
3287
3288
#if (UNIV_WORD_SIZE == 8)
3289
	offset = offset + (offset_high << 32);
3290
#else
3291
	ut_a(offset_high == 0);
3292
#endif
3293
	control = &(slot->control);
3294
	control->aio_fildes = file;
3295
	control->aio_buf = buf;
3296
	control->aio_nbytes = len;
3297
	control->aio_offset = offset;
3298
	control->aio_reqprio = 0;
3299
	control->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
3300
	control->aio_sigevent.sigev_signo
3301
		= SIGRTMIN + 1 + os_aio_get_array_no(array);
3302
	/* TODO: How to choose the signal numbers? */
3303
	/*
3304
	fprintf(stderr, "AIO signal number %lu\n",
3305
	(ulint) control->aio_sigevent.sigev_signo);
3306
	*/
3307
	control->aio_sigevent.sigev_value.sival_ptr = slot;
3308
#endif
3309
	os_mutex_exit(array->mutex);
3310
3311
	return(slot);
3312
}
3313
3314
/***********************************************************************
3315
Frees a slot in the aio array. */
3316
static
3317
void
3318
os_aio_array_free_slot(
3319
/*===================*/
3320
	os_aio_array_t*	array,	/* in: aio array */
3321
	os_aio_slot_t*	slot)	/* in: pointer to slot */
3322
{
3323
	ut_ad(array);
3324
	ut_ad(slot);
3325
3326
	os_mutex_enter(array->mutex);
3327
3328
	ut_ad(slot->reserved);
3329
3330
	slot->reserved = FALSE;
3331
3332
	array->n_reserved--;
3333
3334
	if (array->n_reserved == array->n_slots - 1) {
3335
		os_event_set(array->not_full);
3336
	}
3337
3338
	if (array->n_reserved == 0) {
3339
		os_event_set(array->is_empty);
3340
	}
3341
3342
#ifdef WIN_ASYNC_IO
3343
	os_event_reset(slot->event);
3344
#endif
3345
	os_mutex_exit(array->mutex);
3346
}
3347
3348
/**************************************************************************
3349
Wakes up a simulated aio i/o-handler thread if it has something to do. */
3350
static
3351
void
3352
os_aio_simulated_wake_handler_thread(
3353
/*=================================*/
3354
	ulint	global_segment)	/* in: the number of the segment in the aio
3355
				arrays */
3356
{
3357
	os_aio_array_t*	array;
3358
	os_aio_slot_t*	slot;
3359
	ulint		segment;
3360
	ulint		n;
3361
	ulint		i;
3362
3363
	ut_ad(!os_aio_use_native_aio);
3364
3365
	segment = os_aio_get_array_and_local_segment(&array, global_segment);
3366
3367
	n = array->n_slots / array->n_segments;
3368
3369
	/* Look through n slots after the segment * n'th slot */
3370
3371
	os_mutex_enter(array->mutex);
3372
3373
	for (i = 0; i < n; i++) {
3374
		slot = os_aio_array_get_nth_slot(array, i + segment * n);
3375
3376
		if (slot->reserved) {
3377
			/* Found an i/o request */
3378
3379
			break;
3380
		}
3381
	}
3382
3383
	os_mutex_exit(array->mutex);
3384
3385
	if (i < n) {
3386
		os_event_set(os_aio_segment_wait_events[global_segment]);
3387
	}
3388
}
3389
3390
/**************************************************************************
3391
Wakes up simulated aio i/o-handler threads if they have something to do. */
3392
3393
void
3394
os_aio_simulated_wake_handler_threads(void)
3395
/*=======================================*/
3396
{
3397
	ulint	i;
3398
3399
	if (os_aio_use_native_aio) {
3400
		/* We do not use simulated aio: do nothing */
3401
3402
		return;
3403
	}
3404
3405
	os_aio_recommend_sleep_for_read_threads	= FALSE;
3406
3407
	for (i = 0; i < os_aio_n_segments; i++) {
3408
		os_aio_simulated_wake_handler_thread(i);
3409
	}
3410
}
3411
3412
/**************************************************************************
3413
This function can be called if one wants to post a batch of reads and
3414
prefers an i/o-handler thread to handle them all at once later. You must
3415
call os_aio_simulated_wake_handler_threads later to ensure the threads
3416
are not left sleeping! */
3417
3418
void
3419
os_aio_simulated_put_read_threads_to_sleep(void)
3420
/*============================================*/
3421
{
3422
	os_aio_array_t*	array;
3423
	ulint		g;
3424
3425
	os_aio_recommend_sleep_for_read_threads	= TRUE;
3426
3427
	for (g = 0; g < os_aio_n_segments; g++) {
3428
		os_aio_get_array_and_local_segment(&array, g);
3429
3430
		if (array == os_aio_read_array) {
3431
3432
			os_event_reset(os_aio_segment_wait_events[g]);
3433
		}
3434
	}
3435
}
3436
3437
/***********************************************************************
3438
Requests an asynchronous i/o operation. */
3439
3440
ibool
3441
os_aio(
3442
/*===*/
3443
				/* out: TRUE if request was queued
3444
				successfully, FALSE if fail */
3445
	ulint		type,	/* in: OS_FILE_READ or OS_FILE_WRITE */
3446
	ulint		mode,	/* in: OS_AIO_NORMAL, ..., possibly ORed
3447
				to OS_AIO_SIMULATED_WAKE_LATER: the
3448
				last flag advises this function not to wake
3449
				i/o-handler threads, but the caller will
3450
				do the waking explicitly later, in this
3451
				way the caller can post several requests in
3452
				a batch; NOTE that the batch must not be
3453
				so big that it exhausts the slots in aio
3454
				arrays! NOTE that a simulated batch
3455
				may introduce hidden chances of deadlocks,
3456
				because i/os are not actually handled until
3457
				all have been posted: use with great
3458
				caution! */
3459
	const char*	name,	/* in: name of the file or path as a
3460
				null-terminated string */
3461
	os_file_t	file,	/* in: handle to a file */
3462
	void*		buf,	/* in: buffer where to read or from which
3463
				to write */
3464
	ulint		offset,	/* in: least significant 32 bits of file
3465
				offset where to read or write */
3466
	ulint		offset_high, /* in: most significant 32 bits of
3467
				offset */
3468
	ulint		n,	/* in: number of bytes to read or write */
3469
	fil_node_t*	message1,/* in: messages for the aio handler (these
3470
				can be used to identify a completed aio
3471
				operation); if mode is OS_AIO_SYNC, these
3472
				are ignored */
3473
	void*		message2)
3474
{
3475
	os_aio_array_t*	array;
3476
	os_aio_slot_t*	slot;
3477
#ifdef WIN_ASYNC_IO
3478
	ibool		retval;
3479
	BOOL		ret		= TRUE;
3480
	DWORD		len		= (DWORD) n;
3481
	struct fil_node_struct * dummy_mess1;
3482
	void*		dummy_mess2;
3483
	ulint		dummy_type;
3484
#endif
3485
	ulint		err		= 0;
3486
	ibool		retry;
3487
	ulint		wake_later;
3488
3489
	ut_ad(file);
3490
	ut_ad(buf);
3491
	ut_ad(n > 0);
3492
	ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
3493
	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
3494
	ut_ad(os_aio_validate());
3495
3496
	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
3497
	mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
3498
3499
	if (mode == OS_AIO_SYNC
3500
#ifdef WIN_ASYNC_IO
3501
	    && !os_aio_use_native_aio
3502
#endif
3503
	    ) {
3504
		/* This is actually an ordinary synchronous read or write:
3505
		no need to use an i/o-handler thread. NOTE that if we use
3506
		Windows async i/o, Windows does not allow us to use
3507
		ordinary synchronous os_file_read etc. on the same file,
3508
		therefore we have built a special mechanism for synchronous
3509
		wait in the Windows case. */
3510
3511
		if (type == OS_FILE_READ) {
3512
			return(os_file_read(file, buf, offset,
3513
					    offset_high, n));
3514
		}
3515
3516
		ut_a(type == OS_FILE_WRITE);
3517
3518
		return(os_file_write(name, file, buf, offset, offset_high, n));
3519
	}
3520
3521
try_again:
3522
	if (mode == OS_AIO_NORMAL) {
3523
		if (type == OS_FILE_READ) {
3524
			array = os_aio_read_array;
3525
		} else {
3526
			array = os_aio_write_array;
3527
		}
3528
	} else if (mode == OS_AIO_IBUF) {
3529
		ut_ad(type == OS_FILE_READ);
3530
		/* Reduce probability of deadlock bugs in connection with ibuf:
3531
		do not let the ibuf i/o handler sleep */
3532
3533
		wake_later = FALSE;
3534
3535
		array = os_aio_ibuf_array;
3536
	} else if (mode == OS_AIO_LOG) {
3537
3538
		array = os_aio_log_array;
3539
	} else if (mode == OS_AIO_SYNC) {
3540
		array = os_aio_sync_array;
3541
	} else {
3542
		array = NULL; /* Eliminate compiler warning */
3543
		ut_error;
3544
	}
3545
3546
	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
3547
					 name, buf, offset, offset_high, n);
3548
	if (type == OS_FILE_READ) {
3549
		if (os_aio_use_native_aio) {
3550
#ifdef WIN_ASYNC_IO
3551
			os_n_file_reads++;
3552
			os_bytes_read_since_printout += len;
3553
3554
			ret = ReadFile(file, buf, (DWORD)n, &len,
3555
				       &(slot->control));
3556
#elif defined(POSIX_ASYNC_IO)
3557
			slot->control.aio_lio_opcode = LIO_READ;
3558
			err = (ulint) aio_read(&(slot->control));
3559
			fprintf(stderr, "Starting POSIX aio read %lu\n", err);
3560
#endif
3561
		} else {
3562
			if (!wake_later) {
3563
				os_aio_simulated_wake_handler_thread(
3564
					os_aio_get_segment_no_from_slot(
3565
						array, slot));
3566
			}
3567
		}
3568
	} else if (type == OS_FILE_WRITE) {
3569
		if (os_aio_use_native_aio) {
3570
#ifdef WIN_ASYNC_IO
3571
			os_n_file_writes++;
3572
			ret = WriteFile(file, buf, (DWORD)n, &len,
3573
					&(slot->control));
3574
#elif defined(POSIX_ASYNC_IO)
3575
			slot->control.aio_lio_opcode = LIO_WRITE;
3576
			err = (ulint) aio_write(&(slot->control));
3577
			fprintf(stderr, "Starting POSIX aio write %lu\n", err);
3578
#endif
3579
		} else {
3580
			if (!wake_later) {
3581
				os_aio_simulated_wake_handler_thread(
3582
					os_aio_get_segment_no_from_slot(
3583
						array, slot));
3584
			}
3585
		}
3586
	} else {
3587
		ut_error;
3588
	}
3589
3590
#ifdef WIN_ASYNC_IO
3591
	if (os_aio_use_native_aio) {
3592
		if ((ret && len == n)
3593
		    || (!ret && GetLastError() == ERROR_IO_PENDING)) {
3594
			/* aio was queued successfully! */
3595
3596
			if (mode == OS_AIO_SYNC) {
3597
				/* We want a synchronous i/o operation on a
3598
				file where we also use async i/o: in Windows
3599
				we must use the same wait mechanism as for
3600
				async i/o */
3601
3602
				retval = os_aio_windows_handle(ULINT_UNDEFINED,
3603
							       slot->pos,
3604
							       &dummy_mess1,
3605
							       &dummy_mess2,
3606
							       &dummy_type);
3607
3608
				return(retval);
3609
			}
3610
3611
			return(TRUE);
3612
		}
3613
3614
		err = 1; /* Fall through the next if */
3615
	}
3616
#endif
3617
	if (err == 0) {
3618
		/* aio was queued successfully! */
3619
3620
		return(TRUE);
3621
	}
3622
3623
	os_aio_array_free_slot(array, slot);
3624
3625
	retry = os_file_handle_error(name,
3626
				     type == OS_FILE_READ
3627
				     ? "aio read" : "aio write");
3628
	if (retry) {
3629
3630
		goto try_again;
3631
	}
3632
3633
	return(FALSE);
3634
}
3635
3636
#ifdef WIN_ASYNC_IO
/**************************************************************************
This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. The aio array of pending requests
is divided into segments, and the calling thread specifies which segment
(or, in sync aio, which slot) it wants to wait for. NOTE: this function
will also take care of freeing the aio slot, therefore no other thread is
allowed to do the freeing! */

ibool
os_aio_windows_handle(
/*==================*/
				/* out: TRUE if the aio operation succeeded */
	ulint	segment,	/* in: the number of the segment in the aio
				arrays to wait for; segment 0 is the ibuf
				i/o thread, segment 1 the log i/o thread,
				then follow the non-ibuf read threads, and as
				the last are the non-ibuf write threads; if
				this is ULINT_UNDEFINED, then it means that
				sync aio is used, and this parameter is
				ignored */
	ulint	pos,		/* this parameter is used only in sync aio:
				wait for the aio slot at this position */
	fil_node_t**message1,	/* out: the messages passed with the aio
				request; note that also in the case where
				the aio operation failed, these output
				parameters are valid and can be used to
				restart the operation, for example */
	void**	message2,
	ulint*	type)		/* out: OS_FILE_WRITE or ..._READ */
{
	ulint		orig_seg	= segment;
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
	ulint		slots_per_seg;
	ulint		local_pos;
	ibool		succeeded;
	BOOL		ok;
	DWORD		n_transferred;

	if (segment != ULINT_UNDEFINED) {
		segment = os_aio_get_array_and_local_segment(&array, segment);
	} else {
		array = os_aio_sync_array;
		segment = 0;
	}

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate());
	ut_ad(segment < array->n_segments);

	slots_per_seg = array->n_slots / array->n_segments;

	if (array != os_aio_sync_array) {
		/* Wait for any slot event of the segment to be signaled */

		srv_set_io_thread_op_info(orig_seg, "wait Windows aio");

		local_pos = os_event_wait_multiple(
			slots_per_seg,
			(array->native_events) + segment * slots_per_seg);
	} else {
		/* Sync aio: wait for the one slot the caller named */

		os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);

		local_pos = pos;
	}

	os_mutex_enter(array->mutex);

	slot = os_aio_array_get_nth_slot(array,
					 local_pos + segment * slots_per_seg);

	ut_a(slot->reserved);

	if (orig_seg != ULINT_UNDEFINED) {
		srv_set_io_thread_op_info(orig_seg,
					  "get windows aio return value");
	}

	ok = GetOverlappedResult(slot->file, &(slot->control),
				 &n_transferred, TRUE);

	/* The outputs are filled in regardless of the outcome */

	*message1 = slot->message1;
	*message2 = slot->message2;
	*type = slot->type;

	if (!ok || n_transferred != slot->len) {
		os_file_handle_error(slot->name, "Windows aio");

		succeeded = FALSE;
	} else {
		succeeded = TRUE;

# ifdef UNIV_DO_FLUSH
		if (slot->type == OS_FILE_WRITE
		    && !os_do_not_call_flush_at_each_write) {
			ut_a(TRUE == os_file_flush(slot->file));
		}
# endif /* UNIV_DO_FLUSH */
	}

	os_mutex_exit(array->mutex);

	os_aio_array_free_slot(array, slot);

	return(succeeded);
}
#endif
3741
3742
#ifdef POSIX_ASYNC_IO

/**************************************************************************
This function is only used in Posix asynchronous i/o. Waits for the
completion signal of an aio operation in the given array, returns the
messages stored in the completed slot, and frees that slot.
NOTE(review): the completion status of the request itself (aio_error /
aio_return) is not inspected here; TRUE is returned whenever the expected
signal arrives. */

ibool
os_aio_posix_handle(
/*================*/
				/* out: TRUE if the aio operation succeeded */
	ulint	array_no,	/* in: array number 0 - 3 */
	fil_node_t**message1,	/* out: the messages passed with the aio
				request; note that also in the case where
				the aio operation failed, these output
				parameters are valid and can be used to
				restart the operation, for example */
	void**	message2)
{
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
	siginfo_t	info;
	sigset_t	sigset;
	int		expected_sig;
	int		sig;

	/* Each aio array has its own signal number, see the reservation of
	a slot where sigev_signo is set to the same value */
	expected_sig = (int) (SIGRTMIN + 1 + array_no);

	sigemptyset(&sigset);
	sigaddset(&sigset, expected_sig);

	/* NOTE(review): POSIX expects the signals in the set to be blocked
	when sigwaitinfo() is called; this unblock is kept as it was, confirm
	whether it is intended */
	pthread_sigmask(SIG_UNBLOCK, &sigset, NULL);

#if 0
	{
		sigset_t	proc_sigset;
		sigset_t	thr_sigset;
		int		i;

		sigprocmask(0, NULL, &proc_sigset);
		pthread_sigmask(0, NULL, &thr_sigset);

		for (i = 32 ; i < 40; i++) {
			fprintf(stderr, "%lu : %lu %lu\n", (ulint)i,
				(ulint) sigismember(&proc_sigset, i),
				(ulint) sigismember(&thr_sigset, i));
		}
	}
#endif

	/* sigwaitinfo() returns the number of the accepted signal, or -1
	on error. The check below must use this return value: previously
	it was stored in one variable while another, never assigned one
	was tested. */
	sig = sigwaitinfo(&sigset, &info);

	if (sig != expected_sig) {

		ut_error;

		return(FALSE);
	}

	fputs("Handling POSIX aio\n", stderr);

	array = os_aio_get_array_from_no(array_no);

	os_mutex_enter(array->mutex);

	/* The slot pointer was attached to the signal when the request
	was set up (sigev_value.sival_ptr) */
	slot = info.si_value.sival_ptr;

	ut_a(slot->reserved);

	*message1 = slot->message1;
	*message2 = slot->message2;

# ifdef UNIV_DO_FLUSH
	if (slot->type == OS_FILE_WRITE
	    && !os_do_not_call_flush_at_each_write) {
		ut_a(TRUE == os_file_flush(slot->file));
	}
# endif /* UNIV_DO_FLUSH */

	os_mutex_exit(array->mutex);

	os_aio_array_free_slot(array, slot);

	return(TRUE);
}
#endif
3822
3823
/**************************************************************************
3824
Do a 'last millisecond' check that the page end is sensible;
3825
reported page checksum errors from Linux seem to wipe over the page end. */
3826
static
3827
void
3828
os_file_check_page_trailers(
3829
/*========================*/
3830
	byte*	combined_buf,	/* in: combined write buffer */
3831
	ulint	total_len)	/* in: size of combined_buf, in bytes
3832
				(a multiple of UNIV_PAGE_SIZE) */
3833
{
3834
	ulint	len;
3835
3836
	for (len = 0; len + UNIV_PAGE_SIZE <= total_len;
3837
	     len += UNIV_PAGE_SIZE) {
3838
		byte*	buf = combined_buf + len;
3839
3840
		if (UNIV_UNLIKELY
3841
		    (memcmp(buf + (FIL_PAGE_LSN + 4),
3842
			    buf + (UNIV_PAGE_SIZE
3843
				   - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
3844
		    	ut_print_timestamp(stderr);
3845
		    	fprintf(stderr,
3846
				"  InnoDB: ERROR: The page to be written"
3847
				" seems corrupt!\n"
3848
				"InnoDB: Writing a block of %lu bytes,"
3849
				" currently at offset %lu\n",
3850
				(ulong)total_len, (ulong)len);
3851
			buf_page_print(buf);
3852
		    	fprintf(stderr,
3853
				"InnoDB: ERROR: The page to be written"
3854
				" seems corrupt!\n");
3855
		}
3856
	}
3857
}
3858
3859
/**************************************************************************
3860
Does simulated aio. This function should be called by an i/o-handler
3861
thread. */
3862
3863
ibool
3864
os_aio_simulated_handle(
3865
/*====================*/
3866
				/* out: TRUE if the aio operation succeeded */
3867
	ulint	global_segment,	/* in: the number of the segment in the aio
3868
				arrays to wait for; segment 0 is the ibuf
3869
				i/o thread, segment 1 the log i/o thread,
3870
				then follow the non-ibuf read threads, and as
3871
				the last are the non-ibuf write threads */
3872
	fil_node_t**message1,	/* out: the messages passed with the aio
3873
				request; note that also in the case where
3874
				the aio operation failed, these output
3875
				parameters are valid and can be used to
3876
				restart the operation, for example */
3877
	void**	message2,
3878
	ulint*	type)		/* out: OS_FILE_WRITE or ..._READ */
3879
{
3880
	os_aio_array_t*	array;
3881
	ulint		segment;
3882
	os_aio_slot_t*	slot;
3883
	os_aio_slot_t*	slot2;
3884
	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
3885
	ulint		n_consecutive;
3886
	ulint		total_len;
3887
	ulint		offs;
3888
	ulint		lowest_offset;
3889
	ulint		biggest_age;
3890
	ulint		age;
3891
	byte*		combined_buf;
3892
	byte*		combined_buf2;
3893
	ibool		ret;
3894
	ulint		n;
3895
	ulint		i;
3896
3897
	segment = os_aio_get_array_and_local_segment(&array, global_segment);
3898
3899
restart:
3900
	/* NOTE! We only access constant fields in os_aio_array. Therefore
3901
	we do not have to acquire the protecting mutex yet */
3902
3903
	srv_set_io_thread_op_info(global_segment,
3904
				  "looking for i/o requests (a)");
3905
	ut_ad(os_aio_validate());
3906
	ut_ad(segment < array->n_segments);
3907
3908
	n = array->n_slots / array->n_segments;
3909
3910
	/* Look through n slots after the segment * n'th slot */
3911
3912
	if (array == os_aio_read_array
3913
	    && os_aio_recommend_sleep_for_read_threads) {
3914
3915
		/* Give other threads chance to add several i/os to the array
3916
		at once. */
3917
3918
		goto recommended_sleep;
3919
	}
3920
3921
	os_mutex_enter(array->mutex);
3922
3923
	srv_set_io_thread_op_info(global_segment,
3924
				  "looking for i/o requests (b)");
3925
3926
	/* Check if there is a slot for which the i/o has already been
3927
	done */
3928
3929
	for (i = 0; i < n; i++) {
3930
		slot = os_aio_array_get_nth_slot(array, i + segment * n);
3931
3932
		if (slot->reserved && slot->io_already_done) {
3933
3934
			if (os_aio_print_debug) {
3935
				fprintf(stderr,
3936
					"InnoDB: i/o for slot %lu"
3937
					" already done, returning\n",
3938
					(ulong) i);
3939
			}
3940
3941
			ret = TRUE;
3942
3943
			goto slot_io_done;
3944
		}
3945
	}
3946
3947
	n_consecutive = 0;
3948
3949
	/* If there are at least 2 seconds old requests, then pick the oldest
3950
	one to prevent starvation. If several requests have the same age,
3951
	then pick the one at the lowest offset. */
3952
3953
	biggest_age = 0;
3954
	lowest_offset = ULINT_MAX;
3955
3956
	for (i = 0; i < n; i++) {
3957
		slot = os_aio_array_get_nth_slot(array, i + segment * n);
3958
3959
		if (slot->reserved) {
3960
			age = (ulint)difftime(time(NULL),
3961
					      slot->reservation_time);
3962
3963
			if ((age >= 2 && age > biggest_age)
3964
			    || (age >= 2 && age == biggest_age
3965
				&& slot->offset < lowest_offset)) {
3966
3967
				/* Found an i/o request */
3968
				consecutive_ios[0] = slot;
3969
3970
				n_consecutive = 1;
3971
3972
				biggest_age = age;
3973
				lowest_offset = slot->offset;
3974
			}
3975
		}
3976
	}
3977
3978
	if (n_consecutive == 0) {
3979
		/* There were no old requests. Look for an i/o request at the
3980
		lowest offset in the array (we ignore the high 32 bits of the
3981
		offset in these heuristics) */
3982
3983
		lowest_offset = ULINT_MAX;
3984
3985
		for (i = 0; i < n; i++) {
3986
			slot = os_aio_array_get_nth_slot(array,
3987
							 i + segment * n);
3988
3989
			if (slot->reserved && slot->offset < lowest_offset) {
3990
3991
				/* Found an i/o request */
3992
				consecutive_ios[0] = slot;
3993
3994
				n_consecutive = 1;
3995
3996
				lowest_offset = slot->offset;
3997
			}
3998
		}
3999
	}
4000
4001
	if (n_consecutive == 0) {
4002
4003
		/* No i/o requested at the moment */
4004
4005
		goto wait_for_io;
4006
	}
4007
4008
	slot = consecutive_ios[0];
4009
4010
	/* Check if there are several consecutive blocks to read or write */
4011
4012
consecutive_loop:
4013
	for (i = 0; i < n; i++) {
4014
		slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
4015
4016
		if (slot2->reserved && slot2 != slot
4017
		    && slot2->offset == slot->offset + slot->len
4018
		    /* check that sum does not wrap over */
4019
		    && slot->offset + slot->len > slot->offset
4020
		    && slot2->offset_high == slot->offset_high
4021
		    && slot2->type == slot->type
4022
		    && slot2->file == slot->file) {
4023
4024
			/* Found a consecutive i/o request */
4025
4026
			consecutive_ios[n_consecutive] = slot2;
4027
			n_consecutive++;
4028
4029
			slot = slot2;
4030
4031
			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
4032
4033
				goto consecutive_loop;
4034
			} else {
4035
				break;
4036
			}
4037
		}
4038
	}
4039
4040
	srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
4041
4042
	/* We have now collected n_consecutive i/o requests in the array;
4043
	allocate a single buffer which can hold all data, and perform the
4044
	i/o */
4045
4046
	total_len = 0;
4047
	slot = consecutive_ios[0];
4048
4049
	for (i = 0; i < n_consecutive; i++) {
4050
		total_len += consecutive_ios[i]->len;
4051
	}
4052
4053
	if (n_consecutive == 1) {
4054
		/* We can use the buffer of the i/o request */
4055
		combined_buf = slot->buf;
4056
		combined_buf2 = NULL;
4057
	} else {
4058
		combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
4059
4060
		ut_a(combined_buf2);
4061
4062
		combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
4063
	}
4064
4065
	/* We release the array mutex for the time of the i/o: NOTE that
4066
	this assumes that there is just one i/o-handler thread serving
4067
	a single segment of slots! */
4068
4069
	os_mutex_exit(array->mutex);
4070
4071
	if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
4072
		/* Copy the buffers to the combined buffer */
4073
		offs = 0;
4074
4075
		for (i = 0; i < n_consecutive; i++) {
4076
4077
			ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
4078
				  consecutive_ios[i]->len);
4079
			offs += consecutive_ios[i]->len;
4080
		}
4081
	}
4082
4083
	srv_set_io_thread_op_info(global_segment, "doing file i/o");
4084
4085
	if (os_aio_print_debug) {
4086
		fprintf(stderr,
4087
			"InnoDB: doing i/o of type %lu at offset %lu %lu,"
4088
			" length %lu\n",
4089
			(ulong) slot->type, (ulong) slot->offset_high,
4090
			(ulong) slot->offset, (ulong) total_len);
4091
	}
4092
4093
	/* Do the i/o with ordinary, synchronous i/o functions: */
4094
	if (slot->type == OS_FILE_WRITE) {
4095
		if (array == os_aio_write_array) {
4096
			if ((total_len % UNIV_PAGE_SIZE != 0)
4097
			    || (slot->offset % UNIV_PAGE_SIZE != 0)) {
4098
				fprintf(stderr,
4099
					"InnoDB: Error: trying a displaced"
4100
					" write to %s %lu %lu, len %lu\n",
4101
					slot->name, (ulong) slot->offset_high,
4102
					(ulong) slot->offset,
4103
					(ulong) total_len);
4104
				ut_error;
4105
			}
4106
4107
			os_file_check_page_trailers(combined_buf, total_len);
4108
		}
4109
4110
		ret = os_file_write(slot->name, slot->file, combined_buf,
4111
				    slot->offset, slot->offset_high,
4112
				    total_len);
4113
4114
		if (array == os_aio_write_array) {
4115
			os_file_check_page_trailers(combined_buf, total_len);
4116
		}
4117
	} else {
4118
		ret = os_file_read(slot->file, combined_buf,
4119
				   slot->offset, slot->offset_high, total_len);
4120
	}
4121
4122
	ut_a(ret);
4123
	srv_set_io_thread_op_info(global_segment, "file i/o done");
4124
4125
#if 0
4126
	fprintf(stderr,
4127
		"aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
4128
		n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
4129
#endif
4130
4131
	if (slot->type == OS_FILE_READ && n_consecutive > 1) {
4132
		/* Copy the combined buffer to individual buffers */
4133
		offs = 0;
4134
4135
		for (i = 0; i < n_consecutive; i++) {
4136
4137
			ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
4138
				  consecutive_ios[i]->len);
4139
			offs += consecutive_ios[i]->len;
4140
		}
4141
	}
4142
4143
	if (combined_buf2) {
4144
		ut_free(combined_buf2);
4145
	}
4146
4147
	os_mutex_enter(array->mutex);
4148
4149
	/* Mark the i/os done in slots */
4150
4151
	for (i = 0; i < n_consecutive; i++) {
4152
		consecutive_ios[i]->io_already_done = TRUE;
4153
	}
4154
4155
	/* We return the messages for the first slot now, and if there were
4156
	several slots, the messages will be returned with subsequent calls
4157
	of this function */
4158
4159
slot_io_done:
4160
4161
	ut_a(slot->reserved);
4162
4163
	*message1 = slot->message1;
4164
	*message2 = slot->message2;
4165
4166
	*type = slot->type;
4167
4168
	os_mutex_exit(array->mutex);
4169
4170
	os_aio_array_free_slot(array, slot);
4171
4172
	return(ret);
4173
4174
wait_for_io:
4175
	srv_set_io_thread_op_info(global_segment, "resetting wait event");
4176
4177
	/* We wait here until there again can be i/os in the segment
4178
	of this thread */
4179
4180
	os_event_reset(os_aio_segment_wait_events[global_segment]);
4181
4182
	os_mutex_exit(array->mutex);
4183
4184
recommended_sleep:
4185
	srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
4186
4187
	os_event_wait(os_aio_segment_wait_events[global_segment]);
4188
4189
	if (os_aio_print_debug) {
4190
		fprintf(stderr,
4191
			"InnoDB: i/o handler thread for i/o"
4192
			" segment %lu wakes up\n",
4193
			(ulong) global_segment);
4194
	}
4195
4196
	goto restart;
4197
}
4198
4199
/**************************************************************************
4200
Validates the consistency of an aio array. */
4201
static
4202
ibool
4203
os_aio_array_validate(
4204
/*==================*/
4205
				/* out: TRUE if ok */
4206
	os_aio_array_t*	array)	/* in: aio wait array */
4207
{
4208
	os_aio_slot_t*	slot;
4209
	ulint		n_reserved	= 0;
4210
	ulint		i;
4211
4212
	ut_a(array);
4213
4214
	os_mutex_enter(array->mutex);
4215
4216
	ut_a(array->n_slots > 0);
4217
	ut_a(array->n_segments > 0);
4218
4219
	for (i = 0; i < array->n_slots; i++) {
4220
		slot = os_aio_array_get_nth_slot(array, i);
4221
4222
		if (slot->reserved) {
4223
			n_reserved++;
4224
			ut_a(slot->len > 0);
4225
		}
4226
	}
4227
4228
	ut_a(array->n_reserved == n_reserved);
4229
4230
	os_mutex_exit(array->mutex);
4231
4232
	return(TRUE);
4233
}
4234
4235
/**************************************************************************
4236
Validates the consistency the aio system. */
4237
4238
ibool
4239
os_aio_validate(void)
4240
/*=================*/
4241
				/* out: TRUE if ok */
4242
{
4243
	os_aio_array_validate(os_aio_read_array);
4244
	os_aio_array_validate(os_aio_write_array);
4245
	os_aio_array_validate(os_aio_ibuf_array);
4246
	os_aio_array_validate(os_aio_log_array);
4247
	os_aio_array_validate(os_aio_sync_array);
4248
4249
	return(TRUE);
4250
}
4251
4252
/**************************************************************************
4253
Prints info of the aio arrays. */
4254
4255
void
4256
os_aio_print(
4257
/*=========*/
4258
	FILE*	file)	/* in: file where to print */
4259
{
4260
	os_aio_array_t*	array;
4261
	os_aio_slot_t*	slot;
4262
	ulint		n_reserved;
4263
	time_t		current_time;
4264
	double		time_elapsed;
4265
	double		avg_bytes_read;
4266
	ulint		i;
4267
4268
	for (i = 0; i < srv_n_file_io_threads; i++) {
4269
		fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
4270
			srv_io_thread_op_info[i],
4271
			srv_io_thread_function[i]);
4272
4273
#ifndef __WIN__
4274
		if (os_aio_segment_wait_events[i]->is_set) {
4275
			fprintf(file, " ev set");
4276
		}
4277
#endif
4278
4279
		fprintf(file, "\n");
4280
	}
4281
4282
	fputs("Pending normal aio reads:", file);
4283
4284
	array = os_aio_read_array;
4285
loop:
4286
	ut_a(array);
4287
4288
	os_mutex_enter(array->mutex);
4289
4290
	ut_a(array->n_slots > 0);
4291
	ut_a(array->n_segments > 0);
4292
4293
	n_reserved = 0;
4294
4295
	for (i = 0; i < array->n_slots; i++) {
4296
		slot = os_aio_array_get_nth_slot(array, i);
4297
4298
		if (slot->reserved) {
4299
			n_reserved++;
4300
#if 0
4301
			fprintf(stderr, "Reserved slot, messages %p %p\n",
4302
				(void*) slot->message1,
4303
				(void*) slot->message2);
4304
#endif
4305
			ut_a(slot->len > 0);
4306
		}
4307
	}
4308
4309
	ut_a(array->n_reserved == n_reserved);
4310
4311
	fprintf(file, " %lu", (ulong) n_reserved);
4312
4313
	os_mutex_exit(array->mutex);
4314
4315
	if (array == os_aio_read_array) {
4316
		fputs(", aio writes:", file);
4317
4318
		array = os_aio_write_array;
4319
4320
		goto loop;
4321
	}
4322
4323
	if (array == os_aio_write_array) {
4324
		fputs(",\n ibuf aio reads:", file);
4325
		array = os_aio_ibuf_array;
4326
4327
		goto loop;
4328
	}
4329
4330
	if (array == os_aio_ibuf_array) {
4331
		fputs(", log i/o's:", file);
4332
		array = os_aio_log_array;
4333
4334
		goto loop;
4335
	}
4336
4337
	if (array == os_aio_log_array) {
4338
		fputs(", sync i/o's:", file);
4339
		array = os_aio_sync_array;
4340
4341
		goto loop;
4342
	}
4343
4344
	putc('\n', file);
4345
	current_time = time(NULL);
4346
	time_elapsed = 0.001 + difftime(current_time, os_last_printout);
4347
4348
	fprintf(file,
4349
		"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
4350
		"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
4351
		(ulong) fil_n_pending_log_flushes,
4352
		(ulong) fil_n_pending_tablespace_flushes,
4353
		(ulong) os_n_file_reads, (ulong) os_n_file_writes,
4354
		(ulong) os_n_fsyncs);
4355
4356
	if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
4357
		fprintf(file,
4358
			"%lu pending preads, %lu pending pwrites\n",
4359
			(ulong) os_file_n_pending_preads,
4360
			(ulong) os_file_n_pending_pwrites);
4361
	}
4362
4363
	if (os_n_file_reads == os_n_file_reads_old) {
4364
		avg_bytes_read = 0.0;
4365
	} else {
4366
		avg_bytes_read = (double) os_bytes_read_since_printout
4367
			/ (os_n_file_reads - os_n_file_reads_old);
4368
	}
4369
4370
	fprintf(file,
4371
		"%.2f reads/s, %lu avg bytes/read,"
4372
		" %.2f writes/s, %.2f fsyncs/s\n",
4373
		(os_n_file_reads - os_n_file_reads_old)
4374
		/ time_elapsed,
4375
		(ulong)avg_bytes_read,
4376
		(os_n_file_writes - os_n_file_writes_old)
4377
		/ time_elapsed,
4378
		(os_n_fsyncs - os_n_fsyncs_old)
4379
		/ time_elapsed);
4380
4381
	os_n_file_reads_old = os_n_file_reads;
4382
	os_n_file_writes_old = os_n_file_writes;
4383
	os_n_fsyncs_old = os_n_fsyncs;
4384
	os_bytes_read_since_printout = 0;
4385
4386
	os_last_printout = current_time;
4387
}
4388
4389
/**************************************************************************
4390
Refreshes the statistics used to print per-second averages. */
4391
4392
void
4393
os_aio_refresh_stats(void)
4394
/*======================*/
4395
{
4396
	os_n_file_reads_old = os_n_file_reads;
4397
	os_n_file_writes_old = os_n_file_writes;
4398
	os_n_fsyncs_old = os_n_fsyncs;
4399
	os_bytes_read_since_printout = 0;
4400
4401
	os_last_printout = time(NULL);
4402
}
4403
4404
#ifdef UNIV_DEBUG
/**************************************************************************
Checks that all slots in the system have been freed, that is, there are
no pending io operations. The reserved counts of the five aio arrays are
summed, each read under the mutex of its array. */

ibool
os_aio_all_slots_free(void)
/*=======================*/
				/* out: TRUE if all free */
{
	os_aio_array_t*	arrays[5];
	ulint		n_res	= 0;
	ulint		k;

	arrays[0] = os_aio_read_array;
	arrays[1] = os_aio_write_array;
	arrays[2] = os_aio_ibuf_array;
	arrays[3] = os_aio_log_array;
	arrays[4] = os_aio_sync_array;

	for (k = 0; k < sizeof(arrays) / sizeof(arrays[0]); k++) {
		os_mutex_enter(arrays[k]->mutex);

		n_res += arrays[k]->n_reserved;

		os_mutex_exit(arrays[k]->mutex);
	}

	return(n_res == 0 ? TRUE : FALSE);
}
#endif /* UNIV_DEBUG */