   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */

#ifdef USE_PRAGMA_IMPLEMENTATION
#pragma implementation        // gcc: Class implementation
#endif

#include <drizzled/server_includes.h>
#include <drizzled/field.h>
#include <drizzled/field/blob.h>
#include <drizzled/field/timestamp.h>
#include <storage/myisam/myisam.h>
#include <drizzled/table.h>
#include <drizzled/session.h>

#include <storage/archive/ha_archive.h>
/*
  First, if you want to understand storage engines you should look at
  ha_example.cc and ha_example.h.

  This example was written as a test case for a customer who needed
  a storage engine without indexes that could compress data very well.
  So, welcome to a completely compressed storage engine. This storage
  engine only does inserts. No replace, deletes, or updates. All reads are
  complete table scans. Compression is done through a combination of packing
  and making use of the zlib library.

  We keep a file pointer open for each instance of ha_archive for each read
  but for writes we keep one open file handle just for that. We flush it
  only if we have a read occur. azio handles compressing lots of records
  at once much better than doing lots of little records between writes
  [...] the same time since we would want to flush).
  A "meta" file is kept alongside the data file. This file serves two purposes.
  The first purpose is to track the number of rows in the table. The second
  purpose is to determine if the table was closed properly or not. When the
  meta file is first opened it is marked as dirty. It is opened when the table
  itself is opened for writing. When the table is closed the new count for rows
  is written to the meta file and the file is marked as clean. If the meta file
  is opened and it is marked as dirty, it is assumed that a crash occurred. At
  this point an error occurs and the user is told to rebuild the file.
  A rebuild scans the rows and rewrites the meta file. If corruption is found
  in the data file then the meta file is not repaired.

  At some point a recovery method for such a drastic case needs to be devised.
  Locks are row level, and you will get a consistent read.

  For performance as far as table scans go it is quite fast. I don't have
  good numbers but locally it has outperformed both InnoDB and MyISAM. For
  InnoDB the question will be if the table can fit into the buffer
  pool. For MyISAM it's a question of how much the file system caches the
  MyISAM file. With enough free memory MyISAM is faster. It's only when the OS
  doesn't have enough memory to cache the entire table that Archive turns out
  to be any faster.

  Examples between MyISAM (packed) and Archive.
*/
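/*
  A minimal sketch of the dirty-flag protocol described above, kept out of
  the build with #if 0. The MetaFile class and its member names are
  hypothetical illustrations; the engine itself keeps this state in the
  azio stream header, not in a class like this.
*/
#if 0
class MetaFile
{
  bool dirty;              // set the moment a writer opens the table
  uint64_t row_count;      // rows recorded at the last clean close

public:
  void open_for_write() { dirty= true; }   // mark dirty up front
  void close_clean(uint64_t rows)          // store count, mark clean
  {
    row_count= rows;
    dirty= false;
  }
  bool crashed_on_open() const { return dirty; } // dirty at open means crash
};
#endif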
/* The file extension */
#define ARZ ".ARZ"               // The data file
#define ARN ".ARN"               // Files used during an optimize call
#define ARM ".ARM"               // Meta file (deprecated)

/*
  unsigned char + unsigned char
*/
#define DATA_BUFFER_SIZE 2       // Size of the data used in the data file
#define ARCHIVE_CHECK_HEADER 254 // The number we use to determine corruption
/* Static declarations for handlerton */
static handler *archive_create_handler(handlerton *hton,
                                       TABLE_SHARE *table,
                                       MEM_ROOT *mem_root);
int archive_discover(handlerton *hton, THD* thd, const char *db,
                     const char *name,
                     unsigned char **frmblob,
                     size_t *frmlen);

static bool archive_use_aio= false;
  archive_reader_open= false;
int archive_discover(handlerton *hton __attribute__((unused)),
                     THD* thd __attribute__((unused)),
                     const char *db,
                     const char *name,
                     unsigned char **frmblob,
                     size_t *frmlen)
{
  azio_stream frm_stream;
  char az_file[FN_REFLEN];
  char *frm_ptr;
  struct stat file_stat;

  fn_format(az_file, name, db, ARZ, MY_REPLACE_EXT | MY_UNPACK_FILENAME);

  if (stat(az_file, &file_stat))
    goto err;

  if (!(azopen(&frm_stream, az_file, O_RDONLY|O_BINARY, AZ_METHOD_BLOCK)))
  {
    if (errno == EROFS || errno == EACCES)
      return(my_errno= errno);
    return(HA_ERR_CRASHED_ON_USAGE);
  }

  if (frm_stream.frm_length == 0)
    goto err;

  frm_ptr= (char *)my_malloc(sizeof(char) * frm_stream.frm_length, MYF(0));
  azread_frm(&frm_stream, frm_ptr);
  azclose(&frm_stream);

  *frmlen= frm_stream.frm_length;
  *frmblob= (unsigned char*) frm_ptr;

  return(0);
err:
  return(1);
}
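/*
  A hedged usage sketch for archive_discover(): how a caller could pull the
  stored table definition back out. The database and table names are made
  up, and NULL stands in for the unused handlerton/THD arguments;
  illustrative only.
*/
#if 0
static void discover_example()
{
  unsigned char *frmblob;
  size_t frmlen;

  if (archive_discover(NULL, NULL, "test", "t1", &frmblob, &frmlen) == 0)
  {
    /* frmblob now holds frmlen bytes of the embedded .frm image */
    free(frmblob);
  }
}
#endif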
/*
  This method reads the header of a datafile and returns whether or not it
  was successful.
*/
    share->archive_write_open= false;
    fn_format(share->data_file_name, table_name, "",
              ARZ, MY_REPLACE_EXT | MY_UNPACK_FILENAME);
    strcpy(share->table_name, table_name);
    /*
      We will use this lock for rows.
    */
    pthread_mutex_init(&share->mutex, MY_MUTEX_INIT_FAST);

    /*
      We read the meta file, but do not mark it dirty. Since we are not
      doing a write we won't mark it dirty (and we won't open it for
      anything but reading... open it for write and we will generate null
      compression writes).
    */
    if (!(azopen(&archive_tmp, share->data_file_name, O_RDONLY,
                 AZ_METHOD_BLOCK)))
    {
      pthread_mutex_destroy(&share->mutex);
int ha_archive::init_archive_writer()
{
  /*
    It is expensive to open and close the data files and since you can't have
    a gzip file that can be both read and written we keep a writer open
    that is shared among all open tables.
  */
  if (!(azopen(&(share->archive_write), share->data_file_name,
               O_RDWR, AZ_METHOD_BLOCK)))
  {
    share->crashed= true;
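/*
  A minimal sketch of the shared-writer discipline the comment above
  describes: every ha_archive instance appends through the one write handle
  in the share, and marks the share dirty so the next reader knows to flush
  first. The wrapper name is hypothetical; the real logic lives in
  init_archive_writer() and the row-write path.
*/
#if 0
static void write_row_shared(ARCHIVE_SHARE *share,
                             unsigned char *buf, unsigned int length)
{
  pthread_mutex_lock(&share->mutex);           // one writer at a time
  azwrite(&share->archive_write, buf, length); // append via shared handle
  share->dirty= true;                          // readers must flush first
  pthread_mutex_unlock(&share->mutex);
}
#endif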
/*
  We create our data file here. The format is pretty simple.
  You can read about the format of the data file above.
  Unlike other storage engines we do not "pack" our data. Since we
  are about to do a general compression, packing would just be a waste of
  CPU time. If the table has blobs they are written after the row in the
  order of creation.
*/
  /*
    There is a chance that the file was "discovered". In this case
    just use whatever file is there.
  */
  r= stat(name_buff, &file_stat);
  if (r == -1 && errno != ENOENT)
    return errno;
  if (r == 0)
    return HA_ERR_TABLE_EXIST;

  if (!(azopen(&create_stream, name_buff, O_CREAT|O_RDWR,
               AZ_METHOD_BLOCK)))
    return errno;

  my_symlink(name_buff, linkname, MYF(0));
  fn_format(name_buff, name, "", ".frm",
            MY_REPLACE_EXT | MY_UNPACK_FILENAME);

  /*
    Here is where we open up the frm and pass it to archive to store.
  */
  if ((frm_file= fopen(name_buff, "r")) != NULL)
  {
    if (fstat(fileno(frm_file), &file_stat))
    {
      if ((uint64_t)file_stat.st_size > SIZE_MAX)
        return HA_ERR_OUT_OF_MEM;

      frm_ptr= (unsigned char *)malloc((size_t)file_stat.st_size);
      length_io= read(fileno(frm_file), frm_ptr, (size_t)file_stat.st_size);
      if (length_io != (size_t)file_stat.st_size)
        return errno;

      length_io= azwrite_frm(&create_stream, (char *)frm_ptr,
                             (size_t)file_stat.st_size);
      if (length_io != (size_t)file_stat.st_size)
        return errno;

      free(frm_ptr);
    }
    fclose(frm_file);
  }

  if (create_info->comment.str)
  {
    write_length= azwrite_comment(&create_stream, create_info->comment.str,
                                  (unsigned int)create_info->comment.length);
    if (write_length != (size_t)create_info->comment.length)
      return errno;
  }

  /*
    Yes you need to do this, because the starting value
    for the autoincrement may not be zero.
  */
  create_stream.auto_increment= stats.auto_increment_value ?
                                  stats.auto_increment_value - 1 : 0;

  if (azclose(&create_stream))
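/*
  Sketch of the .frm round trip that create() sets up: the table definition
  written here with azwrite_frm() is what archive_discover() later reads
  back with azread_frm(). File name and error handling are simplified and
  hypothetical; illustrative only.
*/
#if 0
static int frm_round_trip(const char *az_file, char *frm, unsigned int length)
{
  azio_stream s;

  if (!(azopen(&s, az_file, O_CREAT|O_RDWR, AZ_METHOD_BLOCK)))
    return -1;
  azwrite_frm(&s, frm, length);   // store the definition in the header
  azclose(&s);

  if (!(azopen(&s, az_file, O_RDONLY, AZ_METHOD_BLOCK)))
    return -1;
  azread_frm(&s, frm);            // reads back s.frm_length bytes
  azclose(&s);
  return 0;
}
#endif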
    /*
      Bad news, this will cause a search for the unique value which is very
      expensive since we will have to do a table scan which will lock up
      all other writers during this period. This could perhaps be optimized
      in the future.
    */
  /*
    First we create a buffer that we can use for reading rows, and can pass
    to get_row().
  */
  if (!(read_buf= (unsigned char*) malloc(table->s->reclength)))
  {
    rc= HA_ERR_OUT_OF_MEM;
    goto error;
  }

  /*
    All of the buffer must be written out or we won't see all of the
    data.
  */
  azflush(&(share->archive_write), Z_SYNC_FLUSH);
void ha_archive::get_auto_increment(uint64_t, uint64_t, uint64_t,
                                    uint64_t *first_value,
                                    uint64_t *nb_reserved_values)
{
  *nb_reserved_values= UINT64_MAX;
  *first_value= share->archive_write.auto_increment + 1;
}
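/*
  A small worked example of the auto-increment bookkeeping, tying this
  function to the "- 1" stored by create() above. The function and variable
  names are hypothetical; only the arithmetic is the point.
*/
#if 0
#include <cassert>
static void auto_increment_example()
{
  uint64_t requested_start= 42;   // CREATE TABLE ... AUTO_INCREMENT=42
  uint64_t stored= requested_start ? requested_start - 1 : 0; // create()
  uint64_t first_value= stored + 1;      // get_auto_increment() above
  assert(first_value == requested_start);
}
#endif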
/* Initialized at each key walk (called multiple times unlike rnd_init()) */
int ha_archive::index_init(uint32_t keynr, bool)
{
  active_index= keynr;
/*
  This method repairs the meta file. It does this by walking the datafile and
  rewriting the meta file. Currently it does this by calling optimize with
  the extended flag.
*/
int ha_archive::repair(Session* session, HA_CHECK_OPT* check_opt)
{
  check_opt->flags= T_EXTEND;
  int rc= optimize(session, check_opt);

  if (rc)
    return(HA_ERR_CRASHED_ON_REPAIR);
  /* Let's create a file to contain the new data */
  fn_format(writer_filename, share->table_name, "", ARN,
            MY_REPLACE_EXT | MY_UNPACK_FILENAME);

  if (!(azopen(&writer, writer_filename, O_CREAT|O_RDWR, AZ_METHOD_BLOCK)))
    return(HA_ERR_CRASHED_ON_USAGE);
  /*
    An extended rebuild is a lot more effort. We open up each row and
    re-record it. Any dead rows are removed (aka rows that may have been
    partially recorded).

    As of Archive format 3, this is the only type that is performed; before
    this version it was just done on T_EXTEND.
  */
  delayed_insert= false;

  if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK)
  {
    /*
      Here is where we get into the guts of a row level lock.
      If we are not doing a LOCK TABLE or DISCARD/IMPORT
      TABLESPACE, then allow multiple writers
    */

    if ((lock_type >= TL_WRITE_CONCURRENT_INSERT &&
         lock_type <= TL_WRITE) && !session_in_lock_tables(session)
        && !session_tablespace_op(session))
      lock_type = TL_WRITE_ALLOW_WRITE;
    /*
      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
      to t2. Convert the lock to a normal read lock to allow
      concurrent inserts to t2.
    */

    if (lock_type == TL_READ_NO_INSERT && !session_in_lock_tables(session))
      lock_type = TL_READ;

    lock.type= lock_type;
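/*
  Condensed view of the two conversions performed above, as a stand-alone
  helper. This mirrors the logic already shown rather than adding to it;
  the helper name is hypothetical.
*/
#if 0
static enum thr_lock_type archive_convert_lock(enum thr_lock_type lock_type,
                                               bool in_lock_tables,
                                               bool tablespace_op)
{
  if (lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE &&
      !in_lock_tables && !tablespace_op)
    return TL_WRITE_ALLOW_WRITE;  // allow multiple writers

  if (lock_type == TL_READ_NO_INSERT && !in_lock_tables)
    return TL_READ;               // keep INSERT ... SELECT from blocking

  return lock_type;
}
#endif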
/*
  We just return state if asked.
*/
bool ha_archive::is_crashed() const
{
  return(share->crashed);
}
/*
  Simple scan of the tables to make sure everything is ok.
*/
int ha_archive::check(Session* session, HA_CHECK_OPT *)
{
  const char *old_proc_info;

  old_proc_info= get_session_proc_info(session);
  set_session_proc_info(session, "Checking table");

  /* Flush any waiting data */
  pthread_mutex_lock(&share->mutex);
  azflush(&(share->archive_write), Z_SYNC_FLUSH);
  pthread_mutex_unlock(&share->mutex);
  /*
    Now we will rewind the archive file so that we are positioned at the
    start of the file.
  */
  init_archive_reader();
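/*
  The rest of check() is elided in this excerpt. A sketch of the row-count
  verification it presumably performs, assuming get_row() returns 0 per row
  and HA_ERR_END_OF_FILE at the end of the scan; not the verbatim code.
*/
#if 0
  uint64_t count= share->rows_recorded;
  int rc;

  while (!(rc= get_row(&archive, table->record[0])))
    count--;

  if (rc != HA_ERR_END_OF_FILE || count != 0)
  {
    share->crashed= true;          // scan disagrees with recorded count
    return HA_ADMIN_CORRUPT;
  }
  return HA_ADMIN_OK;
#endif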
/*
  Check and repair the table if needed.
*/
bool ha_archive::check_and_repair(Session *session)
{
  HA_CHECK_OPT check_opt;

  check_opt.init();

  return(repair(session, &check_opt));
}
archive_record_buffer *ha_archive::create_record_buffer(unsigned int length)
{
  archive_record_buffer *r;

  if (!(r= (archive_record_buffer*) malloc(sizeof(archive_record_buffer))))
  {
    return(NULL); /* purecov: inspected */
  }
  r->length= (int)length;

  if (!(r->buffer= (unsigned char*) malloc(r->length)))
  {
    free((char*) r);
    return(NULL); /* purecov: inspected */
  }
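/*
  For symmetry, the matching cleanup presumably releases the row buffer and
  then the descriptor. A minimal sketch; the engine's own
  destroy_record_buffer() is not shown in this excerpt.
*/
#if 0
void destroy_record_buffer_sketch(archive_record_buffer *r)
{
  free((char*) r->buffer);  // release the row buffer first
  free((char*) r);          // then the descriptor itself
}
#endif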