1
/* Copyright (C) 2000-2006 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16
/* Pack MyISAM file */
19
#define USE_MY_FUNC /* We need at least my_malloc */
22
#include "myisamdef.h"
25
#include "mysys_err.h"
26
#ifndef __GNU_LIBRARY__
27
#define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
29
#include <my_getopt.h>
32
#if SIZEOF_LONG_LONG > 4
38
#define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
39
#define HEAD_LENGTH 32
40
#define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
42
#define DATA_TMP_EXT ".TMD"
43
#define OLD_EXT ".OLD"
44
#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
46
struct st_file_buffer {
48
uchar *buffer,*pos,*end;
55
struct st_huff_element;
57
typedef struct st_huff_counts {
58
uint field_length,max_zero_fill;
60
uint max_end_space,max_pre_space,length_bits,min_space;
62
enum en_fieldtype field_type;
63
struct st_huff_tree *tree; /* Tree for field */
65
my_off_t end_space[8];
66
my_off_t pre_space[8];
67
my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
68
TREE int_tree; /* Tree for detecting distinct column values. */
69
uchar *tree_buff; /* Column values, 'field_length' each. */
70
uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */
73
typedef struct st_huff_element HUFF_ELEMENT;
76
WARNING: It is crucial for the optimizations in calc_packed_length()
77
that 'count' is the first element of 'HUFF_ELEMENT'.
79
struct st_huff_element {
83
HUFF_ELEMENT *left,*right;
87
uint element_nr; /* Number of element */
93
typedef struct st_huff_tree {
94
HUFF_ELEMENT *root,*element_buffer;
98
my_off_t bytes_packed;
99
uint tree_pack_length;
100
uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
106
typedef struct st_isam_mrg {
107
MI_INFO **file,**current,**end;
110
uint min_pack_length; /* Theese is used by packed data */
111
uint max_pack_length;
113
uint max_blob_length;
115
/* true if at least one source file has at least one disabled index */
116
my_bool src_file_has_indexes_disabled;
120
extern int main(int argc,char * *argv);
121
static void get_options(int *argc,char ***argv);
122
static MI_INFO *open_isam_file(char *name,int mode);
123
static my_bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
124
static int compress(PACK_MRG_INFO *file,char *join_name);
125
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records);
126
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
128
HUFF_COUNTS *huff_counts,
130
static int compare_tree(void* cmp_arg __attribute__((unused)),
131
const uchar *s,const uchar *t);
132
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
133
static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
135
static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
136
uint max_space_length,my_off_t *space_counts,
137
my_off_t tot_space_count,
138
enum en_fieldtype field_type);
139
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
140
static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
141
static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
142
static int save_counts_in_queue(uchar *key,element_count count,
144
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
145
static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
146
static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
147
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
148
HUFF_ELEMENT *element,uint size,
150
static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
151
my_off_t tot_elements,my_off_t filelength);
152
static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
153
static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
154
static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
155
HUFF_ELEMENT *element,
157
static uint max_bit(uint value);
158
static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
159
static char *make_new_name(char *new_name,char *old_name);
160
static char *make_old_name(char *new_name,char *old_name);
161
static void init_file_buffer(File file,bool read_buffer);
162
static int flush_buffer(ulong neaded_length);
163
static void end_file_buffer(void);
164
static void write_bits(uint64_t value, uint bits);
165
static void flush_bits(void);
166
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
168
static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
170
static int mrg_close(PACK_MRG_INFO *mrg);
171
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
172
static void mrg_reset(PACK_MRG_INFO *mrg);
174
static int error_on_write=0,test_only=0,verbose=0,silent=0,
175
write_loop=0,force_pack=0, isamchk_neaded=0;
176
static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
177
static my_bool backup, opt_wait;
179
tree_buff_length is somewhat arbitrary. The bigger it is the better
180
the chance to win in terms of compression factor. On the other hand,
181
this table becomes part of the compressed file header. And its length
182
is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
184
static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
185
static char tmp_dir[FN_REFLEN]={0},*join_table;
186
static my_off_t intervall_length;
187
static ha_checksum glob_crc;
188
static struct st_file_buffer file_buffer;
190
static HUFF_COUNTS *global_count;
191
static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
192
static const char *load_default_groups[]= { "myisampack",0 };
194
/* The main program */
196
int main(int argc, char **argv)
203
load_defaults("my",load_default_groups,&argc,&argv);
205
get_options(&argc,&argv);
207
error=ok=isamchk_neaded=0;
209
{ /* Join files into one */
210
if (open_isam_files(&merge,argv,(uint) argc) ||
211
compress(&merge,join_table))
217
if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
221
merge.file= &isam_file;
225
if (compress(&merge,0))
231
if (ok && isamchk_neaded && !silent)
232
puts("Remember to run myisamchk -rq on compressed tables");
233
VOID(fflush(stdout));
234
VOID(fflush(stderr));
235
free_defaults(default_argv);
236
my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
239
return 0; /* No compiler warning */
243
enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
245
static struct my_option my_long_options[] =
247
{"backup", 'b', "Make a backup of the table as table_name.OLD.",
248
(char**) &backup, (char**) &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
249
{"character-sets-dir", OPT_CHARSETS_DIR_MP,
250
"Directory where character sets are.", (char**) &charsets_dir,
251
(char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
252
{"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
253
0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
255
"Force packing of table even if it gets bigger or if tempfile exists.",
256
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
258
"Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
259
(char**) &join_table, (char**) &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
261
{"help", '?', "Display this help and exit.",
262
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
263
{"silent", 's', "Be more silent.",
264
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
265
{"tmpdir", 'T', "Use temporary directory to store temporary table.",
266
0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
267
{"test", 't', "Don't pack table, only test packing it.",
268
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
269
{"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
270
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
271
{"version", 'V', "Output version information and exit.",
272
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
273
{"wait", 'w', "Wait and retry if table is in use.", (char**) &opt_wait,
274
(char**) &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
275
{ 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
278
static void print_version(void)
280
VOID(printf("%s Ver 1.23 for %s on %s\n",
281
my_progname, SYSTEM_TYPE, MACHINE_TYPE));
285
static void usage(void)
288
puts("Copyright (C) 2002 MySQL AB");
289
puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
290
puts("and you are welcome to modify and redistribute it under the GPL license\n");
292
puts("Pack a MyISAM-table to take much less space.");
293
puts("Keys are not updated, you must run myisamchk -rq on the datafile");
294
puts("afterwards to update the keys.");
295
puts("You should give the .MYI file as the filename argument.");
297
VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
298
my_print_help(my_long_options);
299
print_defaults("my", load_default_groups);
300
my_print_variables(my_long_options);
304
get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
312
setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
317
tmpfile_createflag= O_RDWR | O_TRUNC;
320
write_loop= verbose= 0;
325
/* Avoid to reset 'verbose' if it was already set > 1. */
330
length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
331
if (length != dirname_length(tmp_dir))
333
tmp_dir[length]=FN_LIBCHAR;
338
verbose++; /* Allow for selecting the level of verbosity. */
353
/* Initiates DEBUG - but no debugging here ! */
355
static void get_options(int *argc,char ***argv)
359
my_progname= argv[0][0];
360
if (isatty(fileno(stdout)))
363
if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
373
backup=0; /* Not needed */
380
static MI_INFO *open_isam_file(char *name,int mode)
385
if (!(isam_file=mi_open(name,mode,
386
(opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
387
HA_OPEN_ABORT_IF_LOCKED))))
389
VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
393
if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
397
VOID(fprintf(stderr, "%s is already compressed\n", name));
398
VOID(mi_close(isam_file));
402
puts("Recompressing already compressed table");
403
share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
405
if (! force_pack && share->state.state.records != 0 &&
406
(share->state.state.records <= 1 ||
407
share->state.state.data_file_length < 1024))
409
VOID(fprintf(stderr, "%s is too small to compress\n", name));
410
VOID(mi_close(isam_file));
413
VOID(mi_lock_database(isam_file,F_WRLCK));
418
static my_bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
423
mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
425
mrg->src_file_has_indexes_disabled= 0;
426
for (i=0; i < count ; i++)
428
if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
431
mrg->src_file_has_indexes_disabled|=
432
! mi_is_all_keys_active(mrg->file[i]->s->state.key_map,
433
mrg->file[i]->s->base.keys);
435
/* Check that files are identical */
436
for (j=0 ; j < count-1 ; j++)
438
MI_COLUMNDEF *m1,*m2,*end;
439
if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
440
mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
442
m1=mrg->file[j]->s->rec;
443
end=m1+mrg->file[j]->s->base.fields;
444
m2=mrg->file[j+1]->s->rec;
445
for ( ; m1 != end ; m1++,m2++)
447
if (m1->type != m2->type || m1->length != m2->length)
455
VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
456
my_progname, names[j], names[j+1]));
459
mi_close(mrg->file[i]);
460
my_free((uchar*) mrg->file,MYF(0));
465
static int compress(PACK_MRG_INFO *mrg,char *result_table)
468
File new_file,join_isam_file;
471
char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
472
uint i,header_length,fields,trees,used_trees;
473
my_off_t old_length,new_length,tot_elements;
474
HUFF_COUNTS *huff_counts;
475
HUFF_TREE *huff_trees;
477
isam_file=mrg->file[0]; /* Take this as an example */
479
new_file=join_isam_file= -1;
484
/* Create temporary or join file */
487
VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2));
489
VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2+4+16));
490
if (!test_only && result_table)
492
/* Make a new indexfile based on first file in list */
495
strmov(org_name,result_table); /* Fix error messages */
496
VOID(fn_format(new_name,result_table,"",MI_NAME_IEXT,2));
497
if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
500
length=(uint) share->base.keystart;
501
if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
503
if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
504
my_write(join_isam_file,buff,length,
505
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
507
my_free(buff,MYF(0));
510
my_free(buff,MYF(0));
511
VOID(fn_format(new_name,result_table,"",MI_NAME_DEXT,2));
513
else if (!tmp_dir[0])
514
VOID(make_new_name(new_name,org_name));
516
VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
518
(new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
521
/* Start calculating statistics */
524
for (i=0 ; i < mrg->count ; i++)
525
mrg->records+=mrg->file[i]->s->state.state.records;
527
if (write_loop || verbose)
529
VOID(printf("Compressing %s: (%lu records)\n",
530
result_table ? new_name : org_name, (ulong) mrg->records));
532
trees=fields=share->base.fields;
533
huff_counts=init_huff_count(isam_file,mrg->records);
537
Read the whole data file(s) for statistics.
539
if (write_loop || verbose)
540
VOID(printf("- Calculating statistics\n"));
541
if (get_statistic(mrg,huff_counts))
545
for (i=0; i < mrg->count ; i++)
546
old_length+= (mrg->file[i]->s->state.state.data_file_length -
547
mrg->file[i]->s->state.state.empty);
550
Create a global priority queue in preparation for making
551
temporary Huffman trees.
553
if (init_queue(&queue,256,0,0,compare_huff_elements,0))
557
Check each column if we should use pre-space-compress, end-space-
558
compress, empty-field-compress or zero-field-compress.
560
check_counts(huff_counts,fields,mrg->records);
563
Build a Huffman tree for each column.
565
huff_trees=make_huff_trees(huff_counts,trees);
568
If the packed lengths of combined columns is less then the sum of
569
the non-combined columns, then create common Huffman trees for them.
570
We do this only for byte compressed columns, not for distinct values
573
if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
577
Assign codes to all byte or column values.
579
if (make_huff_decode_table(huff_trees,fields))
582
/* Prepare a file buffer. */
583
init_file_buffer(new_file,0);
586
Reserve space in the target file for the fixed compressed file header.
588
file_buffer.pos_in_file=HEAD_LENGTH;
590
VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
593
Write field infos: field type, pack type, length bits, tree number.
595
write_field_info(huff_counts,fields,used_trees);
600
if (!(tot_elements=write_huff_tree(huff_trees,trees)))
604
Calculate the total length of the compression info header.
605
This includes the fixed compressed file header, the column compression
606
type descriptions, and the decode trees.
608
header_length=(uint) file_buffer.pos_in_file+
609
(uint) (file_buffer.pos-file_buffer.buffer);
612
Compress the source file into the target file.
614
if (write_loop || verbose)
615
VOID(printf("- Compressing file\n"));
616
error=compress_isam_file(mrg,huff_counts);
617
new_length=file_buffer.pos_in_file;
618
if (!error && !test_only)
620
uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
621
bzero(buff,sizeof(buff));
622
error=my_write(file_buffer.file,buff,sizeof(buff),
623
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
627
Write the fixed compressed file header.
630
error=write_header(mrg,header_length,used_trees,tot_elements,
633
/* Flush the file buffer. */
636
/* Display statistics. */
637
if (verbose && mrg->records)
638
VOID(printf("Min record length: %6d Max length: %6d "
639
"Mean total length: %6ld\n", mrg->min_pack_length,
640
mrg->max_pack_length, (ulong) (new_length/mrg->records)));
642
/* Close source and target file. */
645
error|=my_close(new_file,MYF(MY_WME));
648
error|=my_close(isam_file->dfile,MYF(MY_WME));
649
isam_file->dfile= -1; /* Tell mi_close file is closed */
654
free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
655
if (! test_only && ! error)
659
error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
665
if (my_rename(org_name,make_old_name(temp_name,isam_file->filename),
671
error=my_copy(new_name,org_name,MYF(MY_WME));
673
error=my_rename(new_name,org_name,MYF(MY_WME));
676
VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
678
VOID(my_delete(new_name,MYF(MY_WME)));
686
error=my_copy(new_name,org_name,
687
MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
689
VOID(my_delete(new_name,MYF(MY_WME)));
692
error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
695
error=save_state(isam_file,mrg,new_length,glob_crc);
698
error|=mrg_close(mrg);
699
if (join_isam_file >= 0)
700
error|=my_close(join_isam_file,MYF(MY_WME));
703
VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
704
VOID(my_delete(new_name,MYF(MY_WME)));
707
if (write_loop || verbose)
710
VOID(printf("%.4g%% \n",
711
(((int64_t) (old_length - new_length)) * 100.0 /
712
(int64_t) old_length)));
714
puts("Empty file saved in compressed format");
719
free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
721
VOID(my_close(new_file,MYF(0)));
722
if (join_isam_file >= 0)
723
VOID(my_close(join_isam_file,MYF(0)));
725
VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
729
/* Init a huff_count-struct for each field and init it */
731
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
734
register HUFF_COUNTS *count;
735
if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
737
MYF(MY_ZEROFILL | MY_WME))))
739
for (i=0 ; i < info->s->base.fields ; i++)
741
enum en_fieldtype type;
742
count[i].field_length=info->s->rec[i].length;
743
type= count[i].field_type= (enum en_fieldtype) info->s->rec[i].type;
744
if (type == FIELD_INTERVALL ||
745
type == FIELD_CONSTANT ||
748
if (count[i].field_length <= 8 &&
749
(type == FIELD_NORMAL ||
750
type == FIELD_SKIP_ZERO))
751
count[i].max_zero_fill= count[i].field_length;
753
For every column initialize a tree, which is used to detect distinct
754
column values. 'int_tree' works together with 'tree_buff' and
755
'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
756
This is accomplished by '-1' as the element size.
758
init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
760
if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
761
count[i].tree_pos=count[i].tree_buff =
762
my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
770
/* Free memory used by counts and trees */
772
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
773
HUFF_COUNTS *huff_counts,
780
for (i=0 ; i < trees ; i++)
782
if (huff_trees[i].element_buffer)
783
my_free((uchar*) huff_trees[i].element_buffer,MYF(0));
784
if (huff_trees[i].code)
785
my_free((uchar*) huff_trees[i].code,MYF(0));
787
my_free((uchar*) huff_trees,MYF(0));
791
for (i=0 ; i < fields ; i++)
793
if (huff_counts[i].tree_buff)
795
my_free((uchar*) huff_counts[i].tree_buff,MYF(0));
796
delete_tree(&huff_counts[i].int_tree);
799
my_free((uchar*) huff_counts,MYF(0));
801
delete_queue(&queue); /* This is safe to free */
805
/* Read through old file and gather some statistics */
807
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
811
ulong reclength,max_blob_length;
812
uchar *record,*pos,*next_pos,*end_pos,*start_pos;
813
ha_rows record_count;
814
my_bool static_row_size;
815
HUFF_COUNTS *count,*end_count;
816
TREE_ELEMENT *element;
818
reclength=mrg->file[0]->s->base.reclength;
819
record=(uchar*) my_alloca(reclength);
820
end_count=huff_counts+mrg->file[0]->s->base.fields;
821
record_count=0; glob_crc=0;
824
/* Check how to calculate checksum */
826
for (count=huff_counts ; count < end_count ; count++)
828
if (count->field_type == FIELD_BLOB ||
829
count->field_type == FIELD_VARCHAR)
837
while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
839
ulong tot_blob_length=0;
842
/* glob_crc is a checksum over all bytes of all records. */
844
glob_crc+=mi_static_checksum(mrg->file[0],record);
846
glob_crc+=mi_checksum(mrg->file[0],record);
848
/* Count the incidence of values separately for every column. */
849
for (pos=record,count=huff_counts ;
854
next_pos=end_pos=(start_pos=pos)+count->field_length;
857
Put the whole column value in a tree if there is room for it.
858
'int_tree' is used to quickly check for duplicate values.
859
'tree_buff' collects as many distinct column values as
860
possible. If the field length is > 1, it is tree_buff_length,
861
else 2 bytes. Each value is 'field_length' bytes big. If there
862
are more distinct column values than fit into the buffer, we
863
give up with this tree. BLOBs and VARCHARs do not have a
864
tree_buff as it can only be used with fixed length columns.
865
For the special case of field length == 1, we handle only the
866
case that there is only one distinct value in the table(s).
867
Otherwise, we can have a maximum of 256 distinct values. This
868
is then handled by the normal Huffman tree build.
870
Another limit for collecting distinct column values is the
871
number of values itself. Since we would need to build a
872
Huffman tree for the values, we are limited by the 'IS_OFFSET'
873
constant. This constant expresses a bit which is used to
874
determine if a tree element holds a final value or an offset
875
to a child element. Hence, all values and offsets need to be
876
smaller than 'IS_OFFSET'. A tree element is implemented with
877
two integer values, one for the left branch and one for the
878
right branch. For the extreme case that the first element
879
points to the last element, the number of integers in the tree
880
must be less or equal to IS_OFFSET. So the number of elements
881
must be less or equal to IS_OFFSET / 2.
883
WARNING: At first, we insert a pointer into the record buffer
884
as the key for the tree. If we got a new distinct value, which
885
is really inserted into the tree, instead of being counted
886
only, we will copy the column value from the record buffer to
887
'tree_buff' and adjust the key pointer of the tree accordingly.
889
if (count->tree_buff)
892
if (!(element=tree_insert(&count->int_tree,pos, 0,
893
count->int_tree.custom_arg)) ||
894
(element->count == 1 &&
895
(count->tree_buff + tree_buff_length <
896
count->tree_pos + count->field_length)) ||
897
(count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
898
(count->field_length == 1 &&
899
count->int_tree.elements_in_tree > 1))
901
delete_tree(&count->int_tree);
902
my_free(count->tree_buff,MYF(0));
908
If tree_insert() succeeds, it either creates a new element
909
or increments the counter of an existing element.
911
if (element->count == 1)
913
/* Copy the new column value into 'tree_buff'. */
914
memcpy(count->tree_pos,pos,(size_t) count->field_length);
915
/* Adjust the key pointer in the tree. */
916
tree_set_pointer(element,count->tree_pos);
917
/* Point behind the last column value so far. */
918
count->tree_pos+=count->field_length;
923
/* Save character counters and space-counts and zero-field-counts */
924
if (count->field_type == FIELD_NORMAL ||
925
count->field_type == FIELD_SKIP_ENDSPACE)
927
/* Ignore trailing space. */
928
for ( ; end_pos > pos ; end_pos--)
929
if (end_pos[-1] != ' ')
931
/* Empty fields are just counted. Go to the next record. */
934
count->empty_fields++;
935
count->max_zero_fill=0;
939
Count the total of all trailing spaces and the number of
940
short trailing spaces. Remember the longest trailing space.
942
length= (uint) (next_pos-end_pos);
943
count->tot_end_space+=length;
945
count->end_space[length]++;
946
if (count->max_end_space < length)
947
count->max_end_space = length;
950
if (count->field_type == FIELD_NORMAL ||
951
count->field_type == FIELD_SKIP_PRESPACE)
953
/* Ignore leading space. */
954
for (pos=start_pos; pos < end_pos ; pos++)
957
/* Empty fields are just counted. Go to the next record. */
960
count->empty_fields++;
961
count->max_zero_fill=0;
965
Count the total of all leading spaces and the number of
966
short leading spaces. Remember the longest leading space.
968
length= (uint) (pos-start_pos);
969
count->tot_pre_space+=length;
971
count->pre_space[length]++;
972
if (count->max_pre_space < length)
973
count->max_pre_space = length;
976
/* Calculate pos, end_pos, and max_length for variable length fields. */
977
if (count->field_type == FIELD_BLOB)
979
uint field_length=count->field_length -portable_sizeof_char_ptr;
980
ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
981
memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*));
982
end_pos=pos+blob_length;
983
tot_blob_length+=blob_length;
984
set_if_bigger(count->max_length,blob_length);
986
else if (count->field_type == FIELD_VARCHAR)
988
uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
989
length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
990
uint2korr(start_pos));
991
pos= start_pos+pack_length;
993
set_if_bigger(count->max_length,length);
996
/* Evaluate 'max_zero_fill' for short fields. */
997
if (count->field_length <= 8 &&
998
(count->field_type == FIELD_NORMAL ||
999
count->field_type == FIELD_SKIP_ZERO))
1002
/* Zero fields are just counted. Go to the next record. */
1003
if (!memcmp((uchar*) start_pos,zero_string,count->field_length))
1005
count->zero_fields++;
1009
max_zero_fill starts with field_length. It is decreased every
1010
time a shorter "zero trailer" is found. It is set to zero when
1011
an empty field is found (see above). This suggests that the
1012
variable should be called 'min_zero_fill'.
1014
for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1016
if (i < count->max_zero_fill)
1017
count->max_zero_fill=i;
1020
/* Ignore zero fields and check fields. */
1021
if (count->field_type == FIELD_ZERO ||
1022
count->field_type == FIELD_CHECK)
1026
Count the incidence of every byte value in the
1027
significant field value.
1029
for ( ; pos < end_pos ; pos++)
1030
count->counts[(uchar) *pos]++;
1032
/* Step to next field. */
1035
if (tot_blob_length > max_blob_length)
1036
max_blob_length=tot_blob_length;
1038
if (write_loop && record_count % WRITE_COUNT == 0)
1040
VOID(printf("%lu\r", (ulong) record_count));
1041
VOID(fflush(stdout));
1044
else if (error != HA_ERR_RECORD_DELETED)
1046
VOID(fprintf(stderr, "Got error %d while reading rows", error));
1050
/* Step to next record. */
1054
VOID(printf(" \r"));
1055
VOID(fflush(stdout));
1059
VOID(printf("Found the following number of incidents "
1060
"of the byte codes:\n"));
1061
for (count= huff_counts ; count < end_count; count++)
1064
my_off_t total_count;
1068
VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
1069
if (count->tree_buff)
1072
VOID(printf("number of distinct values: %u\n",
1073
(uint) ((count->tree_pos - count->tree_buff) /
1074
count->field_length)));
1077
for (idx= 0; idx < 256; idx++)
1079
if (count->counts[idx])
1081
total_count+= count->counts[idx];
1083
VOID(printf("counts[0x%02x]: %12s\n", idx,
1084
llstr((int64_t) count->counts[idx], llbuf)));
1087
if ((verbose >= 2) && total_count)
1089
VOID(printf("total: %12s\n",
1090
llstr((int64_t) total_count, llbuf)));
1094
mrg->records=record_count;
1095
mrg->max_blob_length=max_blob_length;
1096
my_afree((uchar*) record);
1097
return(error != HA_ERR_END_OF_FILE);
1100
static int compare_huff_elements(void *not_used __attribute__((unused)),
1103
return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1104
(*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1);
1107
/* Check each tree if we should use pre-space-compress, end-space-
1108
compress, empty-field-compress or zero-field-compress */
1110
static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1113
uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1114
my_off_t old_length,new_length,length;
1116
bzero((uchar*) field_count,sizeof(field_count));
1117
space_fields=fill_zero_fields=0;
1119
for (; trees-- ; huff_counts++)
1121
if (huff_counts->field_type == FIELD_BLOB)
1123
huff_counts->length_bits=max_bit(huff_counts->max_length);
1126
else if (huff_counts->field_type == FIELD_VARCHAR)
1128
huff_counts->length_bits=max_bit(huff_counts->max_length);
1131
else if (huff_counts->field_type == FIELD_CHECK)
1133
huff_counts->bytes_packed=0;
1134
huff_counts->counts[0]=0;
1138
huff_counts->field_type=FIELD_NORMAL;
1139
huff_counts->pack_type=0;
1141
/* Check for zero-filled records (in this column), or zero records. */
1142
if (huff_counts->zero_fields || ! records)
1144
my_off_t old_space_count;
1146
If there are only zero filled records (in this column),
1147
or no records at all, we are done.
1149
if (huff_counts->zero_fields == records)
1151
huff_counts->field_type= FIELD_ZERO;
1152
huff_counts->bytes_packed=0;
1153
huff_counts->counts[0]=0;
1156
/* Remeber the number of significant spaces. */
1157
old_space_count=huff_counts->counts[' '];
1158
/* Add all leading and trailing spaces. */
1159
huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1160
huff_counts->tot_pre_space +
1161
huff_counts->empty_fields *
1162
huff_counts->field_length);
1163
/* Check, what the compressed length of this would be. */
1164
old_length=calc_packed_length(huff_counts,0)+records/8;
1165
/* Get the number of zero bytes. */
1166
length=huff_counts->zero_fields*huff_counts->field_length;
1167
/* Add it to the counts. */
1168
huff_counts->counts[0]+=length;
1169
/* Check, what the compressed length of this would be. */
1170
new_length=calc_packed_length(huff_counts,0);
1171
/* If the compression without the zeroes would be shorter, we are done. */
1172
if (old_length < new_length && huff_counts->field_length > 1)
1174
huff_counts->field_type=FIELD_SKIP_ZERO;
1175
huff_counts->counts[0]-=length;
1176
huff_counts->bytes_packed=old_length- records/8;
1179
/* Remove the insignificant spaces, but keep the zeroes. */
1180
huff_counts->counts[' ']=old_space_count;
1182
/* Check, what the compressed length of this column would be. */
1183
huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1186
If there are enough empty records (in this column),
1187
treating them specially may pay off.
1189
if (huff_counts->empty_fields)
1191
if (huff_counts->field_length > 2 &&
1192
huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1193
(1+max_bit(max(huff_counts->max_pre_space,
1194
huff_counts->max_end_space))) <
1195
records * max_bit(huff_counts->field_length))
1197
huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1201
length=huff_counts->empty_fields*huff_counts->field_length;
1202
if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1204
huff_counts->tot_end_space+=length;
1205
huff_counts->max_end_space=huff_counts->field_length;
1206
if (huff_counts->field_length < 8)
1207
huff_counts->end_space[huff_counts->field_length]+=
1208
huff_counts->empty_fields;
1210
if (huff_counts->tot_pre_space)
1212
huff_counts->tot_pre_space+=length;
1213
huff_counts->max_pre_space=huff_counts->field_length;
1214
if (huff_counts->field_length < 8)
1215
huff_counts->pre_space[huff_counts->field_length]+=
1216
huff_counts->empty_fields;
1222
If there are enough trailing spaces (in this column),
1223
treating them specially may pay off.
1225
if (huff_counts->tot_end_space)
1227
huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1228
if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1229
huff_counts->end_space,
1230
huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1232
huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1236
If there are enough leading spaces (in this column),
1237
treating them specially may pay off.
1239
if (huff_counts->tot_pre_space)
1241
if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1242
huff_counts->pre_space,
1243
huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1247
found_pack: /* Found field-packing */
1249
/* Test if we can use zero-fill */
1251
if (huff_counts->max_zero_fill &&
1252
(huff_counts->field_type == FIELD_NORMAL ||
1253
huff_counts->field_type == FIELD_SKIP_ZERO))
1255
huff_counts->counts[0]-=huff_counts->max_zero_fill*
1256
(huff_counts->field_type == FIELD_SKIP_ZERO ?
1257
records - huff_counts->zero_fields : records);
1258
huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1259
huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1262
/* Test if intervall-field is better */
1264
if (huff_counts->tree_buff)
1268
tree.element_buffer=0;
1269
if (!make_huff_tree(&tree,huff_counts) &&
1270
tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1272
if (tree.elements == 1)
1273
huff_counts->field_type=FIELD_CONSTANT;
1275
huff_counts->field_type=FIELD_INTERVALL;
1276
huff_counts->pack_type=0;
1280
my_free((uchar*) huff_counts->tree_buff,MYF(0));
1281
delete_tree(&huff_counts->int_tree);
1282
huff_counts->tree_buff=0;
1284
if (tree.element_buffer)
1285
my_free((uchar*) tree.element_buffer,MYF(0));
1287
if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1289
if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1291
field_count[huff_counts->field_type]++;
1294
VOID(printf("\nnormal: %3d empty-space: %3d "
1295
"empty-zero: %3d empty-fill: %3d\n"
1296
"pre-space: %3d end-space: %3d "
1297
"intervall-fields: %3d zero: %3d\n",
1298
field_count[FIELD_NORMAL],space_fields,
1299
field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1300
field_count[FIELD_SKIP_PRESPACE],
1301
field_count[FIELD_SKIP_ENDSPACE],
1302
field_count[FIELD_INTERVALL],
1303
field_count[FIELD_ZERO]));
1307
/* Test if we can use space-compression and empty-field-compression */
1310
test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1311
uint max_space_length, my_off_t *space_counts,
1312
my_off_t tot_space_count, enum en_fieldtype field_type)
1316
my_off_t space_count,min_space_count,min_pack,new_length,skip;
1318
length_bits=max_bit(max_space_length);
1320
/* Default no end_space-packing */
1321
space_count=huff_counts->counts[(uint) ' '];
1322
min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1323
min_pack=calc_packed_length(huff_counts,0);
1325
huff_counts->counts[(uint) ' ']=space_count;
1327
/* Test with allways space-count */
1328
new_length=huff_counts->bytes_packed+length_bits*records/8;
1329
if (new_length+1 < min_pack)
1332
min_pack=new_length;
1333
min_space_count=space_count;
1335
/* Test with length-flag */
1336
for (skip=0L, i=0 ; i < 8 ; i++)
1338
if (space_counts[i])
1341
huff_counts->counts[(uint) ' ']+=space_counts[i];
1342
skip+=huff_counts->pre_space[i];
1343
new_length=calc_packed_length(huff_counts,0)+
1344
(records+(records-skip)*(1+length_bits))/8;
1345
if (new_length < min_pack)
1348
min_pack=new_length;
1349
min_space_count=huff_counts->counts[(uint) ' '];
1354
huff_counts->counts[(uint) ' ']=min_space_count;
1355
huff_counts->bytes_packed=min_pack;
1358
return(0); /* No space-compress */
1359
case -1: /* Always space-count */
1360
huff_counts->field_type=field_type;
1361
huff_counts->min_space=0;
1362
huff_counts->length_bits=max_bit(max_space_length);
1365
huff_counts->field_type=field_type;
1366
huff_counts->min_space=(uint) min_pos;
1367
huff_counts->pack_type|=PACK_TYPE_SELECTED;
1368
huff_counts->length_bits=max_bit(max_space_length);
1371
return(1); /* Using space-compress */
1375
/* Make a huff_tree of each huff_count */
1377
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1380
HUFF_TREE *huff_tree;
1382
if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1383
MYF(MY_WME | MY_ZEROFILL))))
1386
for (tree=0 ; tree < trees ; tree++)
1388
if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1391
my_free((uchar*) huff_tree[tree].element_buffer,MYF(0));
1392
my_free((uchar*) huff_tree,MYF(0));
1400
Build a Huffman tree.
1404
huff_tree The Huffman tree.
1405
huff_counts The counts.
1408
Build a Huffman tree according to huff_counts->counts or
1409
huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1410
tree_buff_length of distinct column values. In that case, whole
1411
values can be Huffman encoded instead of single bytes.
1418
static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1420
uint i,found,bits_packed,first,last;
1421
my_off_t bytes_packed;
1422
HUFF_ELEMENT *a,*b,*new_huff_el;
1425
if (huff_counts->tree_buff)
1427
/* Calculate the number of distinct values in tree_buff. */
1428
found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1429
huff_counts->field_length;
1430
first=0; last=found-1;
1434
/* Count the number of byte codes found in the column. */
1435
for (i=found=0 ; i < 256 ; i++)
1437
if (huff_counts->counts[i])
1448
/* When using 'tree_buff' we can have more that 256 values. */
1449
if (queue.max_elements < found)
1451
delete_queue(&queue);
1452
if (init_queue(&queue,found,0,0,compare_huff_elements,0))
1456
/* Allocate or reallocate an element buffer for the Huffman tree. */
1457
if (!huff_tree->element_buffer)
1459
if (!(huff_tree->element_buffer=
1460
(HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1467
(HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1468
found*2*sizeof(HUFF_ELEMENT),
1471
huff_tree->element_buffer=temp;
1474
huff_counts->tree=huff_tree;
1475
huff_tree->counts=huff_counts;
1476
huff_tree->min_chr=first;
1477
huff_tree->max_chr=last;
1478
huff_tree->char_bits=max_bit(last-first);
1479
huff_tree->offset_bits=max_bit(found-1)+1;
1481
if (huff_counts->tree_buff)
1483
huff_tree->elements=0;
1484
huff_tree->tree_pack_length=(1+15+16+5+5+
1485
(huff_tree->char_bits+1)*found+
1486
(huff_tree->offset_bits+1)*
1488
(uint) (huff_tree->counts->tree_pos-
1489
huff_tree->counts->tree_buff);
1491
Put a HUFF_ELEMENT into the queue for every distinct column value.
1493
tree_walk() calls save_counts_in_queue() for every element in
1494
'int_tree'. This takes elements from the target trees element
1495
buffer and places references to them into the buffer of the
1496
priority queue. We insert in column value order, but the order is
1497
in fact irrelevant here. We will establish the correct order
1500
tree_walk(&huff_counts->int_tree,
1501
(int (*)(void*, element_count,void*)) save_counts_in_queue,
1502
(uchar*) huff_tree, left_root_right);
1506
huff_tree->elements=found;
1507
huff_tree->tree_pack_length=(9+9+5+5+
1508
(huff_tree->char_bits+1)*found+
1509
(huff_tree->offset_bits+1)*
1512
Put a HUFF_ELEMENT into the queue for every byte code found in the column.
1514
The elements are taken from the target trees element buffer.
1515
Instead of using queue_insert(), we just place references to the
1516
elements into the buffer of the priority queue. We insert in byte
1517
value order, but the order is in fact irrelevant here. We will
1518
establish the correct order later.
1520
for (i=first, found=0 ; i <= last ; i++)
1522
if (huff_counts->counts[i])
1524
new_huff_el=huff_tree->element_buffer+(found++);
1525
new_huff_el->count=huff_counts->counts[i];
1526
new_huff_el->a.leaf.null=0;
1527
new_huff_el->a.leaf.element_nr=i;
1528
queue.root[found]=(uchar*) new_huff_el;
1532
If there is only a single byte value in this field in all records,
1533
add a second element with zero incidence. This is required to enter
1534
the loop, which builds the Huffman tree.
1538
new_huff_el=huff_tree->element_buffer+(found++);
1539
new_huff_el->count=0;
1540
new_huff_el->a.leaf.null=0;
1542
new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1544
new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1545
queue.root[found]=(uchar*) new_huff_el;
1549
/* Make a queue from the queue buffer. */
1550
queue.elements=found;
1553
Make a priority queue from the queue. Construct its index so that we
1554
have a partially ordered tree.
1556
for (i=found/2 ; i > 0 ; i--)
1557
_downheap(&queue,i);
1559
/* The Huffman algorithm. */
1560
bytes_packed=0; bits_packed=0;
1561
for (i=1 ; i < found ; i++)
1564
Pop the top element from the queue (the one with the least incidence).
1565
Popping from a priority queue includes a re-ordering of the queue,
1566
to get the next least incidence element to the top.
1568
a=(HUFF_ELEMENT*) queue_remove(&queue,0);
1570
Copy the next least incidence element. The queue implementation
1571
reserves root[0] for temporary purposes. root[1] is the top.
1573
b=(HUFF_ELEMENT*) queue.root[1];
1574
/* Get a new element from the element buffer. */
1575
new_huff_el=huff_tree->element_buffer+found+i;
1576
/* The new element gets the sum of the two least incidence elements. */
1577
new_huff_el->count=a->count+b->count;
1579
The Huffman algorithm assigns another bit to the code for a byte
1580
every time that bytes incidence is combined (directly or indirectly)
1581
to a new element as one of the two least incidence elements.
1582
This means that one more bit per incidence of that byte is required
1583
in the resulting file. So we add the new combined incidence as the
1584
number of bits by which the result grows.
1586
bits_packed+=(uint) (new_huff_el->count & 7);
1587
bytes_packed+=new_huff_el->count/8;
1588
/* The new element points to its children, lesser in left. */
1589
new_huff_el->a.nod.left=a;
1590
new_huff_el->a.nod.right=b;
1592
Replace the copied top element by the new element and re-order the
1595
queue.root[1]=(uchar*) new_huff_el;
1596
queue_replaced(&queue);
1598
huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1599
huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1603
static int compare_tree(void* cmp_arg __attribute__((unused)),
1604
register const uchar *s, register const uchar *t)
1607
for (length=global_count->field_length; length-- ;)
1609
return (int) s[-1] - (int) t[-1];
1614
Organize distinct column values and their incidences into a priority queue.
1617
save_counts_in_queue()
1618
key The column value.
1619
count The incidence of this value.
1620
tree The Huffman tree to be built later.
1623
We use the element buffer of the targeted tree. The distinct column
1624
values are organized in a priority queue first. The Huffman
1625
algorithm will later organize the elements into a Huffman tree. For
1626
the time being, we just place references to the elements into the
1627
queue buffer. The buffer will later be organized into a priority
1634
static int save_counts_in_queue(uchar *key, element_count count,
1637
HUFF_ELEMENT *new_huff_el;
1639
new_huff_el=tree->element_buffer+(tree->elements++);
1640
new_huff_el->count=count;
1641
new_huff_el->a.leaf.null=0;
1642
new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1643
tree->counts->field_length;
1644
queue.root[tree->elements]=(uchar*) new_huff_el;
1650
Calculate length of file if given counts should be used.
1653
calc_packed_length()
1654
huff_counts The counts for a column of the table(s).
1655
add_tree_lenght If the decode tree length should be added.
1658
We need to follow the Huffman algorithm until we know, how many bits
1659
are required for each byte code. But we do not need the resulting
1660
Huffman tree. Hence, we can leave out some steps which are essential
1661
in make_huff_tree().
1664
Number of bytes required to compress this table column.
1667
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1668
uint add_tree_lenght)
1670
uint i,found,bits_packed,first,last;
1671
my_off_t bytes_packed;
1672
HUFF_ELEMENT element_buffer[256];
1675
WARNING: We use a small hack for efficiency: Instead of placing
1676
references to HUFF_ELEMENTs into the queue, we just insert
1677
references to the counts of the byte codes which appeared in this
1678
table column. During the Huffman algorithm they are successively
1679
replaced by references to HUFF_ELEMENTs. This works, because
1680
HUFF_ELEMENTs have the incidence count at their beginning.
1681
Regardless, wether the queue array contains references to counts of
1682
type my_off_t or references to HUFF_ELEMENTs which have the count of
1683
type my_off_t at their beginning, it always points to a count of the
1686
Instead of using queue_insert(), we just copy the references into
1687
the buffer of the priority queue. We insert in byte value order, but
1688
the order is in fact irrelevant here. We will establish the correct
1692
for (i=found=0 ; i < 256 ; i++)
1694
if (huff_counts->counts[i])
1699
/* We start with root[1], which is the queues top element. */
1700
queue.root[found]=(uchar*) &huff_counts->counts[i];
1704
return(0); /* Empty tree */
1706
If there is only a single byte value in this field in all records,
1707
add a second element with zero incidence. This is required to enter
1708
the loop, which follows the Huffman algorithm.
1711
queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1713
/* Make a queue from the queue buffer. */
1714
queue.elements=found;
1716
bytes_packed=0; bits_packed=0;
1717
/* Add the length of the coding table, which would become part of the file. */
1718
if (add_tree_lenght)
1719
bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1720
(max_bit(found-1)+1+1)*(found-2) +7)/8;
1723
Make a priority queue from the queue. Construct its index so that we
1724
have a partially ordered tree.
1726
for (i=(found+1)/2 ; i > 0 ; i--)
1727
_downheap(&queue,i);
1729
/* The Huffman algorithm. */
1730
for (i=0 ; i < found-1 ; i++)
1734
HUFF_ELEMENT *new_huff_el;
1737
Pop the top element from the queue (the one with the least
1738
incidence). Popping from a priority queue includes a re-ordering
1739
of the queue, to get the next least incidence element to the top.
1741
a= (my_off_t*) queue_remove(&queue, 0);
1743
Copy the next least incidence element. The queue implementation
1744
reserves root[0] for temporary purposes. root[1] is the top.
1746
b= (my_off_t*) queue.root[1];
1747
/* Create a new element in a local (automatic) buffer. */
1748
new_huff_el= element_buffer + i;
1749
/* The new element gets the sum of the two least incidence elements. */
1750
new_huff_el->count= *a + *b;
1752
The Huffman algorithm assigns another bit to the code for a byte
1753
every time that bytes incidence is combined (directly or indirectly)
1754
to a new element as one of the two least incidence elements.
1755
This means that one more bit per incidence of that byte is required
1756
in the resulting file. So we add the new combined incidence as the
1757
number of bits by which the result grows.
1759
bits_packed+=(uint) (new_huff_el->count & 7);
1760
bytes_packed+=new_huff_el->count/8;
1762
Replace the copied top element by the new element and re-order the
1763
queue. This successively replaces the references to counts by
1764
references to HUFF_ELEMENTs.
1766
queue.root[1]=(uchar*) new_huff_el;
1767
queue_replaced(&queue);
1769
return(bytes_packed+(bits_packed+7)/8);
1773
/* Remove trees that don't give any compression */
1775
static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1778
HUFF_COUNTS count,*i,*j,*last_count;
1780
last_count=huff_counts+trees;
1781
for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1783
if (!i->tree->tree_number)
1785
i->tree->tree_number= ++tree_number;
1787
continue; /* Don't join intervall */
1788
for (j=i+1 ; j < last_count ; j++)
1790
if (! j->tree->tree_number && ! j->tree_buff)
1792
for (k=0 ; k < 256 ; k++)
1793
count.counts[k]=i->counts[k]+j->counts[k];
1794
if (calc_packed_length(&count,1) <=
1795
i->tree->bytes_packed + j->tree->bytes_packed+
1796
i->tree->tree_pack_length+j->tree->tree_pack_length+
1799
memcpy_fixed((uchar*) i->counts,(uchar*) count.counts,
1800
sizeof(count.counts[0])*256);
1801
my_free((uchar*) j->tree->element_buffer,MYF(0));
1802
j->tree->element_buffer=0;
1804
bmove((uchar*) i->counts,(uchar*) count.counts,
1805
sizeof(count.counts[0])*256);
1806
if (make_huff_tree(i->tree,i))
1814
VOID(printf("Original trees: %d After join: %d\n", trees, tree_number));
1815
return tree_number; /* Return trees left */
1820
Fill in huff_tree encode tables.
1823
make_huff_decode_table()
1824
huff_tree An array of HUFF_TREE which are to be encoded.
1825
trees The number of HUFF_TREE in the array.
1832
static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1835
for ( ; trees-- ; huff_tree++)
1837
if (huff_tree->tree_number > 0)
1839
elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1840
if (!(huff_tree->code =
1841
(uint64_t*) my_malloc(elements*
1842
(sizeof(uint64_t) + sizeof(uchar)),
1843
MYF(MY_WME | MY_ZEROFILL))))
1845
huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1846
make_traverse_code_tree(huff_tree, huff_tree->root,
1847
8 * sizeof(uint64_t), 0LL);
1854
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1855
HUFF_ELEMENT *element,
1856
uint size, uint64_t code)
1859
if (!element->a.leaf.null)
1861
chr=element->a.leaf.element_nr;
1862
huff_tree->code_len[chr]= (uchar) (8 * sizeof(uint64_t) - size);
1863
huff_tree->code[chr]= (code >> size);
1864
if (huff_tree->height < 8 * sizeof(uint64_t) - size)
1865
huff_tree->height= 8 * sizeof(uint64_t) - size;
1870
make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1871
make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1872
code + (((uint64_t) 1) << size));
1879
Convert a value into binary digits.
1884
length The number of low order bits to convert.
1887
The result string is in static storage. It is reused on every call.
1888
So you cannot use it twice in one expression.
1891
A pointer to a static NUL-terminated string.
1894
static char *bindigits(uint64_t value, uint bits)
1896
static char digits[72];
1900
assert(idx < sizeof(digits));
1902
*(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
1909
Convert a value into hexadecimal digits.
1916
The result string is in static storage. It is reused on every call.
1917
So you cannot use it twice in one expression.
1920
A pointer to a static NUL-terminated string.
1923
static char *hexdigits(uint64_t value)
1925
static char digits[20];
1927
uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
1929
assert(idx < sizeof(digits));
1932
if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
1933
*(ptr - 1)+= 'a' - '9' - 1;
1940
/* Write header to new packed data file */
1942
static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
1943
my_off_t tot_elements,my_off_t filelength)
1945
uchar *buff= (uchar*) file_buffer.pos;
1947
bzero(buff,HEAD_LENGTH);
1948
memcpy_fixed(buff,myisam_pack_file_magic,4);
1949
int4store(buff+4,head_length);
1950
int4store(buff+8, mrg->min_pack_length);
1951
int4store(buff+12,mrg->max_pack_length);
1952
int4store(buff+16,tot_elements);
1953
int4store(buff+20,intervall_length);
1954
int2store(buff+24,trees);
1955
buff[26]=(char) mrg->ref_length;
1956
/* Save record pointer length */
1957
buff[27]= (uchar) mi_get_pointer_length((uint64_t) filelength,2);
1960
VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
1961
return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
1962
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
1965
/* Write fieldinfo to new packed file */
1967
static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
1970
uint huff_tree_bits;
1971
huff_tree_bits=max_bit(trees ? trees-1 : 0);
1976
VOID(printf("column types:\n"));
1977
VOID(printf("FIELD_NORMAL 0\n"));
1978
VOID(printf("FIELD_SKIP_ENDSPACE 1\n"));
1979
VOID(printf("FIELD_SKIP_PRESPACE 2\n"));
1980
VOID(printf("FIELD_SKIP_ZERO 3\n"));
1981
VOID(printf("FIELD_BLOB 4\n"));
1982
VOID(printf("FIELD_CONSTANT 5\n"));
1983
VOID(printf("FIELD_INTERVALL 6\n"));
1984
VOID(printf("FIELD_ZERO 7\n"));
1985
VOID(printf("FIELD_VARCHAR 8\n"));
1986
VOID(printf("FIELD_CHECK 9\n"));
1988
VOID(printf("pack type as a set of flags:\n"));
1989
VOID(printf("PACK_TYPE_SELECTED 1\n"));
1990
VOID(printf("PACK_TYPE_SPACE_FIELDS 2\n"));
1991
VOID(printf("PACK_TYPE_ZERO_FILL 4\n"));
1994
for (i=0 ; i++ < fields ; counts++)
1996
write_bits((uint64_t) (int) counts->field_type, 5);
1997
write_bits(counts->pack_type,6);
1998
if (counts->pack_type & PACK_TYPE_ZERO_FILL)
1999
write_bits(counts->max_zero_fill,5);
2001
write_bits(counts->length_bits,5);
2002
write_bits((uint64_t) counts->tree->tree_number - 1, huff_tree_bits);
2004
VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
2005
"tree: %2u length: %4u\n", i , counts->field_type,
2006
counts->pack_type, counts->max_zero_fill, counts->length_bits,
2007
counts->tree->tree_number, counts->field_length));
2013
/* Write all huff_trees to new datafile. Return tot count of
2014
elements in all trees
2015
Returns 0 on error */
2017
static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2023
uint *packed_tree,*offset,length;
2026
/* Find the highest number of elements in the trees. */
2027
for (i=length=0 ; i < trees ; i++)
2028
if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2029
length=huff_tree[i].elements;
2031
Allocate a buffer for packing a decode tree. Two numbers per element
2032
(left child and right child).
2034
if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2036
my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2044
for (elements=0; trees-- ; huff_tree++)
2046
/* Skip columns that have been joined with other columns. */
2047
if (huff_tree->tree_number == 0)
2048
continue; /* Deleted tree */
2052
/* Count the total number of elements (byte codes or column values). */
2053
elements+=huff_tree->elements;
2054
huff_tree->max_offset=2;
2055
/* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2056
if (huff_tree->elements <= 1)
2059
offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2061
/* This should be the same as 'length' above. */
2062
huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2065
Since we check this during collecting the distinct column values,
2066
this should never happen.
2068
if (huff_tree->max_offset >= IS_OFFSET)
2069
{ /* This should be impossible */
2070
VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2071
huff_tree->max_offset));
2072
my_afree((uchar*) packed_tree);
2076
if (!huff_tree->counts->tree_buff)
2078
/* We do a byte compression on this column. Mark with bit 0. */
2080
write_bits(huff_tree->min_chr,8);
2081
write_bits(huff_tree->elements,9);
2082
write_bits(huff_tree->char_bits,5);
2083
write_bits(huff_tree->offset_bits,5);
2088
int_length=(uint) (huff_tree->counts->tree_pos -
2089
huff_tree->counts->tree_buff);
2090
/* We have distinct column values for this column. Mark with bit 1. */
2092
write_bits(huff_tree->elements,15);
2093
write_bits(int_length,16);
2094
write_bits(huff_tree->char_bits,5);
2095
write_bits(huff_tree->offset_bits,5);
2096
intervall_length+=int_length;
2099
VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
2100
"%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
2101
huff_tree->char_bits, huff_tree->offset_bits,
2102
huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2103
huff_tree->counts->tree_buff ? int_length :
2104
huff_tree->min_chr, huff_tree->height));
2106
/* Check that the code tree length matches the element count. */
2107
length=(uint) (offset-packed_tree);
2108
if (length != huff_tree->elements*2-2)
2110
VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2111
length, huff_tree->elements * 2 - 2));
2116
for (i=0 ; i < length ; i++)
2118
if (packed_tree[i] & IS_OFFSET)
2119
write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2120
huff_tree->offset_bits+1);
2122
write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2124
VOID(printf("tree[0x%04x]: %s0x%04x\n",
2125
i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2126
(packed_tree[i] & IS_OFFSET) ?
2127
packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2132
Display coding tables and check their correctness.
2134
codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2135
for (i= 0; i < codes; i++)
2142
if (! (len= huff_tree->code_len[i]))
2145
VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
2146
hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2147
bindigits(huff_tree->code[i], huff_tree->code_len[i])));
2149
/* Check that the encode table decodes correctly. */
2157
VOID(fflush(stdout));
2158
VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2159
hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
2164
code|= (huff_tree->code[i] >> (--len)) & 1;
2166
if (bits > 8 * sizeof(code))
2168
VOID(fflush(stdout));
2169
VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2170
bits, (uint) (8 * sizeof(code))));
2174
idx+= (uint) code & 1;
2177
VOID(fflush(stdout));
2178
VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2183
if (packed_tree[idx] & IS_OFFSET)
2184
idx+= packed_tree[idx] & ~IS_OFFSET;
2186
break; /* Hit a leaf. This contains the result value. */
2191
if (packed_tree[idx] != i)
2193
VOID(fflush(stdout));
2194
VOID(fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n",
2195
packed_tree[idx], i));
2199
} /*end for (codes)*/
2203
/* Write column values in case of distinct column value compression. */
2204
if (huff_tree->counts->tree_buff)
2206
for (i=0 ; i < int_length ; i++)
2208
write_bits((uint64_t) (uchar) huff_tree->counts->tree_buff[i], 8);
2210
VOID(printf("column_values[0x%04x]: 0x%02x\n",
2211
i, (uchar) huff_tree->counts->tree_buff[i]));
2218
my_afree((uchar*) packed_tree);
2221
VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
2228
static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2233
prev_offset= offset;
2235
'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2236
then there is no left child and, hence no right child either. This
2237
is a property of a binary tree. An element is either a node with two
2238
childs, or a leaf without childs.
2240
The current element is always a node with two childs. Go left first.
2242
if (!element->a.nod.left->a.leaf.null)
2244
/* Store the byte code or the index of the column value. */
2245
prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2251
Recursively traverse the tree to the left. Mark it as an offset to
2252
another tree node (in contrast to a byte code or column value index).
2254
prev_offset[0]= IS_OFFSET+2;
2255
offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2258
/* Now, check the right child. */
2259
if (!element->a.nod.right->a.leaf.null)
2261
/* Store the byte code or the index of the column value. */
2262
prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2268
Recursively traverse the tree to the right. Mark it as an offset to
2269
another tree node (in contrast to a byte code or column value index).
2271
uint temp=(uint) (offset-prev_offset-1);
2272
prev_offset[1]= IS_OFFSET+ temp;
2273
if (huff_tree->max_offset < temp)
2274
huff_tree->max_offset = temp;
2275
return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2279
/* Get number of bits neaded to represent value */
2281
static uint max_bit(register uint value)
2283
register uint power=1;
2291
static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2294
uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length,
2295
intervall,field_length,max_pack_length,pack_blob_length;
2296
my_off_t record_count;
2298
ulong length,pack_length;
2299
uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2300
HUFF_COUNTS *count,*end_count;
2302
MI_INFO *isam_file=mrg->file[0];
2303
uint pack_version= (uint) isam_file->s->pack.version;
2305
/* Allocate a buffer for the records (excluding blobs). */
2306
if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2309
end_count=huff_counts+isam_file->s->base.fields;
2310
min_record_length= (uint) ~0;
2311
max_record_length=0;
2314
Calculate the maximum number of bits required to pack the records.
2315
Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2316
The tree height determines the maximum number of bits per value.
2317
Some fields skip leading or trailing spaces or zeroes. The skipped
2318
number of bytes is encoded by 'length_bits' bits.
2319
Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2320
and varchar get a leading 0 bit.
2322
for (i=max_calc_length=0 ; i < isam_file->s->base.fields ; i++)
2324
if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2325
huff_counts[i].max_zero_fill=0;
2326
if (huff_counts[i].field_type == FIELD_CONSTANT ||
2327
huff_counts[i].field_type == FIELD_ZERO ||
2328
huff_counts[i].field_type == FIELD_CHECK)
2330
if (huff_counts[i].field_type == FIELD_INTERVALL)
2331
max_calc_length+=huff_counts[i].tree->height;
2332
else if (huff_counts[i].field_type == FIELD_BLOB ||
2333
huff_counts[i].field_type == FIELD_VARCHAR)
2334
max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2337
(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2338
huff_counts[i].tree->height+huff_counts[i].length_bits;
2340
max_calc_length= (max_calc_length + 7) / 8;
2341
pack_ref_length= calc_pack_length(pack_version, max_calc_length);
2343
/* 'max_blob_length' is the max length of all blobs of a record. */
2344
pack_blob_length= isam_file->s->base.blobs ?
2345
calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2346
max_pack_length=pack_ref_length+pack_blob_length;
2349
while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2351
ulong tot_blob_length=0;
2354
if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
2356
record_pos= (uchar*) file_buffer.pos;
2357
file_buffer.pos+=max_pack_length;
2358
for (start_pos=record, count= huff_counts; count < end_count ; count++)
2360
end_pos=start_pos+(field_length=count->field_length);
2363
/* Check if the column contains spaces only. */
2364
if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2366
for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2375
end_pos-=count->max_zero_fill;
2376
field_length-=count->max_zero_fill;
2378
switch (count->field_type) {
2379
case FIELD_SKIP_ZERO:
2380
if (!memcmp((uchar*) start_pos,zero_string,field_length))
2389
for ( ; start_pos < end_pos ; start_pos++)
2391
write_bits(tree->code[(uchar) *start_pos],
2392
(uint) tree->code_len[(uchar) *start_pos]);
2395
case FIELD_SKIP_ENDSPACE:
2396
for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2397
length= (ulong) (end_pos - pos);
2398
if (count->pack_type & PACK_TYPE_SELECTED)
2400
if (length > count->min_space)
2403
write_bits(length,count->length_bits);
2413
write_bits(length,count->length_bits);
2415
/* Encode all significant bytes. */
2416
for ( ; start_pos < pos ; start_pos++)
2418
write_bits(tree->code[(uchar) *start_pos],
2419
(uint) tree->code_len[(uchar) *start_pos]);
2423
case FIELD_SKIP_PRESPACE:
2424
for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2425
length= (ulong) (pos - start_pos);
2426
if (count->pack_type & PACK_TYPE_SELECTED)
2428
if (length > count->min_space)
2431
write_bits(length,count->length_bits);
2441
write_bits(length,count->length_bits);
2443
/* Encode all significant bytes. */
2444
for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2446
write_bits(tree->code[(uchar) *start_pos],
2447
(uint) tree->code_len[(uchar) *start_pos]);
2450
case FIELD_CONSTANT:
2455
case FIELD_INTERVALL:
2457
pos=(uchar*) tree_search(&count->int_tree, start_pos,
2458
count->int_tree.custom_arg);
2459
intervall=(uint) (pos - count->tree_buff)/field_length;
2460
write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2465
ulong blob_length=_mi_calc_blob_length(field_length-
2466
portable_sizeof_char_ptr,
2468
/* Empty blobs are encoded with a single 1 bit. */
2475
uchar *blob,*blob_end;
2477
/* Write the blob length. */
2478
write_bits(blob_length,count->length_bits);
2479
memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
2481
blob_end=blob+blob_length;
2482
/* Encode the blob bytes. */
2483
for ( ; blob < blob_end ; blob++)
2485
write_bits(tree->code[(uchar) *blob],
2486
(uint) tree->code_len[(uchar) *blob]);
2488
tot_blob_length+=blob_length;
2495
uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2496
ulong col_length= (var_pack_length == 1 ?
2497
(uint) *(uchar*) start_pos :
2498
uint2korr(start_pos));
2499
/* Empty varchar are encoded with a single 1 bit. */
2502
write_bits(1,1); /* Empty varchar */
2506
uchar *end= start_pos + var_pack_length + col_length;
2508
/* Write the varchar length. */
2509
write_bits(col_length,count->length_bits);
2510
/* Encode the varchar bytes. */
2511
for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2513
write_bits(tree->code[(uchar) *start_pos],
2514
(uint) tree->code_len[(uchar) *start_pos]);
2521
case FIELD_enum_val_count:
2522
abort(); /* Impossible */
2524
start_pos+=count->max_zero_fill;
2527
length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
2528
pack_length= save_pack_length(pack_version, record_pos, length);
2529
if (pack_blob_length)
2530
pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2532
/* Correct file buffer if the header was smaller */
2533
if (pack_length != max_pack_length)
2535
bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2536
file_buffer.pos-= (max_pack_length-pack_length);
2538
if (length < (ulong) min_record_length)
2539
min_record_length=(uint) length;
2540
if (length > (ulong) max_record_length)
2541
max_record_length=(uint) length;
2543
if (write_loop && record_count % WRITE_COUNT == 0)
2545
VOID(printf("%lu\r", (ulong) record_count));
2546
VOID(fflush(stdout));
2549
else if (error != HA_ERR_RECORD_DELETED)
2552
if (error == HA_ERR_END_OF_FILE)
2556
VOID(fprintf(stderr, "%s: Got error %d reading records\n",
2557
my_progname, error));
2560
VOID(printf("wrote %s records.\n", llstr((int64_t) record_count, llbuf)));
2562
my_afree((uchar*) record);
2563
mrg->ref_length=max_pack_length;
2564
mrg->min_pack_length=max_record_length ? min_record_length : 0;
2565
mrg->max_pack_length=max_record_length;
2566
return(error || error_on_write || flush_buffer(~(ulong) 0));
2570
static char *make_new_name(char *new_name, char *old_name)
2572
return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2575
static char *make_old_name(char *new_name, char *old_name)
2577
return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2580
/* rutines for bit writing buffer */
2582
static void init_file_buffer(File file, bool read_buffer)
2584
file_buffer.file=file;
2585
file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2587
file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2588
file_buffer.pos_in_file=0;
2593
file_buffer.pos=file_buffer.end;
2598
file_buffer.pos=file_buffer.buffer;
2599
file_buffer.bits=BITS_SAVED;
2601
file_buffer.bitbucket= 0;
2605
static int flush_buffer(ulong neaded_length)
2610
file_buffer.end is 8 bytes lower than the real end of the buffer.
2611
This is done so that the end-of-buffer condition does not need to be
2612
checked for every byte (see write_bits()). Consequently,
2613
file_buffer.pos can become greater than file_buffer.end. The
2614
algorithms in the other functions ensure that there will never be
2615
more than 8 bytes written to the buffer without an end-of-buffer
2616
check. So the buffer cannot be overrun. But we need to check for the
2617
near-to-buffer-end condition to avoid a negative result, which is
2618
casted to unsigned and thus becomes giant.
2620
if ((file_buffer.pos < file_buffer.end) &&
2621
((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2623
length=(ulong) (file_buffer.pos-file_buffer.buffer);
2624
file_buffer.pos=file_buffer.buffer;
2625
file_buffer.pos_in_file+=length;
2628
if (error_on_write|| my_write(file_buffer.file,
2629
(const uchar*) file_buffer.buffer,
2631
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2637
if (neaded_length != ~(ulong) 0 &&
2638
(ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2641
neaded_length+=256; /* some margin */
2642
tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
2645
file_buffer.pos= ((uchar*) tmp +
2646
(ulong) (file_buffer.pos - file_buffer.buffer));
2647
file_buffer.buffer= (uchar*) tmp;
2648
file_buffer.end= (uchar*) (tmp+neaded_length-8);
2654
static void end_file_buffer(void)
2656
my_free((uchar*) file_buffer.buffer,MYF(0));
2659
/* output `bits` low bits of `value' */
2661
static void write_bits(register uint64_t value, register uint bits)
2663
assert(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2664
(bits == 8 * sizeof(value)));
2666
if ((file_buffer.bits-= (int) bits) >= 0)
2668
file_buffer.bitbucket|= value << file_buffer.bits;
2672
register uint64_t bit_buffer;
2673
bits= (uint) -file_buffer.bits;
2674
bit_buffer= (file_buffer.bitbucket |
2675
((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2676
#if BITS_SAVED == 64
2677
*file_buffer.pos++= (uchar) (bit_buffer >> 56);
2678
*file_buffer.pos++= (uchar) (bit_buffer >> 48);
2679
*file_buffer.pos++= (uchar) (bit_buffer >> 40);
2680
*file_buffer.pos++= (uchar) (bit_buffer >> 32);
2682
*file_buffer.pos++= (uchar) (bit_buffer >> 24);
2683
*file_buffer.pos++= (uchar) (bit_buffer >> 16);
2684
*file_buffer.pos++= (uchar) (bit_buffer >> 8);
2685
*file_buffer.pos++= (uchar) (bit_buffer);
2687
if (bits != 8 * sizeof(value))
2688
value&= (((uint64_t) 1) << bits) - 1;
2689
if (file_buffer.pos >= file_buffer.end)
2690
VOID(flush_buffer(~ (ulong) 0));
2691
file_buffer.bits=(int) (BITS_SAVED - bits);
2692
file_buffer.bitbucket= value << (BITS_SAVED - bits);
2697
/* Flush bits in bit_buffer to buffer */
2699
static void flush_bits(void)
2702
uint64_t bit_buffer;
2704
bits= file_buffer.bits & ~7;
2705
bit_buffer= file_buffer.bitbucket >> bits;
2706
bits= BITS_SAVED - bits;
2710
*file_buffer.pos++= (uchar) (bit_buffer >> bits);
2712
if (file_buffer.pos >= file_buffer.end)
2713
VOID(flush_buffer(~ (ulong) 0));
2714
file_buffer.bits= BITS_SAVED;
2715
file_buffer.bitbucket= 0;
2719
/****************************************************************************
2720
** functions to handle the joined files
2721
****************************************************************************/
2723
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
2726
MYISAM_SHARE *share=isam_file->s;
2727
uint options=mi_uint2korr(share->state.header.options);
2730
options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2731
mi_int2store(share->state.header.options,options);
2733
share->state.state.data_file_length=new_length;
2734
share->state.state.del=0;
2735
share->state.state.empty=0;
2736
share->state.dellink= HA_OFFSET_ERROR;
2737
share->state.split=(ha_rows) mrg->records;
2738
share->state.version=(ulong) time((time_t*) 0);
2739
if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
2742
Some indexes are disabled, cannot use current key_file_length value
2743
as an estimate of upper bound of index file size. Use packed data file
2746
share->state.state.key_file_length= new_length;
2749
If there are no disabled indexes, keep key_file_length value from
2750
original file so "myisamchk -rq" can use this value (this is necessary
2751
because index size cannot be easily calculated for fulltext keys)
2753
mi_clear_all_keys_active(share->state.key_map);
2754
for (key=0 ; key < share->base.keys ; key++)
2755
share->state.key_root[key]= HA_OFFSET_ERROR;
2756
for (key=0 ; key < share->state.header.max_block_size_index ; key++)
2757
share->state.key_del[key]= HA_OFFSET_ERROR;
2758
isam_file->state->checksum=crc; /* Save crc here */
2759
share->changed=1; /* Force write of header */
2760
share->state.open_count=0;
2761
share->global_changed=0;
2762
(void)ftruncate(share->kfile, share->base.keystart);
2763
if (share->base.keys)
2765
return(mi_state_info_write(share->kfile,&share->state,1+2));
2769
static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
2772
MI_STATE_INFO state;
2773
MI_INFO *isam_file=mrg->file[0];
2776
state= isam_file->s->state;
2777
options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
2778
HA_OPTION_READ_ONLY_DATA);
2779
mi_int2store(state.header.options,options);
2780
state.state.data_file_length=new_length;
2782
state.state.empty=0;
2783
state.state.records=state.split=(ha_rows) mrg->records;
2784
/* See comment above in save_state about key_file_length handling. */
2785
if (mrg->src_file_has_indexes_disabled)
2787
isam_file->s->state.state.key_file_length=
2788
max(isam_file->s->state.state.key_file_length, new_length);
2790
state.dellink= HA_OFFSET_ERROR;
2791
state.version=(ulong) time((time_t*) 0);
2792
mi_clear_all_keys_active(state.key_map);
2793
state.state.checksum=crc;
2794
if (isam_file->s->base.keys)
2796
state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
2797
return (mi_state_info_write(file,&state,1+2));
2801
/* reset for mrg_rrnd */
2803
static void mrg_reset(PACK_MRG_INFO *mrg)
2807
mi_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
2812
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
2820
isam_info= *(info->current=info->file);
2821
info->end=info->current+info->count;
2822
mi_reset(isam_info);
2823
mi_extra(isam_info, HA_EXTRA_CACHE, 0);
2824
filepos=isam_info->s->pack.header_length;
2828
isam_info= *info->current;
2829
filepos= isam_info->nextpos;
2834
isam_info->update&= HA_STATE_CHANGED;
2835
if (!(error=(*isam_info->s->read_rnd)(isam_info,(uchar*) buf,
2837
error != HA_ERR_END_OF_FILE)
2839
mi_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
2840
if (info->current+1 == info->end)
2841
return(HA_ERR_END_OF_FILE);
2843
isam_info= *info->current;
2844
filepos=isam_info->s->pack.header_length;
2845
mi_reset(isam_info);
2846
mi_extra(isam_info,HA_EXTRA_CACHE, 0);
2851
static int mrg_close(PACK_MRG_INFO *mrg)
2855
for (i=0 ; i < mrg->count ; i++)
2856
error|=mi_close(mrg->file[i]);
2858
my_free((uchar*) mrg->file,MYF(0));