1
/* Copyright (C) 2000-2006 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16
/* Pack MyISAM file */
19
#define USE_MY_FUNC /* We need at least my_malloc */
22
#include "myisamdef.h"
25
#include "mysys_err.h"
26
#ifndef __GNU_LIBRARY__
27
#define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
29
#include <my_getopt.h>
32
#if SIZEOF_LONG_LONG > 4
38
#define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
39
#define HEAD_LENGTH 32
40
#define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
42
#define DATA_TMP_EXT ".TMD"
43
#define OLD_EXT ".OLD"
44
#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
46
struct st_file_buffer {
48
uchar *buffer,*pos,*end;
55
struct st_huff_element;
57
typedef struct st_huff_counts {
58
uint field_length,max_zero_fill;
60
uint max_end_space,max_pre_space,length_bits,min_space;
62
enum en_fieldtype field_type;
63
struct st_huff_tree *tree; /* Tree for field */
65
my_off_t end_space[8];
66
my_off_t pre_space[8];
67
my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
68
TREE int_tree; /* Tree for detecting distinct column values. */
69
uchar *tree_buff; /* Column values, 'field_length' each. */
70
uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */
73
typedef struct st_huff_element HUFF_ELEMENT;
76
WARNING: It is crucial for the optimizations in calc_packed_length()
77
that 'count' is the first element of 'HUFF_ELEMENT'.
79
struct st_huff_element {
83
HUFF_ELEMENT *left,*right;
87
uint element_nr; /* Number of element */
93
typedef struct st_huff_tree {
94
HUFF_ELEMENT *root,*element_buffer;
98
my_off_t bytes_packed;
99
uint tree_pack_length;
100
uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
106
typedef struct st_isam_mrg {
107
MI_INFO **file,**current,**end;
110
uint min_pack_length; /* Theese is used by packed data */
111
uint max_pack_length;
113
uint max_blob_length;
115
/* true if at least one source file has at least one disabled index */
116
my_bool src_file_has_indexes_disabled;
120
extern int main(int argc,char * *argv);
121
static void get_options(int *argc,char ***argv);
122
static MI_INFO *open_isam_file(char *name,int mode);
123
static my_bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
124
static int compress(PACK_MRG_INFO *file,char *join_name);
125
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records);
126
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
128
HUFF_COUNTS *huff_counts,
130
static int compare_tree(void* cmp_arg __attribute__((unused)),
131
const uchar *s,const uchar *t);
132
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
133
static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
135
static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
136
uint max_space_length,my_off_t *space_counts,
137
my_off_t tot_space_count,
138
enum en_fieldtype field_type);
139
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
140
static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
141
static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
142
static int save_counts_in_queue(uchar *key,element_count count,
144
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
145
static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
146
static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
147
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
148
HUFF_ELEMENT *element,uint size,
150
static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
151
my_off_t tot_elements,my_off_t filelength);
152
static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
153
static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
154
static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
155
HUFF_ELEMENT *element,
157
static uint max_bit(uint value);
158
static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
159
static char *make_new_name(char *new_name,char *old_name);
160
static char *make_old_name(char *new_name,char *old_name);
161
static void init_file_buffer(File file,pbool read_buffer);
162
static int flush_buffer(ulong neaded_length);
163
static void end_file_buffer(void);
164
static void write_bits(ulonglong value, uint bits);
165
static void flush_bits(void);
166
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
168
static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
170
static int mrg_close(PACK_MRG_INFO *mrg);
171
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
172
static void mrg_reset(PACK_MRG_INFO *mrg);
173
#if !defined(DBUG_OFF)
174
static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
175
static int fakecmp(my_off_t **count1, my_off_t **count2);
179
static int error_on_write=0,test_only=0,verbose=0,silent=0,
180
write_loop=0,force_pack=0, isamchk_neaded=0;
181
static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
182
static my_bool backup, opt_wait;
184
tree_buff_length is somewhat arbitrary. The bigger it is the better
185
the chance to win in terms of compression factor. On the other hand,
186
this table becomes part of the compressed file header. And its length
187
is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
189
static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
190
static char tmp_dir[FN_REFLEN]={0},*join_table;
191
static my_off_t intervall_length;
192
static ha_checksum glob_crc;
193
static struct st_file_buffer file_buffer;
195
static HUFF_COUNTS *global_count;
196
static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
197
static const char *load_default_groups[]= { "myisampack",0 };
199
/* The main program */
201
int main(int argc, char **argv)
208
load_defaults("my",load_default_groups,&argc,&argv);
210
get_options(&argc,&argv);
212
error=ok=isamchk_neaded=0;
214
{ /* Join files into one */
215
if (open_isam_files(&merge,argv,(uint) argc) ||
216
compress(&merge,join_table))
222
if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
226
merge.file= &isam_file;
230
if (compress(&merge,0))
236
if (ok && isamchk_neaded && !silent)
237
puts("Remember to run myisamchk -rq on compressed tables");
238
VOID(fflush(stdout));
239
VOID(fflush(stderr));
240
free_defaults(default_argv);
241
my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
244
return 0; /* No compiler warning */
248
enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
250
static struct my_option my_long_options[] =
253
{"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
254
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
256
{"backup", 'b', "Make a backup of the table as table_name.OLD.",
257
(char**) &backup, (char**) &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
258
{"character-sets-dir", OPT_CHARSETS_DIR_MP,
259
"Directory where character sets are.", (char**) &charsets_dir,
260
(char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
261
{"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
262
0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
264
"Force packing of table even if it gets bigger or if tempfile exists.",
265
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
267
"Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
268
(char**) &join_table, (char**) &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
270
{"help", '?', "Display this help and exit.",
271
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
272
{"silent", 's', "Be more silent.",
273
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
274
{"tmpdir", 'T', "Use temporary directory to store temporary table.",
275
0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
276
{"test", 't', "Don't pack table, only test packing it.",
277
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
278
{"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
279
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
280
{"version", 'V', "Output version information and exit.",
281
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
282
{"wait", 'w', "Wait and retry if table is in use.", (char**) &opt_wait,
283
(char**) &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
284
{ 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
287
#include <help_start.h>
289
static void print_version(void)
291
VOID(printf("%s Ver 1.23 for %s on %s\n",
292
my_progname, SYSTEM_TYPE, MACHINE_TYPE));
293
NETWARE_SET_SCREEN_MODE(1);
297
static void usage(void)
300
puts("Copyright (C) 2002 MySQL AB");
301
puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
302
puts("and you are welcome to modify and redistribute it under the GPL license\n");
304
puts("Pack a MyISAM-table to take much less space.");
305
puts("Keys are not updated, you must run myisamchk -rq on the datafile");
306
puts("afterwards to update the keys.");
307
puts("You should give the .MYI file as the filename argument.");
309
VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
310
my_print_help(my_long_options);
311
print_defaults("my", load_default_groups);
312
my_print_variables(my_long_options);
315
#include <help_end.h>
318
get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
326
setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
331
tmpfile_createflag= O_RDWR | O_TRUNC;
334
write_loop= verbose= 0;
339
/* Avoid to reset 'verbose' if it was already set > 1. */
344
length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
345
if (length != dirname_length(tmp_dir))
347
tmp_dir[length]=FN_LIBCHAR;
352
verbose++; /* Allow for selecting the level of verbosity. */
356
DBUG_PUSH(argument ? argument : "d:t:o");
370
/* Initiates DEBUG - but no debugging here ! */
372
static void get_options(int *argc,char ***argv)
376
my_progname= argv[0][0];
377
if (isatty(fileno(stdout)))
380
if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
390
backup=0; /* Not needed */
397
static MI_INFO *open_isam_file(char *name,int mode)
401
DBUG_ENTER("open_isam_file");
403
if (!(isam_file=mi_open(name,mode,
404
(opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
405
HA_OPEN_ABORT_IF_LOCKED))))
407
VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
411
if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
415
VOID(fprintf(stderr, "%s is already compressed\n", name));
416
VOID(mi_close(isam_file));
420
puts("Recompressing already compressed table");
421
share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
423
if (! force_pack && share->state.state.records != 0 &&
424
(share->state.state.records <= 1 ||
425
share->state.state.data_file_length < 1024))
427
VOID(fprintf(stderr, "%s is too small to compress\n", name));
428
VOID(mi_close(isam_file));
431
VOID(mi_lock_database(isam_file,F_WRLCK));
432
DBUG_RETURN(isam_file);
436
static my_bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
441
mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
443
mrg->src_file_has_indexes_disabled= 0;
444
for (i=0; i < count ; i++)
446
if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
449
mrg->src_file_has_indexes_disabled|=
450
! mi_is_all_keys_active(mrg->file[i]->s->state.key_map,
451
mrg->file[i]->s->base.keys);
453
/* Check that files are identical */
454
for (j=0 ; j < count-1 ; j++)
456
MI_COLUMNDEF *m1,*m2,*end;
457
if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
458
mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
460
m1=mrg->file[j]->s->rec;
461
end=m1+mrg->file[j]->s->base.fields;
462
m2=mrg->file[j+1]->s->rec;
463
for ( ; m1 != end ; m1++,m2++)
465
if (m1->type != m2->type || m1->length != m2->length)
473
VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
474
my_progname, names[j], names[j+1]));
477
mi_close(mrg->file[i]);
478
my_free((uchar*) mrg->file,MYF(0));
483
static int compress(PACK_MRG_INFO *mrg,char *result_table)
486
File new_file,join_isam_file;
489
char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
490
uint i,header_length,fields,trees,used_trees;
491
my_off_t old_length,new_length,tot_elements;
492
HUFF_COUNTS *huff_counts;
493
HUFF_TREE *huff_trees;
494
DBUG_ENTER("compress");
496
isam_file=mrg->file[0]; /* Take this as an example */
498
new_file=join_isam_file= -1;
503
/* Create temporary or join file */
506
VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2));
508
VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2+4+16));
509
if (!test_only && result_table)
511
/* Make a new indexfile based on first file in list */
514
strmov(org_name,result_table); /* Fix error messages */
515
VOID(fn_format(new_name,result_table,"",MI_NAME_IEXT,2));
516
if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
519
length=(uint) share->base.keystart;
520
if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
522
if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
523
my_write(join_isam_file,buff,length,
524
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
526
my_free(buff,MYF(0));
529
my_free(buff,MYF(0));
530
VOID(fn_format(new_name,result_table,"",MI_NAME_DEXT,2));
532
else if (!tmp_dir[0])
533
VOID(make_new_name(new_name,org_name));
535
VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
537
(new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
540
/* Start calculating statistics */
543
for (i=0 ; i < mrg->count ; i++)
544
mrg->records+=mrg->file[i]->s->state.state.records;
546
DBUG_PRINT("info", ("Compressing %s: (%lu records)",
547
result_table ? new_name : org_name,
548
(ulong) mrg->records));
549
if (write_loop || verbose)
551
VOID(printf("Compressing %s: (%lu records)\n",
552
result_table ? new_name : org_name, (ulong) mrg->records));
554
trees=fields=share->base.fields;
555
huff_counts=init_huff_count(isam_file,mrg->records);
559
Read the whole data file(s) for statistics.
561
DBUG_PRINT("info", ("- Calculating statistics"));
562
if (write_loop || verbose)
563
VOID(printf("- Calculating statistics\n"));
564
if (get_statistic(mrg,huff_counts))
568
for (i=0; i < mrg->count ; i++)
569
old_length+= (mrg->file[i]->s->state.state.data_file_length -
570
mrg->file[i]->s->state.state.empty);
573
Create a global priority queue in preparation for making
574
temporary Huffman trees.
576
if (init_queue(&queue,256,0,0,compare_huff_elements,0))
580
Check each column if we should use pre-space-compress, end-space-
581
compress, empty-field-compress or zero-field-compress.
583
check_counts(huff_counts,fields,mrg->records);
586
Build a Huffman tree for each column.
588
huff_trees=make_huff_trees(huff_counts,trees);
591
If the packed lengths of combined columns is less then the sum of
592
the non-combined columns, then create common Huffman trees for them.
593
We do this only for byte compressed columns, not for distinct values
596
if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
600
Assign codes to all byte or column values.
602
if (make_huff_decode_table(huff_trees,fields))
605
/* Prepare a file buffer. */
606
init_file_buffer(new_file,0);
609
Reserve space in the target file for the fixed compressed file header.
611
file_buffer.pos_in_file=HEAD_LENGTH;
613
VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
616
Write field infos: field type, pack type, length bits, tree number.
618
write_field_info(huff_counts,fields,used_trees);
623
if (!(tot_elements=write_huff_tree(huff_trees,trees)))
627
Calculate the total length of the compression info header.
628
This includes the fixed compressed file header, the column compression
629
type descriptions, and the decode trees.
631
header_length=(uint) file_buffer.pos_in_file+
632
(uint) (file_buffer.pos-file_buffer.buffer);
635
Compress the source file into the target file.
637
DBUG_PRINT("info", ("- Compressing file"));
638
if (write_loop || verbose)
639
VOID(printf("- Compressing file\n"));
640
error=compress_isam_file(mrg,huff_counts);
641
new_length=file_buffer.pos_in_file;
642
if (!error && !test_only)
644
uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
645
bzero(buff,sizeof(buff));
646
error=my_write(file_buffer.file,buff,sizeof(buff),
647
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
651
Write the fixed compressed file header.
654
error=write_header(mrg,header_length,used_trees,tot_elements,
657
/* Flush the file buffer. */
660
/* Display statistics. */
661
DBUG_PRINT("info", ("Min record length: %6d Max length: %6d "
662
"Mean total length: %6ld\n",
663
mrg->min_pack_length, mrg->max_pack_length,
664
(ulong) (mrg->records ? (new_length/mrg->records) : 0)));
665
if (verbose && mrg->records)
666
VOID(printf("Min record length: %6d Max length: %6d "
667
"Mean total length: %6ld\n", mrg->min_pack_length,
668
mrg->max_pack_length, (ulong) (new_length/mrg->records)));
670
/* Close source and target file. */
673
error|=my_close(new_file,MYF(MY_WME));
676
error|=my_close(isam_file->dfile,MYF(MY_WME));
677
isam_file->dfile= -1; /* Tell mi_close file is closed */
682
free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
683
if (! test_only && ! error)
687
error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
693
if (my_rename(org_name,make_old_name(temp_name,isam_file->filename),
699
error=my_copy(new_name,org_name,MYF(MY_WME));
701
error=my_rename(new_name,org_name,MYF(MY_WME));
704
VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
706
VOID(my_delete(new_name,MYF(MY_WME)));
714
error=my_copy(new_name,org_name,
715
MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
717
VOID(my_delete(new_name,MYF(MY_WME)));
720
error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
723
error=save_state(isam_file,mrg,new_length,glob_crc);
726
error|=mrg_close(mrg);
727
if (join_isam_file >= 0)
728
error|=my_close(join_isam_file,MYF(MY_WME));
731
VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
732
VOID(my_delete(new_name,MYF(MY_WME)));
735
if (write_loop || verbose)
738
VOID(printf("%.4g%% \n",
739
(((longlong) (old_length - new_length)) * 100.0 /
740
(longlong) old_length)));
742
puts("Empty file saved in compressed format");
747
free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
749
VOID(my_close(new_file,MYF(0)));
750
if (join_isam_file >= 0)
751
VOID(my_close(join_isam_file,MYF(0)));
753
VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
757
/* Init a huff_count-struct for each field and init it */
759
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
762
register HUFF_COUNTS *count;
763
if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
765
MYF(MY_ZEROFILL | MY_WME))))
767
for (i=0 ; i < info->s->base.fields ; i++)
769
enum en_fieldtype type;
770
count[i].field_length=info->s->rec[i].length;
771
type= count[i].field_type= (enum en_fieldtype) info->s->rec[i].type;
772
if (type == FIELD_INTERVALL ||
773
type == FIELD_CONSTANT ||
776
if (count[i].field_length <= 8 &&
777
(type == FIELD_NORMAL ||
778
type == FIELD_SKIP_ZERO))
779
count[i].max_zero_fill= count[i].field_length;
781
For every column initialize a tree, which is used to detect distinct
782
column values. 'int_tree' works together with 'tree_buff' and
783
'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
784
This is accomplished by '-1' as the element size.
786
init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
788
if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
789
count[i].tree_pos=count[i].tree_buff =
790
my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
798
/* Free memory used by counts and trees */
800
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
801
HUFF_COUNTS *huff_counts,
808
for (i=0 ; i < trees ; i++)
810
if (huff_trees[i].element_buffer)
811
my_free((uchar*) huff_trees[i].element_buffer,MYF(0));
812
if (huff_trees[i].code)
813
my_free((uchar*) huff_trees[i].code,MYF(0));
815
my_free((uchar*) huff_trees,MYF(0));
819
for (i=0 ; i < fields ; i++)
821
if (huff_counts[i].tree_buff)
823
my_free((uchar*) huff_counts[i].tree_buff,MYF(0));
824
delete_tree(&huff_counts[i].int_tree);
827
my_free((uchar*) huff_counts,MYF(0));
829
delete_queue(&queue); /* This is safe to free */
833
/* Read through old file and gather some statistics */
835
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
839
ulong reclength,max_blob_length;
840
uchar *record,*pos,*next_pos,*end_pos,*start_pos;
841
ha_rows record_count;
842
my_bool static_row_size;
843
HUFF_COUNTS *count,*end_count;
844
TREE_ELEMENT *element;
845
DBUG_ENTER("get_statistic");
847
reclength=mrg->file[0]->s->base.reclength;
848
record=(uchar*) my_alloca(reclength);
849
end_count=huff_counts+mrg->file[0]->s->base.fields;
850
record_count=0; glob_crc=0;
853
/* Check how to calculate checksum */
855
for (count=huff_counts ; count < end_count ; count++)
857
if (count->field_type == FIELD_BLOB ||
858
count->field_type == FIELD_VARCHAR)
866
while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
868
ulong tot_blob_length=0;
871
/* glob_crc is a checksum over all bytes of all records. */
873
glob_crc+=mi_static_checksum(mrg->file[0],record);
875
glob_crc+=mi_checksum(mrg->file[0],record);
877
/* Count the incidence of values separately for every column. */
878
for (pos=record,count=huff_counts ;
883
next_pos=end_pos=(start_pos=pos)+count->field_length;
886
Put the whole column value in a tree if there is room for it.
887
'int_tree' is used to quickly check for duplicate values.
888
'tree_buff' collects as many distinct column values as
889
possible. If the field length is > 1, it is tree_buff_length,
890
else 2 bytes. Each value is 'field_length' bytes big. If there
891
are more distinct column values than fit into the buffer, we
892
give up with this tree. BLOBs and VARCHARs do not have a
893
tree_buff as it can only be used with fixed length columns.
894
For the special case of field length == 1, we handle only the
895
case that there is only one distinct value in the table(s).
896
Otherwise, we can have a maximum of 256 distinct values. This
897
is then handled by the normal Huffman tree build.
899
Another limit for collecting distinct column values is the
900
number of values itself. Since we would need to build a
901
Huffman tree for the values, we are limited by the 'IS_OFFSET'
902
constant. This constant expresses a bit which is used to
903
determine if a tree element holds a final value or an offset
904
to a child element. Hence, all values and offsets need to be
905
smaller than 'IS_OFFSET'. A tree element is implemented with
906
two integer values, one for the left branch and one for the
907
right branch. For the extreme case that the first element
908
points to the last element, the number of integers in the tree
909
must be less or equal to IS_OFFSET. So the number of elements
910
must be less or equal to IS_OFFSET / 2.
912
WARNING: At first, we insert a pointer into the record buffer
913
as the key for the tree. If we got a new distinct value, which
914
is really inserted into the tree, instead of being counted
915
only, we will copy the column value from the record buffer to
916
'tree_buff' and adjust the key pointer of the tree accordingly.
918
if (count->tree_buff)
921
if (!(element=tree_insert(&count->int_tree,pos, 0,
922
count->int_tree.custom_arg)) ||
923
(element->count == 1 &&
924
(count->tree_buff + tree_buff_length <
925
count->tree_pos + count->field_length)) ||
926
(count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
927
(count->field_length == 1 &&
928
count->int_tree.elements_in_tree > 1))
930
delete_tree(&count->int_tree);
931
my_free(count->tree_buff,MYF(0));
937
If tree_insert() succeeds, it either creates a new element
938
or increments the counter of an existing element.
940
if (element->count == 1)
942
/* Copy the new column value into 'tree_buff'. */
943
memcpy(count->tree_pos,pos,(size_t) count->field_length);
944
/* Adjust the key pointer in the tree. */
945
tree_set_pointer(element,count->tree_pos);
946
/* Point behind the last column value so far. */
947
count->tree_pos+=count->field_length;
952
/* Save character counters and space-counts and zero-field-counts */
953
if (count->field_type == FIELD_NORMAL ||
954
count->field_type == FIELD_SKIP_ENDSPACE)
956
/* Ignore trailing space. */
957
for ( ; end_pos > pos ; end_pos--)
958
if (end_pos[-1] != ' ')
960
/* Empty fields are just counted. Go to the next record. */
963
count->empty_fields++;
964
count->max_zero_fill=0;
968
Count the total of all trailing spaces and the number of
969
short trailing spaces. Remember the longest trailing space.
971
length= (uint) (next_pos-end_pos);
972
count->tot_end_space+=length;
974
count->end_space[length]++;
975
if (count->max_end_space < length)
976
count->max_end_space = length;
979
if (count->field_type == FIELD_NORMAL ||
980
count->field_type == FIELD_SKIP_PRESPACE)
982
/* Ignore leading space. */
983
for (pos=start_pos; pos < end_pos ; pos++)
986
/* Empty fields are just counted. Go to the next record. */
989
count->empty_fields++;
990
count->max_zero_fill=0;
994
Count the total of all leading spaces and the number of
995
short leading spaces. Remember the longest leading space.
997
length= (uint) (pos-start_pos);
998
count->tot_pre_space+=length;
1000
count->pre_space[length]++;
1001
if (count->max_pre_space < length)
1002
count->max_pre_space = length;
1005
/* Calculate pos, end_pos, and max_length for variable length fields. */
1006
if (count->field_type == FIELD_BLOB)
1008
uint field_length=count->field_length -portable_sizeof_char_ptr;
1009
ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
1010
memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*));
1011
end_pos=pos+blob_length;
1012
tot_blob_length+=blob_length;
1013
set_if_bigger(count->max_length,blob_length);
1015
else if (count->field_type == FIELD_VARCHAR)
1017
uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1018
length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1019
uint2korr(start_pos));
1020
pos= start_pos+pack_length;
1021
end_pos= pos+length;
1022
set_if_bigger(count->max_length,length);
1025
/* Evaluate 'max_zero_fill' for short fields. */
1026
if (count->field_length <= 8 &&
1027
(count->field_type == FIELD_NORMAL ||
1028
count->field_type == FIELD_SKIP_ZERO))
1031
/* Zero fields are just counted. Go to the next record. */
1032
if (!memcmp((uchar*) start_pos,zero_string,count->field_length))
1034
count->zero_fields++;
1038
max_zero_fill starts with field_length. It is decreased every
1039
time a shorter "zero trailer" is found. It is set to zero when
1040
an empty field is found (see above). This suggests that the
1041
variable should be called 'min_zero_fill'.
1043
for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1045
if (i < count->max_zero_fill)
1046
count->max_zero_fill=i;
1049
/* Ignore zero fields and check fields. */
1050
if (count->field_type == FIELD_ZERO ||
1051
count->field_type == FIELD_CHECK)
1055
Count the incidence of every byte value in the
1056
significant field value.
1058
for ( ; pos < end_pos ; pos++)
1059
count->counts[(uchar) *pos]++;
1061
/* Step to next field. */
1064
if (tot_blob_length > max_blob_length)
1065
max_blob_length=tot_blob_length;
1067
if (write_loop && record_count % WRITE_COUNT == 0)
1069
VOID(printf("%lu\r", (ulong) record_count));
1070
VOID(fflush(stdout));
1073
else if (error != HA_ERR_RECORD_DELETED)
1075
VOID(fprintf(stderr, "Got error %d while reading rows", error));
1079
/* Step to next record. */
1083
VOID(printf(" \r"));
1084
VOID(fflush(stdout));
1088
If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1091
DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1093
DBUG_PRINT("info", ("Found the following number of incidents "
1094
"of the byte codes:"));
1096
VOID(printf("Found the following number of incidents "
1097
"of the byte codes:\n"));
1098
for (count= huff_counts ; count < end_count; count++)
1101
my_off_t total_count;
1104
DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1106
VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
1107
if (count->tree_buff)
1109
DBUG_PRINT("info", ("number of distinct values: %u",
1110
(uint) ((count->tree_pos - count->tree_buff) /
1111
count->field_length)));
1113
VOID(printf("number of distinct values: %u\n",
1114
(uint) ((count->tree_pos - count->tree_buff) /
1115
count->field_length)));
1118
for (idx= 0; idx < 256; idx++)
1120
if (count->counts[idx])
1122
total_count+= count->counts[idx];
1123
DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1124
llstr((longlong) count->counts[idx], llbuf)));
1126
VOID(printf("counts[0x%02x]: %12s\n", idx,
1127
llstr((longlong) count->counts[idx], llbuf)));
1130
DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count,
1132
if ((verbose >= 2) && total_count)
1134
VOID(printf("total: %12s\n",
1135
llstr((longlong) total_count, llbuf)));
1139
mrg->records=record_count;
1140
mrg->max_blob_length=max_blob_length;
1141
my_afree((uchar*) record);
1142
DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1145
static int compare_huff_elements(void *not_used __attribute__((unused)),
1148
return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1149
(*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1);
1152
/* Check each tree if we should use pre-space-compress, end-space-
1153
compress, empty-field-compress or zero-field-compress */
1155
static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1158
uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1159
my_off_t old_length,new_length,length;
1160
DBUG_ENTER("check_counts");
1162
bzero((uchar*) field_count,sizeof(field_count));
1163
space_fields=fill_zero_fields=0;
1165
for (; trees-- ; huff_counts++)
1167
if (huff_counts->field_type == FIELD_BLOB)
1169
huff_counts->length_bits=max_bit(huff_counts->max_length);
1172
else if (huff_counts->field_type == FIELD_VARCHAR)
1174
huff_counts->length_bits=max_bit(huff_counts->max_length);
1177
else if (huff_counts->field_type == FIELD_CHECK)
1179
huff_counts->bytes_packed=0;
1180
huff_counts->counts[0]=0;
1184
huff_counts->field_type=FIELD_NORMAL;
1185
huff_counts->pack_type=0;
1187
/* Check for zero-filled records (in this column), or zero records. */
1188
if (huff_counts->zero_fields || ! records)
1190
my_off_t old_space_count;
1192
If there are only zero filled records (in this column),
1193
or no records at all, we are done.
1195
if (huff_counts->zero_fields == records)
1197
huff_counts->field_type= FIELD_ZERO;
1198
huff_counts->bytes_packed=0;
1199
huff_counts->counts[0]=0;
1202
/* Remeber the number of significant spaces. */
1203
old_space_count=huff_counts->counts[' '];
1204
/* Add all leading and trailing spaces. */
1205
huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1206
huff_counts->tot_pre_space +
1207
huff_counts->empty_fields *
1208
huff_counts->field_length);
1209
/* Check, what the compressed length of this would be. */
1210
old_length=calc_packed_length(huff_counts,0)+records/8;
1211
/* Get the number of zero bytes. */
1212
length=huff_counts->zero_fields*huff_counts->field_length;
1213
/* Add it to the counts. */
1214
huff_counts->counts[0]+=length;
1215
/* Check, what the compressed length of this would be. */
1216
new_length=calc_packed_length(huff_counts,0);
1217
/* If the compression without the zeroes would be shorter, we are done. */
1218
if (old_length < new_length && huff_counts->field_length > 1)
1220
huff_counts->field_type=FIELD_SKIP_ZERO;
1221
huff_counts->counts[0]-=length;
1222
huff_counts->bytes_packed=old_length- records/8;
1225
/* Remove the insignificant spaces, but keep the zeroes. */
1226
huff_counts->counts[' ']=old_space_count;
1228
/* Check, what the compressed length of this column would be. */
1229
huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1232
If there are enough empty records (in this column),
1233
treating them specially may pay off.
1235
if (huff_counts->empty_fields)
1237
if (huff_counts->field_length > 2 &&
1238
huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1239
(1+max_bit(max(huff_counts->max_pre_space,
1240
huff_counts->max_end_space))) <
1241
records * max_bit(huff_counts->field_length))
1243
huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1247
length=huff_counts->empty_fields*huff_counts->field_length;
1248
if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1250
huff_counts->tot_end_space+=length;
1251
huff_counts->max_end_space=huff_counts->field_length;
1252
if (huff_counts->field_length < 8)
1253
huff_counts->end_space[huff_counts->field_length]+=
1254
huff_counts->empty_fields;
1256
if (huff_counts->tot_pre_space)
1258
huff_counts->tot_pre_space+=length;
1259
huff_counts->max_pre_space=huff_counts->field_length;
1260
if (huff_counts->field_length < 8)
1261
huff_counts->pre_space[huff_counts->field_length]+=
1262
huff_counts->empty_fields;
1268
If there are enough trailing spaces (in this column),
1269
treating them specially may pay off.
1271
if (huff_counts->tot_end_space)
1273
huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1274
if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1275
huff_counts->end_space,
1276
huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1278
huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1282
If there are enough leading spaces (in this column),
1283
treating them specially may pay off.
1285
if (huff_counts->tot_pre_space)
1287
if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1288
huff_counts->pre_space,
1289
huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1293
found_pack: /* Found field-packing */
1295
/* Test if we can use zero-fill */
1297
if (huff_counts->max_zero_fill &&
1298
(huff_counts->field_type == FIELD_NORMAL ||
1299
huff_counts->field_type == FIELD_SKIP_ZERO))
1301
huff_counts->counts[0]-=huff_counts->max_zero_fill*
1302
(huff_counts->field_type == FIELD_SKIP_ZERO ?
1303
records - huff_counts->zero_fields : records);
1304
huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1305
huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1308
/* Test if intervall-field is better */
1310
if (huff_counts->tree_buff)
1314
DBUG_EXECUTE_IF("forceintervall",
1315
huff_counts->bytes_packed= ~ (my_off_t) 0;);
1316
tree.element_buffer=0;
1317
if (!make_huff_tree(&tree,huff_counts) &&
1318
tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1320
if (tree.elements == 1)
1321
huff_counts->field_type=FIELD_CONSTANT;
1323
huff_counts->field_type=FIELD_INTERVALL;
1324
huff_counts->pack_type=0;
1328
my_free((uchar*) huff_counts->tree_buff,MYF(0));
1329
delete_tree(&huff_counts->int_tree);
1330
huff_counts->tree_buff=0;
1332
if (tree.element_buffer)
1333
my_free((uchar*) tree.element_buffer,MYF(0));
1335
if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1337
if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1339
field_count[huff_counts->field_type]++;
1341
DBUG_PRINT("info", ("normal: %3d empty-space: %3d "
1342
"empty-zero: %3d empty-fill: %3d",
1343
field_count[FIELD_NORMAL],space_fields,
1344
field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1345
DBUG_PRINT("info", ("pre-space: %3d end-space: %3d "
1346
"intervall-fields: %3d zero: %3d",
1347
field_count[FIELD_SKIP_PRESPACE],
1348
field_count[FIELD_SKIP_ENDSPACE],
1349
field_count[FIELD_INTERVALL],
1350
field_count[FIELD_ZERO]));
1352
VOID(printf("\nnormal: %3d empty-space: %3d "
1353
"empty-zero: %3d empty-fill: %3d\n"
1354
"pre-space: %3d end-space: %3d "
1355
"intervall-fields: %3d zero: %3d\n",
1356
field_count[FIELD_NORMAL],space_fields,
1357
field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1358
field_count[FIELD_SKIP_PRESPACE],
1359
field_count[FIELD_SKIP_ENDSPACE],
1360
field_count[FIELD_INTERVALL],
1361
field_count[FIELD_ZERO]));
1365
/* Test if we can use space-compression and empty-field-compression */
1368
test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1369
uint max_space_length, my_off_t *space_counts,
1370
my_off_t tot_space_count, enum en_fieldtype field_type)
1374
my_off_t space_count,min_space_count,min_pack,new_length,skip;
1376
length_bits=max_bit(max_space_length);
1378
/* Default no end_space-packing */
1379
space_count=huff_counts->counts[(uint) ' '];
1380
min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1381
min_pack=calc_packed_length(huff_counts,0);
1383
huff_counts->counts[(uint) ' ']=space_count;
1385
/* Test with allways space-count */
1386
new_length=huff_counts->bytes_packed+length_bits*records/8;
1387
if (new_length+1 < min_pack)
1390
min_pack=new_length;
1391
min_space_count=space_count;
1393
/* Test with length-flag */
1394
for (skip=0L, i=0 ; i < 8 ; i++)
1396
if (space_counts[i])
1399
huff_counts->counts[(uint) ' ']+=space_counts[i];
1400
skip+=huff_counts->pre_space[i];
1401
new_length=calc_packed_length(huff_counts,0)+
1402
(records+(records-skip)*(1+length_bits))/8;
1403
if (new_length < min_pack)
1406
min_pack=new_length;
1407
min_space_count=huff_counts->counts[(uint) ' '];
1412
huff_counts->counts[(uint) ' ']=min_space_count;
1413
huff_counts->bytes_packed=min_pack;
1416
return(0); /* No space-compress */
1417
case -1: /* Always space-count */
1418
huff_counts->field_type=field_type;
1419
huff_counts->min_space=0;
1420
huff_counts->length_bits=max_bit(max_space_length);
1423
huff_counts->field_type=field_type;
1424
huff_counts->min_space=(uint) min_pos;
1425
huff_counts->pack_type|=PACK_TYPE_SELECTED;
1426
huff_counts->length_bits=max_bit(max_space_length);
1429
return(1); /* Using space-compress */
1433
/* Make a huff_tree of each huff_count */
1435
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1438
HUFF_TREE *huff_tree;
1439
DBUG_ENTER("make_huff_trees");
1441
if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1442
MYF(MY_WME | MY_ZEROFILL))))
1445
for (tree=0 ; tree < trees ; tree++)
1447
if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1450
my_free((uchar*) huff_tree[tree].element_buffer,MYF(0));
1451
my_free((uchar*) huff_tree,MYF(0));
1455
DBUG_RETURN(huff_tree);
1459
Build a Huffman tree.
1463
huff_tree The Huffman tree.
1464
huff_counts The counts.
1467
Build a Huffman tree according to huff_counts->counts or
1468
huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1469
tree_buff_length of distinct column values. In that case, whole
1470
values can be Huffman encoded instead of single bytes.
1477
static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1479
uint i,found,bits_packed,first,last;
1480
my_off_t bytes_packed;
1481
HUFF_ELEMENT *a,*b,*new_huff_el;
1484
if (huff_counts->tree_buff)
1486
/* Calculate the number of distinct values in tree_buff. */
1487
found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1488
huff_counts->field_length;
1489
first=0; last=found-1;
1493
/* Count the number of byte codes found in the column. */
1494
for (i=found=0 ; i < 256 ; i++)
1496
if (huff_counts->counts[i])
1507
/* When using 'tree_buff' we can have more that 256 values. */
1508
if (queue.max_elements < found)
1510
delete_queue(&queue);
1511
if (init_queue(&queue,found,0,0,compare_huff_elements,0))
1515
/* Allocate or reallocate an element buffer for the Huffman tree. */
1516
if (!huff_tree->element_buffer)
1518
if (!(huff_tree->element_buffer=
1519
(HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1526
(HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1527
found*2*sizeof(HUFF_ELEMENT),
1530
huff_tree->element_buffer=temp;
1533
huff_counts->tree=huff_tree;
1534
huff_tree->counts=huff_counts;
1535
huff_tree->min_chr=first;
1536
huff_tree->max_chr=last;
1537
huff_tree->char_bits=max_bit(last-first);
1538
huff_tree->offset_bits=max_bit(found-1)+1;
1540
if (huff_counts->tree_buff)
1542
huff_tree->elements=0;
1543
huff_tree->tree_pack_length=(1+15+16+5+5+
1544
(huff_tree->char_bits+1)*found+
1545
(huff_tree->offset_bits+1)*
1547
(uint) (huff_tree->counts->tree_pos-
1548
huff_tree->counts->tree_buff);
1550
Put a HUFF_ELEMENT into the queue for every distinct column value.
1552
tree_walk() calls save_counts_in_queue() for every element in
1553
'int_tree'. This takes elements from the target trees element
1554
buffer and places references to them into the buffer of the
1555
priority queue. We insert in column value order, but the order is
1556
in fact irrelevant here. We will establish the correct order
1559
tree_walk(&huff_counts->int_tree,
1560
(int (*)(void*, element_count,void*)) save_counts_in_queue,
1561
(uchar*) huff_tree, left_root_right);
1565
huff_tree->elements=found;
1566
huff_tree->tree_pack_length=(9+9+5+5+
1567
(huff_tree->char_bits+1)*found+
1568
(huff_tree->offset_bits+1)*
1571
Put a HUFF_ELEMENT into the queue for every byte code found in the column.
1573
The elements are taken from the target trees element buffer.
1574
Instead of using queue_insert(), we just place references to the
1575
elements into the buffer of the priority queue. We insert in byte
1576
value order, but the order is in fact irrelevant here. We will
1577
establish the correct order later.
1579
for (i=first, found=0 ; i <= last ; i++)
1581
if (huff_counts->counts[i])
1583
new_huff_el=huff_tree->element_buffer+(found++);
1584
new_huff_el->count=huff_counts->counts[i];
1585
new_huff_el->a.leaf.null=0;
1586
new_huff_el->a.leaf.element_nr=i;
1587
queue.root[found]=(uchar*) new_huff_el;
1591
If there is only a single byte value in this field in all records,
1592
add a second element with zero incidence. This is required to enter
1593
the loop, which builds the Huffman tree.
1597
new_huff_el=huff_tree->element_buffer+(found++);
1598
new_huff_el->count=0;
1599
new_huff_el->a.leaf.null=0;
1601
new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1603
new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1604
queue.root[found]=(uchar*) new_huff_el;
1608
/* Make a queue from the queue buffer. */
1609
queue.elements=found;
1612
Make a priority queue from the queue. Construct its index so that we
1613
have a partially ordered tree.
1615
for (i=found/2 ; i > 0 ; i--)
1616
_downheap(&queue,i);
1618
/* The Huffman algorithm. */
1619
bytes_packed=0; bits_packed=0;
1620
for (i=1 ; i < found ; i++)
1623
Pop the top element from the queue (the one with the least incidence).
1624
Popping from a priority queue includes a re-ordering of the queue,
1625
to get the next least incidence element to the top.
1627
a=(HUFF_ELEMENT*) queue_remove(&queue,0);
1629
Copy the next least incidence element. The queue implementation
1630
reserves root[0] for temporary purposes. root[1] is the top.
1632
b=(HUFF_ELEMENT*) queue.root[1];
1633
/* Get a new element from the element buffer. */
1634
new_huff_el=huff_tree->element_buffer+found+i;
1635
/* The new element gets the sum of the two least incidence elements. */
1636
new_huff_el->count=a->count+b->count;
1638
The Huffman algorithm assigns another bit to the code for a byte
1639
every time that bytes incidence is combined (directly or indirectly)
1640
to a new element as one of the two least incidence elements.
1641
This means that one more bit per incidence of that byte is required
1642
in the resulting file. So we add the new combined incidence as the
1643
number of bits by which the result grows.
1645
bits_packed+=(uint) (new_huff_el->count & 7);
1646
bytes_packed+=new_huff_el->count/8;
1647
/* The new element points to its children, lesser in left. */
1648
new_huff_el->a.nod.left=a;
1649
new_huff_el->a.nod.right=b;
1651
Replace the copied top element by the new element and re-order the
1654
queue.root[1]=(uchar*) new_huff_el;
1655
queue_replaced(&queue);
1657
huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1658
huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1662
static int compare_tree(void* cmp_arg __attribute__((unused)),
1663
register const uchar *s, register const uchar *t)
1666
for (length=global_count->field_length; length-- ;)
1668
return (int) s[-1] - (int) t[-1];
1673
Organize distinct column values and their incidences into a priority queue.
1676
save_counts_in_queue()
1677
key The column value.
1678
count The incidence of this value.
1679
tree The Huffman tree to be built later.
1682
We use the element buffer of the targeted tree. The distinct column
1683
values are organized in a priority queue first. The Huffman
1684
algorithm will later organize the elements into a Huffman tree. For
1685
the time being, we just place references to the elements into the
1686
queue buffer. The buffer will later be organized into a priority
1693
static int save_counts_in_queue(uchar *key, element_count count,
1696
HUFF_ELEMENT *new_huff_el;
1698
new_huff_el=tree->element_buffer+(tree->elements++);
1699
new_huff_el->count=count;
1700
new_huff_el->a.leaf.null=0;
1701
new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1702
tree->counts->field_length;
1703
queue.root[tree->elements]=(uchar*) new_huff_el;
1709
Calculate length of file if given counts should be used.
1712
calc_packed_length()
1713
huff_counts The counts for a column of the table(s).
1714
add_tree_lenght If the decode tree length should be added.
1717
We need to follow the Huffman algorithm until we know, how many bits
1718
are required for each byte code. But we do not need the resulting
1719
Huffman tree. Hence, we can leave out some steps which are essential
1720
in make_huff_tree().
1723
Number of bytes required to compress this table column.
1726
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1727
uint add_tree_lenght)
1729
uint i,found,bits_packed,first,last;
1730
my_off_t bytes_packed;
1731
HUFF_ELEMENT element_buffer[256];
1732
DBUG_ENTER("calc_packed_length");
1735
WARNING: We use a small hack for efficiency: Instead of placing
1736
references to HUFF_ELEMENTs into the queue, we just insert
1737
references to the counts of the byte codes which appeared in this
1738
table column. During the Huffman algorithm they are successively
1739
replaced by references to HUFF_ELEMENTs. This works, because
1740
HUFF_ELEMENTs have the incidence count at their beginning.
1741
Regardless, wether the queue array contains references to counts of
1742
type my_off_t or references to HUFF_ELEMENTs which have the count of
1743
type my_off_t at their beginning, it always points to a count of the
1746
Instead of using queue_insert(), we just copy the references into
1747
the buffer of the priority queue. We insert in byte value order, but
1748
the order is in fact irrelevant here. We will establish the correct
1752
for (i=found=0 ; i < 256 ; i++)
1754
if (huff_counts->counts[i])
1759
/* We start with root[1], which is the queues top element. */
1760
queue.root[found]=(uchar*) &huff_counts->counts[i];
1764
DBUG_RETURN(0); /* Empty tree */
1766
If there is only a single byte value in this field in all records,
1767
add a second element with zero incidence. This is required to enter
1768
the loop, which follows the Huffman algorithm.
1771
queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1773
/* Make a queue from the queue buffer. */
1774
queue.elements=found;
1776
bytes_packed=0; bits_packed=0;
1777
/* Add the length of the coding table, which would become part of the file. */
1778
if (add_tree_lenght)
1779
bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1780
(max_bit(found-1)+1+1)*(found-2) +7)/8;
1783
Make a priority queue from the queue. Construct its index so that we
1784
have a partially ordered tree.
1786
for (i=(found+1)/2 ; i > 0 ; i--)
1787
_downheap(&queue,i);
1789
/* The Huffman algorithm. */
1790
for (i=0 ; i < found-1 ; i++)
1794
HUFF_ELEMENT *new_huff_el;
1797
Pop the top element from the queue (the one with the least
1798
incidence). Popping from a priority queue includes a re-ordering
1799
of the queue, to get the next least incidence element to the top.
1801
a= (my_off_t*) queue_remove(&queue, 0);
1803
Copy the next least incidence element. The queue implementation
1804
reserves root[0] for temporary purposes. root[1] is the top.
1806
b= (my_off_t*) queue.root[1];
1807
/* Create a new element in a local (automatic) buffer. */
1808
new_huff_el= element_buffer + i;
1809
/* The new element gets the sum of the two least incidence elements. */
1810
new_huff_el->count= *a + *b;
1812
The Huffman algorithm assigns another bit to the code for a byte
1813
every time that bytes incidence is combined (directly or indirectly)
1814
to a new element as one of the two least incidence elements.
1815
This means that one more bit per incidence of that byte is required
1816
in the resulting file. So we add the new combined incidence as the
1817
number of bits by which the result grows.
1819
bits_packed+=(uint) (new_huff_el->count & 7);
1820
bytes_packed+=new_huff_el->count/8;
1822
Replace the copied top element by the new element and re-order the
1823
queue. This successively replaces the references to counts by
1824
references to HUFF_ELEMENTs.
1826
queue.root[1]=(uchar*) new_huff_el;
1827
queue_replaced(&queue);
1829
DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1833
/* Remove trees that don't give any compression */
1835
static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1838
HUFF_COUNTS count,*i,*j,*last_count;
1840
last_count=huff_counts+trees;
1841
for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1843
if (!i->tree->tree_number)
1845
i->tree->tree_number= ++tree_number;
1847
continue; /* Don't join intervall */
1848
for (j=i+1 ; j < last_count ; j++)
1850
if (! j->tree->tree_number && ! j->tree_buff)
1852
for (k=0 ; k < 256 ; k++)
1853
count.counts[k]=i->counts[k]+j->counts[k];
1854
if (calc_packed_length(&count,1) <=
1855
i->tree->bytes_packed + j->tree->bytes_packed+
1856
i->tree->tree_pack_length+j->tree->tree_pack_length+
1859
memcpy_fixed((uchar*) i->counts,(uchar*) count.counts,
1860
sizeof(count.counts[0])*256);
1861
my_free((uchar*) j->tree->element_buffer,MYF(0));
1862
j->tree->element_buffer=0;
1864
bmove((uchar*) i->counts,(uchar*) count.counts,
1865
sizeof(count.counts[0])*256);
1866
if (make_huff_tree(i->tree,i))
1873
DBUG_PRINT("info", ("Original trees: %d After join: %d",
1874
trees, tree_number));
1876
VOID(printf("Original trees: %d After join: %d\n", trees, tree_number));
1877
return tree_number; /* Return trees left */
1882
Fill in huff_tree encode tables.
1885
make_huff_decode_table()
1886
huff_tree An array of HUFF_TREE which are to be encoded.
1887
trees The number of HUFF_TREE in the array.
1894
static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1897
for ( ; trees-- ; huff_tree++)
1899
if (huff_tree->tree_number > 0)
1901
elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1902
if (!(huff_tree->code =
1903
(ulonglong*) my_malloc(elements*
1904
(sizeof(ulonglong) + sizeof(uchar)),
1905
MYF(MY_WME | MY_ZEROFILL))))
1907
huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1908
make_traverse_code_tree(huff_tree, huff_tree->root,
1909
8 * sizeof(ulonglong), 0LL);
1916
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1917
HUFF_ELEMENT *element,
1918
uint size, ulonglong code)
1921
if (!element->a.leaf.null)
1923
chr=element->a.leaf.element_nr;
1924
huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
1925
huff_tree->code[chr]= (code >> size);
1926
if (huff_tree->height < 8 * sizeof(ulonglong) - size)
1927
huff_tree->height= 8 * sizeof(ulonglong) - size;
1932
make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1933
make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1934
code + (((ulonglong) 1) << size));
1941
Convert a value into binary digits.
1946
length The number of low order bits to convert.
1949
The result string is in static storage. It is reused on every call.
1950
So you cannot use it twice in one expression.
1953
A pointer to a static NUL-terminated string.
1956
static char *bindigits(ulonglong value, uint bits)
1958
static char digits[72];
1962
DBUG_ASSERT(idx < sizeof(digits));
1964
*(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
1971
Convert a value into hexadecimal digits.
1978
The result string is in static storage. It is reused on every call.
1979
So you cannot use it twice in one expression.
1982
A pointer to a static NUL-terminated string.
1985
static char *hexdigits(ulonglong value)
1987
static char digits[20];
1989
uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
1991
DBUG_ASSERT(idx < sizeof(digits));
1994
if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
1995
*(ptr - 1)+= 'a' - '9' - 1;
2002
/* Write header to new packed data file */
2004
static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2005
my_off_t tot_elements,my_off_t filelength)
2007
uchar *buff= (uchar*) file_buffer.pos;
2009
bzero(buff,HEAD_LENGTH);
2010
memcpy_fixed(buff,myisam_pack_file_magic,4);
2011
int4store(buff+4,head_length);
2012
int4store(buff+8, mrg->min_pack_length);
2013
int4store(buff+12,mrg->max_pack_length);
2014
int4store(buff+16,tot_elements);
2015
int4store(buff+20,intervall_length);
2016
int2store(buff+24,trees);
2017
buff[26]=(char) mrg->ref_length;
2018
/* Save record pointer length */
2019
buff[27]= (uchar) mi_get_pointer_length((ulonglong) filelength,2);
2022
VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
2023
return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2024
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2027
/* Write fieldinfo to new packed file */
2029
static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2032
uint huff_tree_bits;
2033
huff_tree_bits=max_bit(trees ? trees-1 : 0);
2035
DBUG_PRINT("info", (" "));
2036
DBUG_PRINT("info", ("column types:"));
2037
DBUG_PRINT("info", ("FIELD_NORMAL 0"));
2038
DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1"));
2039
DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2"));
2040
DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3"));
2041
DBUG_PRINT("info", ("FIELD_BLOB 4"));
2042
DBUG_PRINT("info", ("FIELD_CONSTANT 5"));
2043
DBUG_PRINT("info", ("FIELD_INTERVALL 6"));
2044
DBUG_PRINT("info", ("FIELD_ZERO 7"));
2045
DBUG_PRINT("info", ("FIELD_VARCHAR 8"));
2046
DBUG_PRINT("info", ("FIELD_CHECK 9"));
2047
DBUG_PRINT("info", (" "));
2048
DBUG_PRINT("info", ("pack type as a set of flags:"));
2049
DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1"));
2050
DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2"));
2051
DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4"));
2052
DBUG_PRINT("info", (" "));
2056
VOID(printf("column types:\n"));
2057
VOID(printf("FIELD_NORMAL 0\n"));
2058
VOID(printf("FIELD_SKIP_ENDSPACE 1\n"));
2059
VOID(printf("FIELD_SKIP_PRESPACE 2\n"));
2060
VOID(printf("FIELD_SKIP_ZERO 3\n"));
2061
VOID(printf("FIELD_BLOB 4\n"));
2062
VOID(printf("FIELD_CONSTANT 5\n"));
2063
VOID(printf("FIELD_INTERVALL 6\n"));
2064
VOID(printf("FIELD_ZERO 7\n"));
2065
VOID(printf("FIELD_VARCHAR 8\n"));
2066
VOID(printf("FIELD_CHECK 9\n"));
2068
VOID(printf("pack type as a set of flags:\n"));
2069
VOID(printf("PACK_TYPE_SELECTED 1\n"));
2070
VOID(printf("PACK_TYPE_SPACE_FIELDS 2\n"));
2071
VOID(printf("PACK_TYPE_ZERO_FILL 4\n"));
2074
for (i=0 ; i++ < fields ; counts++)
2076
write_bits((ulonglong) (int) counts->field_type, 5);
2077
write_bits(counts->pack_type,6);
2078
if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2079
write_bits(counts->max_zero_fill,5);
2081
write_bits(counts->length_bits,5);
2082
write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2083
DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u "
2084
"lbits: %2u tree: %2u length: %4u",
2085
i , counts->field_type, counts->pack_type,
2086
counts->max_zero_fill, counts->length_bits,
2087
counts->tree->tree_number, counts->field_length));
2089
VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
2090
"tree: %2u length: %4u\n", i , counts->field_type,
2091
counts->pack_type, counts->max_zero_fill, counts->length_bits,
2092
counts->tree->tree_number, counts->field_length));
2098
/* Write all huff_trees to new datafile. Return tot count of
2099
elements in all trees
2100
Returns 0 on error */
2102
static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2108
uint *packed_tree,*offset,length;
2111
/* Find the highest number of elements in the trees. */
2112
for (i=length=0 ; i < trees ; i++)
2113
if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2114
length=huff_tree[i].elements;
2116
Allocate a buffer for packing a decode tree. Two numbers per element
2117
(left child and right child).
2119
if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2121
my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2125
DBUG_PRINT("info", (" "));
2130
for (elements=0; trees-- ; huff_tree++)
2132
/* Skip columns that have been joined with other columns. */
2133
if (huff_tree->tree_number == 0)
2134
continue; /* Deleted tree */
2136
DBUG_PRINT("info", (" "));
2139
/* Count the total number of elements (byte codes or column values). */
2140
elements+=huff_tree->elements;
2141
huff_tree->max_offset=2;
2142
/* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2143
if (huff_tree->elements <= 1)
2146
offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2148
/* This should be the same as 'length' above. */
2149
huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2152
Since we check this during collecting the distinct column values,
2153
this should never happen.
2155
if (huff_tree->max_offset >= IS_OFFSET)
2156
{ /* This should be impossible */
2157
VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2158
huff_tree->max_offset));
2159
my_afree((uchar*) packed_tree);
2163
DBUG_PRINT("info", ("pos: %lu elements: %u tree-elements: %lu "
2165
(ulong) (file_buffer.pos - file_buffer.buffer),
2166
huff_tree->elements, (ulong) (offset - packed_tree),
2167
huff_tree->char_bits));
2168
if (!huff_tree->counts->tree_buff)
2170
/* We do a byte compression on this column. Mark with bit 0. */
2172
write_bits(huff_tree->min_chr,8);
2173
write_bits(huff_tree->elements,9);
2174
write_bits(huff_tree->char_bits,5);
2175
write_bits(huff_tree->offset_bits,5);
2180
int_length=(uint) (huff_tree->counts->tree_pos -
2181
huff_tree->counts->tree_buff);
2182
/* We have distinct column values for this column. Mark with bit 1. */
2184
write_bits(huff_tree->elements,15);
2185
write_bits(int_length,16);
2186
write_bits(huff_tree->char_bits,5);
2187
write_bits(huff_tree->offset_bits,5);
2188
intervall_length+=int_length;
2190
DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u "
2191
"offset_bits: %2u %s: %5u codelen: %2u",
2192
tree_no, huff_tree->elements, huff_tree->char_bits,
2193
huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2194
"bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2195
int_length : huff_tree->min_chr, huff_tree->height));
2197
VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
2198
"%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
2199
huff_tree->char_bits, huff_tree->offset_bits,
2200
huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2201
huff_tree->counts->tree_buff ? int_length :
2202
huff_tree->min_chr, huff_tree->height));
2204
/* Check that the code tree length matches the element count. */
2205
length=(uint) (offset-packed_tree);
2206
if (length != huff_tree->elements*2-2)
2208
VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2209
length, huff_tree->elements * 2 - 2));
2214
for (i=0 ; i < length ; i++)
2216
if (packed_tree[i] & IS_OFFSET)
2217
write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2218
huff_tree->offset_bits+1);
2220
write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2221
DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2222
i, (packed_tree[i] & IS_OFFSET) ?
2223
" -> " : "", (packed_tree[i] & IS_OFFSET) ?
2224
packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2226
VOID(printf("tree[0x%04x]: %s0x%04x\n",
2227
i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2228
(packed_tree[i] & IS_OFFSET) ?
2229
packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2234
Display coding tables and check their correctness.
2236
codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2237
for (i= 0; i < codes; i++)
2244
if (! (len= huff_tree->code_len[i]))
2246
DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i,
2247
hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2248
bindigits(huff_tree->code[i],
2249
huff_tree->code_len[i])));
2251
VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
2252
hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2253
bindigits(huff_tree->code[i], huff_tree->code_len[i])));
2255
/* Check that the encode table decodes correctly. */
2259
DBUG_EXECUTE_IF("forcechkerr1", len--;);
2260
DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2261
DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2266
VOID(fflush(stdout));
2267
VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2268
hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
2273
code|= (huff_tree->code[i] >> (--len)) & 1;
2275
if (bits > 8 * sizeof(code))
2277
VOID(fflush(stdout));
2278
VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2279
bits, (uint) (8 * sizeof(code))));
2283
idx+= (uint) code & 1;
2286
VOID(fflush(stdout));
2287
VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2292
if (packed_tree[idx] & IS_OFFSET)
2293
idx+= packed_tree[idx] & ~IS_OFFSET;
2295
break; /* Hit a leaf. This contains the result value. */
2300
DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2301
if (packed_tree[idx] != i)
2303
VOID(fflush(stdout));
2304
VOID(fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n",
2305
packed_tree[idx], i));
2309
} /*end for (codes)*/
2313
/* Write column values in case of distinct column value compression. */
2314
if (huff_tree->counts->tree_buff)
2316
for (i=0 ; i < int_length ; i++)
2318
write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2319
DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2320
i, (uchar) huff_tree->counts->tree_buff[i]));
2322
VOID(printf("column_values[0x%04x]: 0x%02x\n",
2323
i, (uchar) huff_tree->counts->tree_buff[i]));
2328
DBUG_PRINT("info", (" "));
2331
my_afree((uchar*) packed_tree);
2334
VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
2341
static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2346
prev_offset= offset;
2348
'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2349
then there is no left child and, hence no right child either. This
2350
is a property of a binary tree. An element is either a node with two
2351
childs, or a leaf without childs.
2353
The current element is always a node with two childs. Go left first.
2355
if (!element->a.nod.left->a.leaf.null)
2357
/* Store the byte code or the index of the column value. */
2358
prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2364
Recursively traverse the tree to the left. Mark it as an offset to
2365
another tree node (in contrast to a byte code or column value index).
2367
prev_offset[0]= IS_OFFSET+2;
2368
offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2371
/* Now, check the right child. */
2372
if (!element->a.nod.right->a.leaf.null)
2374
/* Store the byte code or the index of the column value. */
2375
prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2381
Recursively traverse the tree to the right. Mark it as an offset to
2382
another tree node (in contrast to a byte code or column value index).
2384
uint temp=(uint) (offset-prev_offset-1);
2385
prev_offset[1]= IS_OFFSET+ temp;
2386
if (huff_tree->max_offset < temp)
2387
huff_tree->max_offset = temp;
2388
return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2392
/* Get number of bits neaded to represent value */
2394
static uint max_bit(register uint value)
2396
register uint power=1;
2404
static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2407
uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length,
2408
intervall,field_length,max_pack_length,pack_blob_length;
2409
my_off_t record_count;
2411
ulong length,pack_length;
2412
uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2413
HUFF_COUNTS *count,*end_count;
2415
MI_INFO *isam_file=mrg->file[0];
2416
uint pack_version= (uint) isam_file->s->pack.version;
2417
DBUG_ENTER("compress_isam_file");
2419
/* Allocate a buffer for the records (excluding blobs). */
2420
if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2423
end_count=huff_counts+isam_file->s->base.fields;
2424
min_record_length= (uint) ~0;
2425
max_record_length=0;
2428
Calculate the maximum number of bits required to pack the records.
2429
Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2430
The tree height determines the maximum number of bits per value.
2431
Some fields skip leading or trailing spaces or zeroes. The skipped
2432
number of bytes is encoded by 'length_bits' bits.
2433
Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2434
and varchar get a leading 0 bit.
2436
for (i=max_calc_length=0 ; i < isam_file->s->base.fields ; i++)
2438
if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2439
huff_counts[i].max_zero_fill=0;
2440
if (huff_counts[i].field_type == FIELD_CONSTANT ||
2441
huff_counts[i].field_type == FIELD_ZERO ||
2442
huff_counts[i].field_type == FIELD_CHECK)
2444
if (huff_counts[i].field_type == FIELD_INTERVALL)
2445
max_calc_length+=huff_counts[i].tree->height;
2446
else if (huff_counts[i].field_type == FIELD_BLOB ||
2447
huff_counts[i].field_type == FIELD_VARCHAR)
2448
max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2451
(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2452
huff_counts[i].tree->height+huff_counts[i].length_bits;
2454
max_calc_length= (max_calc_length + 7) / 8;
2455
pack_ref_length= calc_pack_length(pack_version, max_calc_length);
2457
/* 'max_blob_length' is the max length of all blobs of a record. */
2458
pack_blob_length= isam_file->s->base.blobs ?
2459
calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2460
max_pack_length=pack_ref_length+pack_blob_length;
2462
DBUG_PRINT("fields", ("==="));
2464
while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2466
ulong tot_blob_length=0;
2469
if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
2471
record_pos= (uchar*) file_buffer.pos;
2472
file_buffer.pos+=max_pack_length;
2473
for (start_pos=record, count= huff_counts; count < end_count ; count++)
2475
end_pos=start_pos+(field_length=count->field_length);
2478
DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u "
2479
"lbits: %2u tree: %2u length: %4u",
2480
(ulong) (count - huff_counts + 1),
2482
count->pack_type, count->max_zero_fill,
2483
count->length_bits, count->tree->tree_number,
2484
count->field_length));
2486
/* Check if the column contains spaces only. */
2487
if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2489
for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2492
DBUG_PRINT("fields",
2493
("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1"));
2494
DBUG_PRINT("fields", ("---"));
2499
DBUG_PRINT("fields",
2500
("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1"));
2503
end_pos-=count->max_zero_fill;
2504
field_length-=count->max_zero_fill;
2506
switch (count->field_type) {
2507
case FIELD_SKIP_ZERO:
2508
if (!memcmp((uchar*) start_pos,zero_string,field_length))
2510
DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1"));
2515
DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1"));
2519
DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2520
(ulong) (end_pos - start_pos)));
2521
for ( ; start_pos < end_pos ; start_pos++)
2523
DBUG_PRINT("fields",
2524
("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2526
hexdigits(tree->code[(uchar) *start_pos]),
2527
(uint) tree->code_len[(uchar) *start_pos],
2528
bindigits(tree->code[(uchar) *start_pos],
2529
(uint) tree->code_len[(uchar) *start_pos])));
2530
write_bits(tree->code[(uchar) *start_pos],
2531
(uint) tree->code_len[(uchar) *start_pos]);
2534
case FIELD_SKIP_ENDSPACE:
2535
for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2536
length= (ulong) (end_pos - pos);
2537
if (count->pack_type & PACK_TYPE_SELECTED)
2539
if (length > count->min_space)
2541
DBUG_PRINT("fields",
2542
("FIELD_SKIP_ENDSPACE more than min_space, bits: 1"));
2543
DBUG_PRINT("fields",
2544
("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2545
length, field_length, count->length_bits));
2547
write_bits(length,count->length_bits);
2551
DBUG_PRINT("fields",
2552
("FIELD_SKIP_ENDSPACE not more than min_space, "
2560
DBUG_PRINT("fields",
2561
("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2562
length, field_length, count->length_bits));
2563
write_bits(length,count->length_bits);
2565
/* Encode all significant bytes. */
2566
DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2567
(ulong) (pos - start_pos)));
2568
for ( ; start_pos < pos ; start_pos++)
2570
DBUG_PRINT("fields",
2571
("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2573
hexdigits(tree->code[(uchar) *start_pos]),
2574
(uint) tree->code_len[(uchar) *start_pos],
2575
bindigits(tree->code[(uchar) *start_pos],
2576
(uint) tree->code_len[(uchar) *start_pos])));
2577
write_bits(tree->code[(uchar) *start_pos],
2578
(uint) tree->code_len[(uchar) *start_pos]);
2582
case FIELD_SKIP_PRESPACE:
2583
for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2584
length= (ulong) (pos - start_pos);
2585
if (count->pack_type & PACK_TYPE_SELECTED)
2587
if (length > count->min_space)
2589
DBUG_PRINT("fields",
2590
("FIELD_SKIP_PRESPACE more than min_space, bits: 1"));
2591
DBUG_PRINT("fields",
2592
("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2593
length, field_length, count->length_bits));
2595
write_bits(length,count->length_bits);
2599
DBUG_PRINT("fields",
2600
("FIELD_SKIP_PRESPACE not more than min_space, "
2608
DBUG_PRINT("fields",
2609
("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2610
length, field_length, count->length_bits));
2611
write_bits(length,count->length_bits);
2613
/* Encode all significant bytes. */
2614
DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2615
(ulong) (end_pos - start_pos)));
2616
for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2618
DBUG_PRINT("fields",
2619
("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2621
hexdigits(tree->code[(uchar) *start_pos]),
2622
(uint) tree->code_len[(uchar) *start_pos],
2623
bindigits(tree->code[(uchar) *start_pos],
2624
(uint) tree->code_len[(uchar) *start_pos])));
2625
write_bits(tree->code[(uchar) *start_pos],
2626
(uint) tree->code_len[(uchar) *start_pos]);
2629
case FIELD_CONSTANT:
2632
DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2635
case FIELD_INTERVALL:
2637
pos=(uchar*) tree_search(&count->int_tree, start_pos,
2638
count->int_tree.custom_arg);
2639
intervall=(uint) (pos - count->tree_buff)/field_length;
2640
DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2641
DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u",
2642
intervall, hexdigits(tree->code[intervall]),
2643
(uint) tree->code_len[intervall]));
2644
write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2649
ulong blob_length=_mi_calc_blob_length(field_length-
2650
portable_sizeof_char_ptr,
2652
/* Empty blobs are encoded with a single 1 bit. */
2655
DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1"));
2660
uchar *blob,*blob_end;
2661
DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1"));
2663
/* Write the blob length. */
2664
DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2665
blob_length, count->length_bits));
2666
write_bits(blob_length,count->length_bits);
2667
memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
2669
blob_end=blob+blob_length;
2670
/* Encode the blob bytes. */
2671
for ( ; blob < blob_end ; blob++)
2673
DBUG_PRINT("fields",
2674
("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2675
(uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2676
(uint) tree->code_len[(uchar) *blob],
2677
bindigits(tree->code[(uchar) *start_pos],
2678
(uint)tree->code_len[(uchar) *start_pos])));
2679
write_bits(tree->code[(uchar) *blob],
2680
(uint) tree->code_len[(uchar) *blob]);
2682
tot_blob_length+=blob_length;
2689
uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2690
ulong col_length= (var_pack_length == 1 ?
2691
(uint) *(uchar*) start_pos :
2692
uint2korr(start_pos));
2693
/* Empty varchar are encoded with a single 1 bit. */
2696
DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1"));
2697
write_bits(1,1); /* Empty varchar */
2701
uchar *end= start_pos + var_pack_length + col_length;
2702
DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1"));
2704
/* Write the varchar length. */
2705
DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2706
col_length, count->length_bits));
2707
write_bits(col_length,count->length_bits);
2708
/* Encode the varchar bytes. */
2709
for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2711
DBUG_PRINT("fields",
2712
("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2714
hexdigits(tree->code[(uchar) *start_pos]),
2715
(uint) tree->code_len[(uchar) *start_pos],
2716
bindigits(tree->code[(uchar) *start_pos],
2717
(uint)tree->code_len[(uchar) *start_pos])));
2718
write_bits(tree->code[(uchar) *start_pos],
2719
(uint) tree->code_len[(uchar) *start_pos]);
2726
case FIELD_enum_val_count:
2727
abort(); /* Impossible */
2729
start_pos+=count->max_zero_fill;
2730
DBUG_PRINT("fields", ("---"));
2733
length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
2734
pack_length= save_pack_length(pack_version, record_pos, length);
2735
if (pack_blob_length)
2736
pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2738
DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu "
2739
"length-bytes: %lu", (ulong) record_count, length,
2740
tot_blob_length, pack_length));
2741
DBUG_PRINT("fields", ("==="));
2743
/* Correct file buffer if the header was smaller */
2744
if (pack_length != max_pack_length)
2746
bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2747
file_buffer.pos-= (max_pack_length-pack_length);
2749
if (length < (ulong) min_record_length)
2750
min_record_length=(uint) length;
2751
if (length > (ulong) max_record_length)
2752
max_record_length=(uint) length;
2754
if (write_loop && record_count % WRITE_COUNT == 0)
2756
VOID(printf("%lu\r", (ulong) record_count));
2757
VOID(fflush(stdout));
2760
else if (error != HA_ERR_RECORD_DELETED)
2763
if (error == HA_ERR_END_OF_FILE)
2767
VOID(fprintf(stderr, "%s: Got error %d reading records\n",
2768
my_progname, error));
2771
VOID(printf("wrote %s records.\n", llstr((longlong) record_count, llbuf)));
2773
my_afree((uchar*) record);
2774
mrg->ref_length=max_pack_length;
2775
mrg->min_pack_length=max_record_length ? min_record_length : 0;
2776
mrg->max_pack_length=max_record_length;
2777
DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2781
static char *make_new_name(char *new_name, char *old_name)
2783
return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2786
static char *make_old_name(char *new_name, char *old_name)
2788
return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2791
/* rutines for bit writing buffer */
2793
static void init_file_buffer(File file, pbool read_buffer)
2795
file_buffer.file=file;
2796
file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2798
file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2799
file_buffer.pos_in_file=0;
2804
file_buffer.pos=file_buffer.end;
2809
file_buffer.pos=file_buffer.buffer;
2810
file_buffer.bits=BITS_SAVED;
2812
file_buffer.bitbucket= 0;
2816
static int flush_buffer(ulong neaded_length)
2821
file_buffer.end is 8 bytes lower than the real end of the buffer.
2822
This is done so that the end-of-buffer condition does not need to be
2823
checked for every byte (see write_bits()). Consequently,
2824
file_buffer.pos can become greater than file_buffer.end. The
2825
algorithms in the other functions ensure that there will never be
2826
more than 8 bytes written to the buffer without an end-of-buffer
2827
check. So the buffer cannot be overrun. But we need to check for the
2828
near-to-buffer-end condition to avoid a negative result, which is
2829
casted to unsigned and thus becomes giant.
2831
if ((file_buffer.pos < file_buffer.end) &&
2832
((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2834
length=(ulong) (file_buffer.pos-file_buffer.buffer);
2835
file_buffer.pos=file_buffer.buffer;
2836
file_buffer.pos_in_file+=length;
2839
if (error_on_write|| my_write(file_buffer.file,
2840
(const uchar*) file_buffer.buffer,
2842
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2848
if (neaded_length != ~(ulong) 0 &&
2849
(ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2852
neaded_length+=256; /* some margin */
2853
tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
2856
file_buffer.pos= ((uchar*) tmp +
2857
(ulong) (file_buffer.pos - file_buffer.buffer));
2858
file_buffer.buffer= (uchar*) tmp;
2859
file_buffer.end= (uchar*) (tmp+neaded_length-8);
2865
static void end_file_buffer(void)
2867
my_free((uchar*) file_buffer.buffer,MYF(0));
2870
/* output `bits` low bits of `value' */
2872
static void write_bits(register ulonglong value, register uint bits)
2874
DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2875
(bits == 8 * sizeof(value)));
2877
if ((file_buffer.bits-= (int) bits) >= 0)
2879
file_buffer.bitbucket|= value << file_buffer.bits;
2883
register ulonglong bit_buffer;
2884
bits= (uint) -file_buffer.bits;
2885
bit_buffer= (file_buffer.bitbucket |
2886
((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2887
#if BITS_SAVED == 64
2888
*file_buffer.pos++= (uchar) (bit_buffer >> 56);
2889
*file_buffer.pos++= (uchar) (bit_buffer >> 48);
2890
*file_buffer.pos++= (uchar) (bit_buffer >> 40);
2891
*file_buffer.pos++= (uchar) (bit_buffer >> 32);
2893
*file_buffer.pos++= (uchar) (bit_buffer >> 24);
2894
*file_buffer.pos++= (uchar) (bit_buffer >> 16);
2895
*file_buffer.pos++= (uchar) (bit_buffer >> 8);
2896
*file_buffer.pos++= (uchar) (bit_buffer);
2898
if (bits != 8 * sizeof(value))
2899
value&= (((ulonglong) 1) << bits) - 1;
2900
if (file_buffer.pos >= file_buffer.end)
2901
VOID(flush_buffer(~ (ulong) 0));
2902
file_buffer.bits=(int) (BITS_SAVED - bits);
2903
file_buffer.bitbucket= value << (BITS_SAVED - bits);
2908
/* Flush bits in bit_buffer to buffer */
2910
static void flush_bits(void)
2913
ulonglong bit_buffer;
2915
bits= file_buffer.bits & ~7;
2916
bit_buffer= file_buffer.bitbucket >> bits;
2917
bits= BITS_SAVED - bits;
2921
*file_buffer.pos++= (uchar) (bit_buffer >> bits);
2923
if (file_buffer.pos >= file_buffer.end)
2924
VOID(flush_buffer(~ (ulong) 0));
2925
file_buffer.bits= BITS_SAVED;
2926
file_buffer.bitbucket= 0;
2930
/****************************************************************************
2931
** functions to handle the joined files
2932
****************************************************************************/
2934
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
2937
MYISAM_SHARE *share=isam_file->s;
2938
uint options=mi_uint2korr(share->state.header.options);
2940
DBUG_ENTER("save_state");
2942
options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2943
mi_int2store(share->state.header.options,options);
2945
share->state.state.data_file_length=new_length;
2946
share->state.state.del=0;
2947
share->state.state.empty=0;
2948
share->state.dellink= HA_OFFSET_ERROR;
2949
share->state.split=(ha_rows) mrg->records;
2950
share->state.version=(ulong) time((time_t*) 0);
2951
if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
2954
Some indexes are disabled, cannot use current key_file_length value
2955
as an estimate of upper bound of index file size. Use packed data file
2958
share->state.state.key_file_length= new_length;
2961
If there are no disabled indexes, keep key_file_length value from
2962
original file so "myisamchk -rq" can use this value (this is necessary
2963
because index size cannot be easily calculated for fulltext keys)
2965
mi_clear_all_keys_active(share->state.key_map);
2966
for (key=0 ; key < share->base.keys ; key++)
2967
share->state.key_root[key]= HA_OFFSET_ERROR;
2968
for (key=0 ; key < share->state.header.max_block_size_index ; key++)
2969
share->state.key_del[key]= HA_OFFSET_ERROR;
2970
isam_file->state->checksum=crc; /* Save crc here */
2971
share->changed=1; /* Force write of header */
2972
share->state.open_count=0;
2973
share->global_changed=0;
2974
(void)ftruncate(share->kfile, share->base.keystart);
2975
if (share->base.keys)
2977
DBUG_RETURN(mi_state_info_write(share->kfile,&share->state,1+2));
2981
static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
2984
MI_STATE_INFO state;
2985
MI_INFO *isam_file=mrg->file[0];
2987
DBUG_ENTER("save_state_mrg");
2989
state= isam_file->s->state;
2990
options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
2991
HA_OPTION_READ_ONLY_DATA);
2992
mi_int2store(state.header.options,options);
2993
state.state.data_file_length=new_length;
2995
state.state.empty=0;
2996
state.state.records=state.split=(ha_rows) mrg->records;
2997
/* See comment above in save_state about key_file_length handling. */
2998
if (mrg->src_file_has_indexes_disabled)
3000
isam_file->s->state.state.key_file_length=
3001
max(isam_file->s->state.state.key_file_length, new_length);
3003
state.dellink= HA_OFFSET_ERROR;
3004
state.version=(ulong) time((time_t*) 0);
3005
mi_clear_all_keys_active(state.key_map);
3006
state.state.checksum=crc;
3007
if (isam_file->s->base.keys)
3009
state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3010
DBUG_RETURN (mi_state_info_write(file,&state,1+2));
3014
/* reset for mrg_rrnd */
3016
static void mrg_reset(PACK_MRG_INFO *mrg)
3020
mi_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3025
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3033
isam_info= *(info->current=info->file);
3034
info->end=info->current+info->count;
3035
mi_reset(isam_info);
3036
mi_extra(isam_info, HA_EXTRA_CACHE, 0);
3037
filepos=isam_info->s->pack.header_length;
3041
isam_info= *info->current;
3042
filepos= isam_info->nextpos;
3047
isam_info->update&= HA_STATE_CHANGED;
3048
if (!(error=(*isam_info->s->read_rnd)(isam_info,(uchar*) buf,
3050
error != HA_ERR_END_OF_FILE)
3052
mi_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3053
if (info->current+1 == info->end)
3054
return(HA_ERR_END_OF_FILE);
3056
isam_info= *info->current;
3057
filepos=isam_info->s->pack.header_length;
3058
mi_reset(isam_info);
3059
mi_extra(isam_info,HA_EXTRA_CACHE, 0);
3064
static int mrg_close(PACK_MRG_INFO *mrg)
3068
for (i=0 ; i < mrg->count ; i++)
3069
error|=mi_close(mrg->file[i]);
3071
my_free((uchar*) mrg->file,MYF(0));
3076
#if !defined(DBUG_OFF)
3078
Fake the counts to get big Huffman codes.
3082
huff_counts A pointer to the counts array.
3083
end_count A pointer past the counts array.
3087
Huffman coding works by removing the two least frequent values from
3088
the list of values and add a new value with the sum of their
3089
incidences in a loop until only one value is left. Every time a
3090
value is reused for a new value, it gets one more bit for its
3091
encoding. Hence, the least frequent values get the longest codes.
3093
To get a maximum code length for a value, two of the values must
3094
have an incidence of 1. As their sum is 2, the next infrequent value
3095
must have at least an incidence of 2, then 4, 8, 16 and so on. This
3096
means that one needs 2**n bytes (values) for a code length of n
3097
bits. However, using more distinct values forces the use of longer
3098
codes, or reaching the code length with less total bytes (values).
3100
To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3101
I assign counts of 1 to the two most frequent values, a count of 2
3102
for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3103
the remaining values get 1. That way every possible byte has an
3104
assigned code, though not all codes are used if not all byte values
3105
are present in the column.
3107
This strategy would work with distinct column values too, but
3108
requires that at least 64(32) values are present. To make things
3109
easier here, I cancel all distinct column values and force byte
3110
compression for all columns.
3116
static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3119
my_off_t *cur_count_p;
3120
my_off_t *end_count_p;
3121
my_off_t **cur_sort_p;
3122
my_off_t **end_sort_p;
3123
my_off_t *sort_counts[256];
3125
DBUG_ENTER("fakebigcodes");
3127
for (count= huff_counts; count < end_count; count++)
3130
Remove distinct column values.
3132
if (huff_counts->tree_buff)
3134
my_free((uchar*) huff_counts->tree_buff, MYF(0));
3135
delete_tree(&huff_counts->int_tree);
3136
huff_counts->tree_buff= NULL;
3137
DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3141
Sort counts by decreasing incidence.
3143
cur_count_p= count->counts;
3144
end_count_p= cur_count_p + 256;
3145
cur_sort_p= sort_counts;
3146
while (cur_count_p < end_count_p)
3147
*(cur_sort_p++)= cur_count_p++;
3148
(void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3151
Assign faked counts.
3153
cur_sort_p= sort_counts;
3154
#if SIZEOF_LONG_LONG > 4
3155
end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3157
end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3159
/* Most frequent value gets a faked count of 1. */
3160
**(cur_sort_p++)= 1;
3162
while (cur_sort_p < end_sort_p)
3164
**(cur_sort_p++)= total;
3167
/* Set the last value. */
3168
**(cur_sort_p++)= --total;
3170
Set the remaining counts.
3172
end_sort_p= sort_counts + 256;
3173
while (cur_sort_p < end_sort_p)
3174
**(cur_sort_p++)= 1;
3181
Compare two counts for reverse sorting.
3186
count2 Another count.
3194
static int fakecmp(my_off_t **count1, my_off_t **count2)
3196
return ((**count1 < **count2) ? 1 :
3197
(**count1 > **count2) ? -1 : 0);