1
/* Copyright (C) 2000-2006 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16
/* Pack MyISAM file */
19
#define USE_MY_FUNC /* We need at least my_malloc */
22
#include "myisamdef.h"
23
#include <mysys/queues.h>
24
#include <mysys/my_tree.h>
25
#include <mysys/mysys_err.h>
26
#ifndef __GNU_LIBRARY__
27
#define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
29
#include <mysys/my_getopt.h>
32
#if SIZEOF_LONG_LONG > 4
38
#define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
39
#define HEAD_LENGTH 32
40
#define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
42
#define DATA_TMP_EXT ".TMD"
43
#define OLD_EXT ".OLD"
44
#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
46
struct st_file_buffer {
48
uchar *buffer,*pos,*end;
55
struct st_huff_element;
57
typedef struct st_huff_counts {
58
uint field_length,max_zero_fill;
60
uint max_end_space,max_pre_space,length_bits,min_space;
62
enum en_fieldtype field_type;
63
struct st_huff_tree *tree; /* Tree for field */
65
my_off_t end_space[8];
66
my_off_t pre_space[8];
67
my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
68
TREE int_tree; /* Tree for detecting distinct column values. */
69
uchar *tree_buff; /* Column values, 'field_length' each. */
70
uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */
73
typedef struct st_huff_element HUFF_ELEMENT;
76
WARNING: It is crucial for the optimizations in calc_packed_length()
77
that 'count' is the first element of 'HUFF_ELEMENT'.
79
struct st_huff_element {
83
HUFF_ELEMENT *left,*right;
87
uint element_nr; /* Number of element */
93
typedef struct st_huff_tree {
94
HUFF_ELEMENT *root,*element_buffer;
98
my_off_t bytes_packed;
99
uint tree_pack_length;
100
uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
106
typedef struct st_isam_mrg {
107
MI_INFO **file,**current,**end;
110
uint min_pack_length; /* Theese is used by packed data */
111
uint max_pack_length;
113
uint max_blob_length;
115
/* true if at least one source file has at least one disabled index */
116
bool src_file_has_indexes_disabled;
120
extern int main(int argc,char * *argv);
121
static void get_options(int *argc,char ***argv);
122
static MI_INFO *open_isam_file(char *name,int mode);
123
static bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
124
static int compress(PACK_MRG_INFO *file,char *join_name);
125
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records);
126
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
128
HUFF_COUNTS *huff_counts,
130
static int compare_tree(void* cmp_arg __attribute__((unused)),
131
const uchar *s,const uchar *t);
132
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
133
static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
135
static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
136
uint max_space_length,my_off_t *space_counts,
137
my_off_t tot_space_count,
138
enum en_fieldtype field_type);
139
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
140
static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
141
static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
142
static int save_counts_in_queue(uchar *key,element_count count,
144
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
145
static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
146
static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
147
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
148
HUFF_ELEMENT *element,uint size,
150
static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
151
my_off_t tot_elements,my_off_t filelength);
152
static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
153
static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
154
static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
155
HUFF_ELEMENT *element,
157
static uint max_bit(uint value);
158
static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
159
static char *make_new_name(char *new_name,char *old_name);
160
static char *make_old_name(char *new_name,char *old_name);
161
static void init_file_buffer(File file,bool read_buffer);
162
static int flush_buffer(ulong neaded_length);
163
static void end_file_buffer(void);
164
static void write_bits(uint64_t value, uint bits);
165
static void flush_bits(void);
166
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
168
static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
170
static int mrg_close(PACK_MRG_INFO *mrg);
171
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
172
static void mrg_reset(PACK_MRG_INFO *mrg);
174
static int error_on_write=0,test_only=0,verbose=0,silent=0,
175
write_loop=0,force_pack=0, isamchk_neaded=0;
176
static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
177
static bool backup, opt_wait;
179
tree_buff_length is somewhat arbitrary. The bigger it is the better
180
the chance to win in terms of compression factor. On the other hand,
181
this table becomes part of the compressed file header. And its length
182
is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
184
static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
185
static char tmp_dir[FN_REFLEN]={0},*join_table;
186
static my_off_t intervall_length;
187
static ha_checksum glob_crc;
188
static struct st_file_buffer file_buffer;
190
static HUFF_COUNTS *global_count;
191
static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
192
static const char *load_default_groups[]= { "myisampack",0 };
194
/* The main program */
196
int main(int argc, char **argv)
203
load_defaults("my",load_default_groups,&argc,&argv);
205
get_options(&argc,&argv);
207
error=ok=isamchk_neaded=0;
209
{ /* Join files into one */
210
if (open_isam_files(&merge,argv,(uint) argc) ||
211
compress(&merge,join_table))
217
if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
221
merge.file= &isam_file;
225
if (compress(&merge,0))
231
if (ok && isamchk_neaded && !silent)
232
puts("Remember to run myisamchk -rq on compressed tables");
233
VOID(fflush(stdout));
234
VOID(fflush(stderr));
235
free_defaults(default_argv);
236
my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
239
return 0; /* No compiler warning */
243
enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
245
static struct my_option my_long_options[] =
247
{"backup", 'b', "Make a backup of the table as table_name.OLD.",
248
(char**) &backup, (char**) &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
249
{"character-sets-dir", OPT_CHARSETS_DIR_MP,
250
"Directory where character sets are.", (char**) &charsets_dir,
251
(char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
252
{"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
253
0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
255
"Force packing of table even if it gets bigger or if tempfile exists.",
256
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
258
"Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
259
(char**) &join_table, (char**) &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
261
{"help", '?', "Display this help and exit.",
262
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
263
{"silent", 's', "Be more silent.",
264
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
265
{"tmpdir", 'T', "Use temporary directory to store temporary table.",
266
0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
267
{"test", 't', "Don't pack table, only test packing it.",
268
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
269
{"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
270
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
271
{"version", 'V', "Output version information and exit.",
272
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
273
{"wait", 'w', "Wait and retry if table is in use.", (char**) &opt_wait,
274
(char**) &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
275
{ 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
278
static void print_version(void)
280
VOID(printf("%s Ver 1.23 for %s on %s\n",
281
my_progname, SYSTEM_TYPE, MACHINE_TYPE));
285
static void usage(void)
288
puts("Copyright (C) 2002 MySQL AB");
289
puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
290
puts("and you are welcome to modify and redistribute it under the GPL license\n");
292
puts("Pack a MyISAM-table to take much less space.");
293
puts("Keys are not updated, you must run myisamchk -rq on the datafile");
294
puts("afterwards to update the keys.");
295
puts("You should give the .MYI file as the filename argument.");
297
VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
298
my_print_help(my_long_options);
299
print_defaults("my", load_default_groups);
300
my_print_variables(my_long_options);
304
get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
312
setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
317
tmpfile_createflag= O_RDWR | O_TRUNC;
320
write_loop= verbose= 0;
325
/* Avoid to reset 'verbose' if it was already set > 1. */
330
length= (uint) (stpcpy(tmp_dir, argument) - tmp_dir);
331
if (length != dirname_length(tmp_dir))
333
tmp_dir[length]=FN_LIBCHAR;
338
verbose++; /* Allow for selecting the level of verbosity. */
353
/* Initiates DEBUG - but no debugging here ! */
355
static void get_options(int *argc,char ***argv)
359
my_progname= argv[0][0];
360
if (isatty(fileno(stdout)))
363
if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
373
backup=0; /* Not needed */
380
static MI_INFO *open_isam_file(char *name,int mode)
385
if (!(isam_file=mi_open(name,mode,
386
(opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
387
HA_OPEN_ABORT_IF_LOCKED))))
389
VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
393
if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
397
VOID(fprintf(stderr, "%s is already compressed\n", name));
398
VOID(mi_close(isam_file));
402
puts("Recompressing already compressed table");
403
share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
405
if (! force_pack && share->state.state.records != 0 &&
406
(share->state.state.records <= 1 ||
407
share->state.state.data_file_length < 1024))
409
VOID(fprintf(stderr, "%s is too small to compress\n", name));
410
VOID(mi_close(isam_file));
413
VOID(mi_lock_database(isam_file,F_WRLCK));
418
static bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
423
mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
425
mrg->src_file_has_indexes_disabled= 0;
426
for (i=0; i < count ; i++)
428
if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
431
mrg->src_file_has_indexes_disabled|=
432
! mi_is_all_keys_active(mrg->file[i]->s->state.key_map,
433
mrg->file[i]->s->base.keys);
435
/* Check that files are identical */
436
for (j=0 ; j < count-1 ; j++)
438
MI_COLUMNDEF *m1,*m2,*end;
439
if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
440
mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
442
m1=mrg->file[j]->s->rec;
443
end=m1+mrg->file[j]->s->base.fields;
444
m2=mrg->file[j+1]->s->rec;
445
for ( ; m1 != end ; m1++,m2++)
447
if (m1->type != m2->type || m1->length != m2->length)
455
VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
456
my_progname, names[j], names[j+1]));
459
mi_close(mrg->file[i]);
460
my_free((uchar*) mrg->file,MYF(0));
465
static int compress(PACK_MRG_INFO *mrg,char *result_table)
468
File new_file,join_isam_file;
471
char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
472
uint i,header_length,fields,trees,used_trees;
473
my_off_t old_length,new_length,tot_elements;
474
HUFF_COUNTS *huff_counts;
475
HUFF_TREE *huff_trees;
477
isam_file=mrg->file[0]; /* Take this as an example */
479
new_file=join_isam_file= -1;
484
/* Create temporary or join file */
487
VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2));
489
VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2+4+16));
490
if (!test_only && result_table)
492
/* Make a new indexfile based on first file in list */
495
stpcpy(org_name,result_table); /* Fix error messages */
496
VOID(fn_format(new_name,result_table,"",MI_NAME_IEXT,2));
497
if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
500
length=(uint) share->base.keystart;
501
if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
503
if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
504
my_write(join_isam_file,buff,length,
505
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
507
my_free(buff,MYF(0));
510
my_free(buff,MYF(0));
511
VOID(fn_format(new_name,result_table,"",MI_NAME_DEXT,2));
513
else if (!tmp_dir[0])
514
VOID(make_new_name(new_name,org_name));
516
VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
518
(new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
521
/* Start calculating statistics */
524
for (i=0 ; i < mrg->count ; i++)
525
mrg->records+=mrg->file[i]->s->state.state.records;
527
if (write_loop || verbose)
529
VOID(printf("Compressing %s: (%lu records)\n",
530
result_table ? new_name : org_name, (ulong) mrg->records));
532
trees=fields=share->base.fields;
533
huff_counts=init_huff_count(isam_file,mrg->records);
537
Read the whole data file(s) for statistics.
539
if (write_loop || verbose)
540
VOID(printf("- Calculating statistics\n"));
541
if (get_statistic(mrg,huff_counts))
545
for (i=0; i < mrg->count ; i++)
546
old_length+= (mrg->file[i]->s->state.state.data_file_length -
547
mrg->file[i]->s->state.state.empty);
550
Create a global priority queue in preparation for making
551
temporary Huffman trees.
553
if (init_queue(&queue,256,0,0,compare_huff_elements,0))
557
Check each column if we should use pre-space-compress, end-space-
558
compress, empty-field-compress or zero-field-compress.
560
check_counts(huff_counts,fields,mrg->records);
563
Build a Huffman tree for each column.
565
huff_trees=make_huff_trees(huff_counts,trees);
568
If the packed lengths of combined columns is less then the sum of
569
the non-combined columns, then create common Huffman trees for them.
570
We do this only for byte compressed columns, not for distinct values
573
if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
577
Assign codes to all byte or column values.
579
if (make_huff_decode_table(huff_trees,fields))
582
/* Prepare a file buffer. */
583
init_file_buffer(new_file,0);
586
Reserve space in the target file for the fixed compressed file header.
588
file_buffer.pos_in_file=HEAD_LENGTH;
590
VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
593
Write field infos: field type, pack type, length bits, tree number.
595
write_field_info(huff_counts,fields,used_trees);
600
if (!(tot_elements=write_huff_tree(huff_trees,trees)))
604
Calculate the total length of the compression info header.
605
This includes the fixed compressed file header, the column compression
606
type descriptions, and the decode trees.
608
header_length=(uint) file_buffer.pos_in_file+
609
(uint) (file_buffer.pos-file_buffer.buffer);
612
Compress the source file into the target file.
614
if (write_loop || verbose)
615
VOID(printf("- Compressing file\n"));
616
error=compress_isam_file(mrg,huff_counts);
617
new_length=file_buffer.pos_in_file;
618
if (!error && !test_only)
620
uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
621
memset(buff, 0, sizeof(buff));
622
error=my_write(file_buffer.file,buff,sizeof(buff),
623
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
627
Write the fixed compressed file header.
630
error=write_header(mrg,header_length,used_trees,tot_elements,
633
/* Flush the file buffer. */
636
/* Display statistics. */
637
if (verbose && mrg->records)
638
VOID(printf("Min record length: %6d Max length: %6d "
639
"Mean total length: %6ld\n", mrg->min_pack_length,
640
mrg->max_pack_length, (ulong) (new_length/mrg->records)));
642
/* Close source and target file. */
645
error|=my_close(new_file,MYF(MY_WME));
648
error|=my_close(isam_file->dfile,MYF(MY_WME));
649
isam_file->dfile= -1; /* Tell mi_close file is closed */
654
free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
655
if (! test_only && ! error)
659
error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
665
if (my_rename(org_name,make_old_name(temp_name,isam_file->filename),
671
error=my_copy(new_name,org_name,MYF(MY_WME));
673
error=my_rename(new_name,org_name,MYF(MY_WME));
676
VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
678
VOID(my_delete(new_name,MYF(MY_WME)));
686
error=my_copy(new_name,org_name,
687
MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
689
VOID(my_delete(new_name,MYF(MY_WME)));
692
error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
695
error=save_state(isam_file,mrg,new_length,glob_crc);
698
error|=mrg_close(mrg);
699
if (join_isam_file >= 0)
700
error|=my_close(join_isam_file,MYF(MY_WME));
703
VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
704
VOID(my_delete(new_name,MYF(MY_WME)));
707
if (write_loop || verbose)
710
VOID(printf("%.4g%% \n",
711
(((int64_t) (old_length - new_length)) * 100.0 /
712
(int64_t) old_length)));
714
puts("Empty file saved in compressed format");
719
free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
721
VOID(my_close(new_file,MYF(0)));
722
if (join_isam_file >= 0)
723
VOID(my_close(join_isam_file,MYF(0)));
725
VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
729
/* Init a huff_count-struct for each field and init it */
731
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
734
register HUFF_COUNTS *count;
735
if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
737
MYF(MY_ZEROFILL | MY_WME))))
739
for (i=0 ; i < info->s->base.fields ; i++)
741
enum en_fieldtype type;
742
count[i].field_length=info->s->rec[i].length;
743
type= count[i].field_type= (enum en_fieldtype) info->s->rec[i].type;
744
if (type == FIELD_INTERVALL ||
745
type == FIELD_CONSTANT ||
748
if (count[i].field_length <= 8 &&
749
(type == FIELD_NORMAL ||
750
type == FIELD_SKIP_ZERO))
751
count[i].max_zero_fill= count[i].field_length;
753
For every column initialize a tree, which is used to detect distinct
754
column values. 'int_tree' works together with 'tree_buff' and
755
'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
756
This is accomplished by '-1' as the element size.
758
init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
760
if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
761
count[i].tree_pos=count[i].tree_buff =
762
my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
770
/* Free memory used by counts and trees */
772
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
773
HUFF_COUNTS *huff_counts,
780
for (i=0 ; i < trees ; i++)
782
if (huff_trees[i].element_buffer)
783
my_free((uchar*) huff_trees[i].element_buffer,MYF(0));
784
if (huff_trees[i].code)
785
my_free((uchar*) huff_trees[i].code,MYF(0));
787
my_free((uchar*) huff_trees,MYF(0));
791
for (i=0 ; i < fields ; i++)
793
if (huff_counts[i].tree_buff)
795
my_free((uchar*) huff_counts[i].tree_buff,MYF(0));
796
delete_tree(&huff_counts[i].int_tree);
799
my_free((uchar*) huff_counts,MYF(0));
801
delete_queue(&queue); /* This is safe to free */
805
/* Read through old file and gather some statistics */
807
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
811
ulong reclength,max_blob_length;
812
uchar *record,*pos,*next_pos,*end_pos,*start_pos;
813
ha_rows record_count;
814
bool static_row_size;
815
HUFF_COUNTS *count,*end_count;
816
TREE_ELEMENT *element;
818
reclength=mrg->file[0]->s->base.reclength;
819
record=(uchar*) my_alloca(reclength);
820
end_count=huff_counts+mrg->file[0]->s->base.fields;
821
record_count=0; glob_crc=0;
824
/* Check how to calculate checksum */
826
for (count=huff_counts ; count < end_count ; count++)
828
if (count->field_type == FIELD_BLOB ||
829
count->field_type == FIELD_VARCHAR)
837
while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
839
ulong tot_blob_length=0;
842
/* glob_crc is a checksum over all bytes of all records. */
844
glob_crc+=mi_static_checksum(mrg->file[0],record);
846
glob_crc+=mi_checksum(mrg->file[0],record);
848
/* Count the incidence of values separately for every column. */
849
for (pos=record,count=huff_counts ;
854
next_pos=end_pos=(start_pos=pos)+count->field_length;
857
Put the whole column value in a tree if there is room for it.
858
'int_tree' is used to quickly check for duplicate values.
859
'tree_buff' collects as many distinct column values as
860
possible. If the field length is > 1, it is tree_buff_length,
861
else 2 bytes. Each value is 'field_length' bytes big. If there
862
are more distinct column values than fit into the buffer, we
863
give up with this tree. BLOBs and VARCHARs do not have a
864
tree_buff as it can only be used with fixed length columns.
865
For the special case of field length == 1, we handle only the
866
case that there is only one distinct value in the table(s).
867
Otherwise, we can have a maximum of 256 distinct values. This
868
is then handled by the normal Huffman tree build.
870
Another limit for collecting distinct column values is the
871
number of values itself. Since we would need to build a
872
Huffman tree for the values, we are limited by the 'IS_OFFSET'
873
constant. This constant expresses a bit which is used to
874
determine if a tree element holds a final value or an offset
875
to a child element. Hence, all values and offsets need to be
876
smaller than 'IS_OFFSET'. A tree element is implemented with
877
two integer values, one for the left branch and one for the
878
right branch. For the extreme case that the first element
879
points to the last element, the number of integers in the tree
880
must be less or equal to IS_OFFSET. So the number of elements
881
must be less or equal to IS_OFFSET / 2.
883
WARNING: At first, we insert a pointer into the record buffer
884
as the key for the tree. If we got a new distinct value, which
885
is really inserted into the tree, instead of being counted
886
only, we will copy the column value from the record buffer to
887
'tree_buff' and adjust the key pointer of the tree accordingly.
889
if (count->tree_buff)
892
if (!(element=tree_insert(&count->int_tree,pos, 0,
893
count->int_tree.custom_arg)) ||
894
(element->count == 1 &&
895
(count->tree_buff + tree_buff_length <
896
count->tree_pos + count->field_length)) ||
897
(count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
898
(count->field_length == 1 &&
899
count->int_tree.elements_in_tree > 1))
901
delete_tree(&count->int_tree);
902
my_free(count->tree_buff,MYF(0));
908
If tree_insert() succeeds, it either creates a new element
909
or increments the counter of an existing element.
911
if (element->count == 1)
913
/* Copy the new column value into 'tree_buff'. */
914
memcpy(count->tree_pos, pos, count->field_length);
915
/* Adjust the key pointer in the tree. */
916
tree_set_pointer(element,count->tree_pos);
917
/* Point behind the last column value so far. */
918
count->tree_pos+=count->field_length;
923
/* Save character counters and space-counts and zero-field-counts */
924
if (count->field_type == FIELD_NORMAL ||
925
count->field_type == FIELD_SKIP_ENDSPACE)
927
/* Ignore trailing space. */
928
for ( ; end_pos > pos ; end_pos--)
929
if (end_pos[-1] != ' ')
931
/* Empty fields are just counted. Go to the next record. */
934
count->empty_fields++;
935
count->max_zero_fill=0;
939
Count the total of all trailing spaces and the number of
940
short trailing spaces. Remember the longest trailing space.
942
length= (uint) (next_pos-end_pos);
943
count->tot_end_space+=length;
945
count->end_space[length]++;
946
if (count->max_end_space < length)
947
count->max_end_space = length;
950
if (count->field_type == FIELD_NORMAL ||
951
count->field_type == FIELD_SKIP_PRESPACE)
953
/* Ignore leading space. */
954
for (pos=start_pos; pos < end_pos ; pos++)
957
/* Empty fields are just counted. Go to the next record. */
960
count->empty_fields++;
961
count->max_zero_fill=0;
965
Count the total of all leading spaces and the number of
966
short leading spaces. Remember the longest leading space.
968
length= (uint) (pos-start_pos);
969
count->tot_pre_space+=length;
971
count->pre_space[length]++;
972
if (count->max_pre_space < length)
973
count->max_pre_space = length;
976
/* Calculate pos, end_pos, and max_length for variable length fields. */
977
if (count->field_type == FIELD_BLOB)
979
uint field_length=count->field_length -portable_sizeof_char_ptr;
980
ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
981
memcpy(&pos, start_pos + field_length, sizeof(char*));
982
end_pos=pos+blob_length;
983
tot_blob_length+=blob_length;
984
set_if_bigger(count->max_length,blob_length);
986
else if (count->field_type == FIELD_VARCHAR)
988
uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
989
length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
990
uint2korr(start_pos));
991
pos= start_pos+pack_length;
993
set_if_bigger(count->max_length,length);
996
/* Evaluate 'max_zero_fill' for short fields. */
997
if (count->field_length <= 8 &&
998
(count->field_type == FIELD_NORMAL ||
999
count->field_type == FIELD_SKIP_ZERO))
1002
/* Zero fields are just counted. Go to the next record. */
1003
if (!memcmp(start_pos,zero_string,count->field_length))
1005
count->zero_fields++;
1009
max_zero_fill starts with field_length. It is decreased every
1010
time a shorter "zero trailer" is found. It is set to zero when
1011
an empty field is found (see above). This suggests that the
1012
variable should be called 'min_zero_fill'.
1014
for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1016
if (i < count->max_zero_fill)
1017
count->max_zero_fill=i;
1020
/* Ignore zero fields and check fields. */
1021
if (count->field_type == FIELD_ZERO ||
1022
count->field_type == FIELD_CHECK)
1026
Count the incidence of every byte value in the
1027
significant field value.
1029
for ( ; pos < end_pos ; pos++)
1030
count->counts[(uchar) *pos]++;
1032
/* Step to next field. */
1035
if (tot_blob_length > max_blob_length)
1036
max_blob_length=tot_blob_length;
1038
if (write_loop && record_count % WRITE_COUNT == 0)
1040
VOID(printf("%lu\r", (ulong) record_count));
1041
VOID(fflush(stdout));
1044
else if (error != HA_ERR_RECORD_DELETED)
1046
VOID(fprintf(stderr, "Got error %d while reading rows", error));
1050
/* Step to next record. */
1054
VOID(printf(" \r"));
1055
VOID(fflush(stdout));
1059
VOID(printf("Found the following number of incidents "
1060
"of the byte codes:\n"));
1061
for (count= huff_counts ; count < end_count; count++)
1064
my_off_t total_count;
1068
VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
1069
if (count->tree_buff)
1072
VOID(printf("number of distinct values: %u\n",
1073
(uint) ((count->tree_pos - count->tree_buff) /
1074
count->field_length)));
1077
for (idx= 0; idx < 256; idx++)
1079
if (count->counts[idx])
1081
total_count+= count->counts[idx];
1083
VOID(printf("counts[0x%02x]: %12s\n", idx,
1084
llstr((int64_t) count->counts[idx], llbuf)));
1087
if ((verbose >= 2) && total_count)
1089
VOID(printf("total: %12s\n",
1090
llstr((int64_t) total_count, llbuf)));
1094
mrg->records=record_count;
1095
mrg->max_blob_length=max_blob_length;
1096
my_afree((uchar*) record);
1097
return(error != HA_ERR_END_OF_FILE);
1100
static int compare_huff_elements(void *not_used __attribute__((unused)),
1103
return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1104
(*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1);
1107
/* Check each tree if we should use pre-space-compress, end-space-
1108
compress, empty-field-compress or zero-field-compress */
1110
static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1113
uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1114
my_off_t old_length,new_length,length;
1116
memset(field_count, 0, sizeof(field_count));
1117
space_fields=fill_zero_fields=0;
1119
for (; trees-- ; huff_counts++)
1121
if (huff_counts->field_type == FIELD_BLOB)
1123
huff_counts->length_bits=max_bit(huff_counts->max_length);
1126
else if (huff_counts->field_type == FIELD_VARCHAR)
1128
huff_counts->length_bits=max_bit(huff_counts->max_length);
1131
else if (huff_counts->field_type == FIELD_CHECK)
1133
huff_counts->bytes_packed=0;
1134
huff_counts->counts[0]=0;
1138
huff_counts->field_type=FIELD_NORMAL;
1139
huff_counts->pack_type=0;
1141
/* Check for zero-filled records (in this column), or zero records. */
1142
if (huff_counts->zero_fields || ! records)
1144
my_off_t old_space_count;
1146
If there are only zero filled records (in this column),
1147
or no records at all, we are done.
1149
if (huff_counts->zero_fields == records)
1151
huff_counts->field_type= FIELD_ZERO;
1152
huff_counts->bytes_packed=0;
1153
huff_counts->counts[0]=0;
1156
/* Remeber the number of significant spaces. */
1157
old_space_count=huff_counts->counts[' '];
1158
/* Add all leading and trailing spaces. */
1159
huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1160
huff_counts->tot_pre_space +
1161
huff_counts->empty_fields *
1162
huff_counts->field_length);
1163
/* Check, what the compressed length of this would be. */
1164
old_length=calc_packed_length(huff_counts,0)+records/8;
1165
/* Get the number of zero bytes. */
1166
length=huff_counts->zero_fields*huff_counts->field_length;
1167
/* Add it to the counts. */
1168
huff_counts->counts[0]+=length;
1169
/* Check, what the compressed length of this would be. */
1170
new_length=calc_packed_length(huff_counts,0);
1171
/* If the compression without the zeroes would be shorter, we are done. */
1172
if (old_length < new_length && huff_counts->field_length > 1)
1174
huff_counts->field_type=FIELD_SKIP_ZERO;
1175
huff_counts->counts[0]-=length;
1176
huff_counts->bytes_packed=old_length- records/8;
1179
/* Remove the insignificant spaces, but keep the zeroes. */
1180
huff_counts->counts[' ']=old_space_count;
1182
/* Check, what the compressed length of this column would be. */
1183
huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1186
If there are enough empty records (in this column),
1187
treating them specially may pay off.
1189
if (huff_counts->empty_fields)
1191
if (huff_counts->field_length > 2 &&
1192
huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1193
(1+max_bit(max(huff_counts->max_pre_space,
1194
huff_counts->max_end_space))) <
1195
records * max_bit(huff_counts->field_length))
1197
huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1201
length=huff_counts->empty_fields*huff_counts->field_length;
1202
if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1204
huff_counts->tot_end_space+=length;
1205
huff_counts->max_end_space=huff_counts->field_length;
1206
if (huff_counts->field_length < 8)
1207
huff_counts->end_space[huff_counts->field_length]+=
1208
huff_counts->empty_fields;
1210
if (huff_counts->tot_pre_space)
1212
huff_counts->tot_pre_space+=length;
1213
huff_counts->max_pre_space=huff_counts->field_length;
1214
if (huff_counts->field_length < 8)
1215
huff_counts->pre_space[huff_counts->field_length]+=
1216
huff_counts->empty_fields;
1222
If there are enough trailing spaces (in this column),
1223
treating them specially may pay off.
1225
if (huff_counts->tot_end_space)
1227
huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1228
if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1229
huff_counts->end_space,
1230
huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1232
huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1236
If there are enough leading spaces (in this column),
1237
treating them specially may pay off.
1239
if (huff_counts->tot_pre_space)
1241
if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1242
huff_counts->pre_space,
1243
huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1247
found_pack: /* Found field-packing */
1249
/* Test if we can use zero-fill */
1251
if (huff_counts->max_zero_fill &&
1252
(huff_counts->field_type == FIELD_NORMAL ||
1253
huff_counts->field_type == FIELD_SKIP_ZERO))
1255
huff_counts->counts[0]-=huff_counts->max_zero_fill*
1256
(huff_counts->field_type == FIELD_SKIP_ZERO ?
1257
records - huff_counts->zero_fields : records);
1258
huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1259
huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1262
/* Test if intervall-field is better */
1264
if (huff_counts->tree_buff)
1268
tree.element_buffer=0;
1269
if (!make_huff_tree(&tree,huff_counts) &&
1270
tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1272
if (tree.elements == 1)
1273
huff_counts->field_type=FIELD_CONSTANT;
1275
huff_counts->field_type=FIELD_INTERVALL;
1276
huff_counts->pack_type=0;
1280
my_free((uchar*) huff_counts->tree_buff,MYF(0));
1281
delete_tree(&huff_counts->int_tree);
1282
huff_counts->tree_buff=0;
1284
if (tree.element_buffer)
1285
my_free((uchar*) tree.element_buffer,MYF(0));
1287
if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1289
if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1291
field_count[huff_counts->field_type]++;
1294
VOID(printf("\nnormal: %3d empty-space: %3d "
1295
"empty-zero: %3d empty-fill: %3d\n"
1296
"pre-space: %3d end-space: %3d "
1297
"intervall-fields: %3d zero: %3d\n",
1298
field_count[FIELD_NORMAL],space_fields,
1299
field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1300
field_count[FIELD_SKIP_PRESPACE],
1301
field_count[FIELD_SKIP_ENDSPACE],
1302
field_count[FIELD_INTERVALL],
1303
field_count[FIELD_ZERO]));
1307
/* Test if we can use space-compression and empty-field-compression */
1310
test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1311
uint max_space_length, my_off_t *space_counts,
1312
my_off_t tot_space_count, enum en_fieldtype field_type)
1316
my_off_t space_count,min_space_count,min_pack,new_length,skip;
1318
length_bits=max_bit(max_space_length);
1320
/* Default no end_space-packing */
1321
space_count=huff_counts->counts[(uint) ' '];
1322
min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1323
min_pack=calc_packed_length(huff_counts,0);
1325
huff_counts->counts[(uint) ' ']=space_count;
1327
/* Test with allways space-count */
1328
new_length=huff_counts->bytes_packed+length_bits*records/8;
1329
if (new_length+1 < min_pack)
1332
min_pack=new_length;
1333
min_space_count=space_count;
1335
/* Test with length-flag */
1336
for (skip=0L, i=0 ; i < 8 ; i++)
1338
if (space_counts[i])
1341
huff_counts->counts[(uint) ' ']+=space_counts[i];
1342
skip+=huff_counts->pre_space[i];
1343
new_length=calc_packed_length(huff_counts,0)+
1344
(records+(records-skip)*(1+length_bits))/8;
1345
if (new_length < min_pack)
1348
min_pack=new_length;
1349
min_space_count=huff_counts->counts[(uint) ' '];
1354
huff_counts->counts[(uint) ' ']=min_space_count;
1355
huff_counts->bytes_packed=min_pack;
1358
return(0); /* No space-compress */
1359
case -1: /* Always space-count */
1360
huff_counts->field_type=field_type;
1361
huff_counts->min_space=0;
1362
huff_counts->length_bits=max_bit(max_space_length);
1365
huff_counts->field_type=field_type;
1366
huff_counts->min_space=(uint) min_pos;
1367
huff_counts->pack_type|=PACK_TYPE_SELECTED;
1368
huff_counts->length_bits=max_bit(max_space_length);
1371
return(1); /* Using space-compress */
1375
/* Make a huff_tree of each huff_count */
1377
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1380
HUFF_TREE *huff_tree;
1382
if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1383
MYF(MY_WME | MY_ZEROFILL))))
1386
for (tree=0 ; tree < trees ; tree++)
1388
if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1391
my_free((uchar*) huff_tree[tree].element_buffer,MYF(0));
1392
my_free((uchar*) huff_tree,MYF(0));
1400
Build a Huffman tree.
1404
huff_tree The Huffman tree.
1405
huff_counts The counts.
1408
Build a Huffman tree according to huff_counts->counts or
1409
huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1410
tree_buff_length of distinct column values. In that case, whole
1411
values can be Huffman encoded instead of single bytes.
1418
static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1420
uint i,found,bits_packed,first,last;
1421
my_off_t bytes_packed;
1422
HUFF_ELEMENT *a,*b,*new_huff_el;
1425
if (huff_counts->tree_buff)
1427
/* Calculate the number of distinct values in tree_buff. */
1428
found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1429
huff_counts->field_length;
1430
first=0; last=found-1;
1434
/* Count the number of byte codes found in the column. */
1435
for (i=found=0 ; i < 256 ; i++)
1437
if (huff_counts->counts[i])
1448
/* When using 'tree_buff' we can have more that 256 values. */
1449
if (queue.max_elements < found)
1451
delete_queue(&queue);
1452
if (init_queue(&queue,found,0,0,compare_huff_elements,0))
1456
/* Allocate or reallocate an element buffer for the Huffman tree. */
1457
if (!huff_tree->element_buffer)
1459
if (!(huff_tree->element_buffer=
1460
(HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1467
(HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1468
found*2*sizeof(HUFF_ELEMENT),
1471
huff_tree->element_buffer=temp;
1474
huff_counts->tree=huff_tree;
1475
huff_tree->counts=huff_counts;
1476
huff_tree->min_chr=first;
1477
huff_tree->max_chr=last;
1478
huff_tree->char_bits=max_bit(last-first);
1479
huff_tree->offset_bits=max_bit(found-1)+1;
1481
if (huff_counts->tree_buff)
1483
huff_tree->elements=0;
1484
huff_tree->tree_pack_length=(1+15+16+5+5+
1485
(huff_tree->char_bits+1)*found+
1486
(huff_tree->offset_bits+1)*
1488
(uint) (huff_tree->counts->tree_pos-
1489
huff_tree->counts->tree_buff);
1491
Put a HUFF_ELEMENT into the queue for every distinct column value.
1493
tree_walk() calls save_counts_in_queue() for every element in
1494
'int_tree'. This takes elements from the target trees element
1495
buffer and places references to them into the buffer of the
1496
priority queue. We insert in column value order, but the order is
1497
in fact irrelevant here. We will establish the correct order
1500
tree_walk(&huff_counts->int_tree,
1501
(int (*)(void*, element_count,void*)) save_counts_in_queue,
1502
(uchar*) huff_tree, left_root_right);
1506
huff_tree->elements=found;
1507
huff_tree->tree_pack_length=(9+9+5+5+
1508
(huff_tree->char_bits+1)*found+
1509
(huff_tree->offset_bits+1)*
1512
Put a HUFF_ELEMENT into the queue for every byte code found in the column.
1514
The elements are taken from the target trees element buffer.
1515
Instead of using queue_insert(), we just place references to the
1516
elements into the buffer of the priority queue. We insert in byte
1517
value order, but the order is in fact irrelevant here. We will
1518
establish the correct order later.
1520
for (i=first, found=0 ; i <= last ; i++)
1522
if (huff_counts->counts[i])
1524
new_huff_el=huff_tree->element_buffer+(found++);
1525
new_huff_el->count=huff_counts->counts[i];
1526
new_huff_el->a.leaf.null=0;
1527
new_huff_el->a.leaf.element_nr=i;
1528
queue.root[found]=(uchar*) new_huff_el;
1532
If there is only a single byte value in this field in all records,
1533
add a second element with zero incidence. This is required to enter
1534
the loop, which builds the Huffman tree.
1538
new_huff_el=huff_tree->element_buffer+(found++);
1539
new_huff_el->count=0;
1540
new_huff_el->a.leaf.null=0;
1542
new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1544
new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1545
queue.root[found]=(uchar*) new_huff_el;
1549
/* Make a queue from the queue buffer. */
1550
queue.elements=found;
1553
Make a priority queue from the queue. Construct its index so that we
1554
have a partially ordered tree.
1556
for (i=found/2 ; i > 0 ; i--)
1557
_downheap(&queue,i);
1559
/* The Huffman algorithm. */
1560
bytes_packed=0; bits_packed=0;
1561
for (i=1 ; i < found ; i++)
1564
Pop the top element from the queue (the one with the least incidence).
1565
Popping from a priority queue includes a re-ordering of the queue,
1566
to get the next least incidence element to the top.
1568
a=(HUFF_ELEMENT*) queue_remove(&queue,0);
1570
Copy the next least incidence element. The queue implementation
1571
reserves root[0] for temporary purposes. root[1] is the top.
1573
b=(HUFF_ELEMENT*) queue.root[1];
1574
/* Get a new element from the element buffer. */
1575
new_huff_el=huff_tree->element_buffer+found+i;
1576
/* The new element gets the sum of the two least incidence elements. */
1577
new_huff_el->count=a->count+b->count;
1579
The Huffman algorithm assigns another bit to the code for a byte
1580
every time that bytes incidence is combined (directly or indirectly)
1581
to a new element as one of the two least incidence elements.
1582
This means that one more bit per incidence of that byte is required
1583
in the resulting file. So we add the new combined incidence as the
1584
number of bits by which the result grows.
1586
bits_packed+=(uint) (new_huff_el->count & 7);
1587
bytes_packed+=new_huff_el->count/8;
1588
/* The new element points to its children, lesser in left. */
1589
new_huff_el->a.nod.left=a;
1590
new_huff_el->a.nod.right=b;
1592
Replace the copied top element by the new element and re-order the
1595
queue.root[1]=(uchar*) new_huff_el;
1596
queue_replaced(&queue);
1598
huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1599
huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1603
static int compare_tree(void* cmp_arg __attribute__((unused)),
1604
register const uchar *s, register const uchar *t)
1607
for (length=global_count->field_length; length-- ;)
1609
return (int) s[-1] - (int) t[-1];
1614
Organize distinct column values and their incidences into a priority queue.
1617
save_counts_in_queue()
1618
key The column value.
1619
count The incidence of this value.
1620
tree The Huffman tree to be built later.
1623
We use the element buffer of the targeted tree. The distinct column
1624
values are organized in a priority queue first. The Huffman
1625
algorithm will later organize the elements into a Huffman tree. For
1626
the time being, we just place references to the elements into the
1627
queue buffer. The buffer will later be organized into a priority
1634
static int save_counts_in_queue(uchar *key, element_count count,
1637
HUFF_ELEMENT *new_huff_el;
1639
new_huff_el=tree->element_buffer+(tree->elements++);
1640
new_huff_el->count=count;
1641
new_huff_el->a.leaf.null=0;
1642
new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1643
tree->counts->field_length;
1644
queue.root[tree->elements]=(uchar*) new_huff_el;
1650
Calculate length of file if given counts should be used.
1653
calc_packed_length()
1654
huff_counts The counts for a column of the table(s).
1655
add_tree_lenght If the decode tree length should be added.
1658
We need to follow the Huffman algorithm until we know, how many bits
1659
are required for each byte code. But we do not need the resulting
1660
Huffman tree. Hence, we can leave out some steps which are essential
1661
in make_huff_tree().
1664
Number of bytes required to compress this table column.
1667
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1668
uint add_tree_lenght)
1670
uint i,found,bits_packed,first,last;
1671
my_off_t bytes_packed;
1672
HUFF_ELEMENT element_buffer[256];
1675
WARNING: We use a small hack for efficiency: Instead of placing
1676
references to HUFF_ELEMENTs into the queue, we just insert
1677
references to the counts of the byte codes which appeared in this
1678
table column. During the Huffman algorithm they are successively
1679
replaced by references to HUFF_ELEMENTs. This works, because
1680
HUFF_ELEMENTs have the incidence count at their beginning.
1681
Regardless, wether the queue array contains references to counts of
1682
type my_off_t or references to HUFF_ELEMENTs which have the count of
1683
type my_off_t at their beginning, it always points to a count of the
1686
Instead of using queue_insert(), we just copy the references into
1687
the buffer of the priority queue. We insert in byte value order, but
1688
the order is in fact irrelevant here. We will establish the correct
1692
for (i=found=0 ; i < 256 ; i++)
1694
if (huff_counts->counts[i])
1699
/* We start with root[1], which is the queues top element. */
1700
queue.root[found]=(uchar*) &huff_counts->counts[i];
1704
return(0); /* Empty tree */
1706
If there is only a single byte value in this field in all records,
1707
add a second element with zero incidence. This is required to enter
1708
the loop, which follows the Huffman algorithm.
1711
queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1713
/* Make a queue from the queue buffer. */
1714
queue.elements=found;
1716
bytes_packed=0; bits_packed=0;
1717
/* Add the length of the coding table, which would become part of the file. */
1718
if (add_tree_lenght)
1719
bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1720
(max_bit(found-1)+1+1)*(found-2) +7)/8;
1723
Make a priority queue from the queue. Construct its index so that we
1724
have a partially ordered tree.
1726
for (i=(found+1)/2 ; i > 0 ; i--)
1727
_downheap(&queue,i);
1729
/* The Huffman algorithm. */
1730
for (i=0 ; i < found-1 ; i++)
1734
HUFF_ELEMENT *new_huff_el;
1737
Pop the top element from the queue (the one with the least
1738
incidence). Popping from a priority queue includes a re-ordering
1739
of the queue, to get the next least incidence element to the top.
1741
a= (my_off_t*) queue_remove(&queue, 0);
1743
Copy the next least incidence element. The queue implementation
1744
reserves root[0] for temporary purposes. root[1] is the top.
1746
b= (my_off_t*) queue.root[1];
1747
/* Create a new element in a local (automatic) buffer. */
1748
new_huff_el= element_buffer + i;
1749
/* The new element gets the sum of the two least incidence elements. */
1750
new_huff_el->count= *a + *b;
1752
The Huffman algorithm assigns another bit to the code for a byte
1753
every time that bytes incidence is combined (directly or indirectly)
1754
to a new element as one of the two least incidence elements.
1755
This means that one more bit per incidence of that byte is required
1756
in the resulting file. So we add the new combined incidence as the
1757
number of bits by which the result grows.
1759
bits_packed+=(uint) (new_huff_el->count & 7);
1760
bytes_packed+=new_huff_el->count/8;
1762
Replace the copied top element by the new element and re-order the
1763
queue. This successively replaces the references to counts by
1764
references to HUFF_ELEMENTs.
1766
queue.root[1]=(uchar*) new_huff_el;
1767
queue_replaced(&queue);
1769
return(bytes_packed+(bits_packed+7)/8);
1773
/* Remove trees that don't give any compression */
1775
static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1778
HUFF_COUNTS count,*i,*j,*last_count;
1780
last_count=huff_counts+trees;
1781
for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1783
if (!i->tree->tree_number)
1785
i->tree->tree_number= ++tree_number;
1787
continue; /* Don't join intervall */
1788
for (j=i+1 ; j < last_count ; j++)
1790
if (! j->tree->tree_number && ! j->tree_buff)
1792
for (k=0 ; k < 256 ; k++)
1793
count.counts[k]=i->counts[k]+j->counts[k];
1794
if (calc_packed_length(&count,1) <=
1795
i->tree->bytes_packed + j->tree->bytes_packed+
1796
i->tree->tree_pack_length+j->tree->tree_pack_length+
1799
memcpy(i->counts, count.counts, sizeof(count.counts[0])*256);
1800
my_free((uchar*) j->tree->element_buffer,MYF(0));
1801
j->tree->element_buffer=0;
1803
memcpy(i->counts, count.counts, sizeof(count.counts[0])*256);
1804
if (make_huff_tree(i->tree,i))
1812
VOID(printf("Original trees: %d After join: %d\n", trees, tree_number));
1813
return tree_number; /* Return trees left */
1818
Fill in huff_tree encode tables.
1821
make_huff_decode_table()
1822
huff_tree An array of HUFF_TREE which are to be encoded.
1823
trees The number of HUFF_TREE in the array.
1830
static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1833
for ( ; trees-- ; huff_tree++)
1835
if (huff_tree->tree_number > 0)
1837
elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1838
if (!(huff_tree->code =
1839
(uint64_t*) my_malloc(elements*
1840
(sizeof(uint64_t) + sizeof(uchar)),
1841
MYF(MY_WME | MY_ZEROFILL))))
1843
huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1844
make_traverse_code_tree(huff_tree, huff_tree->root,
1845
8 * sizeof(uint64_t), 0LL);
1852
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1853
HUFF_ELEMENT *element,
1854
uint size, uint64_t code)
1857
if (!element->a.leaf.null)
1859
chr=element->a.leaf.element_nr;
1860
huff_tree->code_len[chr]= (uchar) (8 * sizeof(uint64_t) - size);
1861
huff_tree->code[chr]= (code >> size);
1862
if (huff_tree->height < 8 * sizeof(uint64_t) - size)
1863
huff_tree->height= 8 * sizeof(uint64_t) - size;
1868
make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1869
make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1870
code + (((uint64_t) 1) << size));
1877
Convert a value into binary digits.
1882
length The number of low order bits to convert.
1885
The result string is in static storage. It is reused on every call.
1886
So you cannot use it twice in one expression.
1889
A pointer to a static NUL-terminated string.
1892
static char *bindigits(uint64_t value, uint bits)
1894
static char digits[72];
1898
assert(idx < sizeof(digits));
1900
*(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
1907
Convert a value into hexadecimal digits.
1914
The result string is in static storage. It is reused on every call.
1915
So you cannot use it twice in one expression.
1918
A pointer to a static NUL-terminated string.
1921
static char *hexdigits(uint64_t value)
1923
static char digits[20];
1925
uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
1927
assert(idx < sizeof(digits));
1930
if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
1931
*(ptr - 1)+= 'a' - '9' - 1;
1938
/* Write header to new packed data file */
1940
static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
1941
my_off_t tot_elements,my_off_t filelength)
1943
uchar *buff= (uchar*) file_buffer.pos;
1945
memset(buff, 0, HEAD_LENGTH);
1946
memcpy(buff,myisam_pack_file_magic,4);
1947
int4store(buff+4,head_length);
1948
int4store(buff+8, mrg->min_pack_length);
1949
int4store(buff+12,mrg->max_pack_length);
1950
int4store(buff+16,tot_elements);
1951
int4store(buff+20,intervall_length);
1952
int2store(buff+24,trees);
1953
buff[26]=(char) mrg->ref_length;
1954
/* Save record pointer length */
1955
buff[27]= (uchar) mi_get_pointer_length((uint64_t) filelength,2);
1958
VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
1959
return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
1960
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
1963
/* Write fieldinfo to new packed file */
1965
static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
1968
uint huff_tree_bits;
1969
huff_tree_bits=max_bit(trees ? trees-1 : 0);
1974
VOID(printf("column types:\n"));
1975
VOID(printf("FIELD_NORMAL 0\n"));
1976
VOID(printf("FIELD_SKIP_ENDSPACE 1\n"));
1977
VOID(printf("FIELD_SKIP_PRESPACE 2\n"));
1978
VOID(printf("FIELD_SKIP_ZERO 3\n"));
1979
VOID(printf("FIELD_BLOB 4\n"));
1980
VOID(printf("FIELD_CONSTANT 5\n"));
1981
VOID(printf("FIELD_INTERVALL 6\n"));
1982
VOID(printf("FIELD_ZERO 7\n"));
1983
VOID(printf("FIELD_VARCHAR 8\n"));
1984
VOID(printf("FIELD_CHECK 9\n"));
1986
VOID(printf("pack type as a set of flags:\n"));
1987
VOID(printf("PACK_TYPE_SELECTED 1\n"));
1988
VOID(printf("PACK_TYPE_SPACE_FIELDS 2\n"));
1989
VOID(printf("PACK_TYPE_ZERO_FILL 4\n"));
1992
for (i=0 ; i++ < fields ; counts++)
1994
write_bits((uint64_t) (int) counts->field_type, 5);
1995
write_bits(counts->pack_type,6);
1996
if (counts->pack_type & PACK_TYPE_ZERO_FILL)
1997
write_bits(counts->max_zero_fill,5);
1999
write_bits(counts->length_bits,5);
2000
write_bits((uint64_t) counts->tree->tree_number - 1, huff_tree_bits);
2002
VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
2003
"tree: %2u length: %4u\n", i , counts->field_type,
2004
counts->pack_type, counts->max_zero_fill, counts->length_bits,
2005
counts->tree->tree_number, counts->field_length));
2011
/* Write all huff_trees to new datafile. Return tot count of
2012
elements in all trees
2013
Returns 0 on error */
2015
static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2021
uint *packed_tree,*offset,length;
2024
/* Find the highest number of elements in the trees. */
2025
for (i=length=0 ; i < trees ; i++)
2026
if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2027
length=huff_tree[i].elements;
2029
Allocate a buffer for packing a decode tree. Two numbers per element
2030
(left child and right child).
2032
if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2034
my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2042
for (elements=0; trees-- ; huff_tree++)
2044
/* Skip columns that have been joined with other columns. */
2045
if (huff_tree->tree_number == 0)
2046
continue; /* Deleted tree */
2050
/* Count the total number of elements (byte codes or column values). */
2051
elements+=huff_tree->elements;
2052
huff_tree->max_offset=2;
2053
/* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2054
if (huff_tree->elements <= 1)
2057
offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2059
/* This should be the same as 'length' above. */
2060
huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2063
Since we check this during collecting the distinct column values,
2064
this should never happen.
2066
if (huff_tree->max_offset >= IS_OFFSET)
2067
{ /* This should be impossible */
2068
VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2069
huff_tree->max_offset));
2070
my_afree((uchar*) packed_tree);
2074
if (!huff_tree->counts->tree_buff)
2076
/* We do a byte compression on this column. Mark with bit 0. */
2078
write_bits(huff_tree->min_chr,8);
2079
write_bits(huff_tree->elements,9);
2080
write_bits(huff_tree->char_bits,5);
2081
write_bits(huff_tree->offset_bits,5);
2086
int_length=(uint) (huff_tree->counts->tree_pos -
2087
huff_tree->counts->tree_buff);
2088
/* We have distinct column values for this column. Mark with bit 1. */
2090
write_bits(huff_tree->elements,15);
2091
write_bits(int_length,16);
2092
write_bits(huff_tree->char_bits,5);
2093
write_bits(huff_tree->offset_bits,5);
2094
intervall_length+=int_length;
2097
VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
2098
"%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
2099
huff_tree->char_bits, huff_tree->offset_bits,
2100
huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2101
huff_tree->counts->tree_buff ? int_length :
2102
huff_tree->min_chr, huff_tree->height));
2104
/* Check that the code tree length matches the element count. */
2105
length=(uint) (offset-packed_tree);
2106
if (length != huff_tree->elements*2-2)
2108
VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2109
length, huff_tree->elements * 2 - 2));
2114
for (i=0 ; i < length ; i++)
2116
if (packed_tree[i] & IS_OFFSET)
2117
write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2118
huff_tree->offset_bits+1);
2120
write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2122
VOID(printf("tree[0x%04x]: %s0x%04x\n",
2123
i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2124
(packed_tree[i] & IS_OFFSET) ?
2125
packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2130
Display coding tables and check their correctness.
2132
codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2133
for (i= 0; i < codes; i++)
2140
if (! (len= huff_tree->code_len[i]))
2143
VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
2144
hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2145
bindigits(huff_tree->code[i], huff_tree->code_len[i])));
2147
/* Check that the encode table decodes correctly. */
2155
VOID(fflush(stdout));
2156
VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2157
hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
2162
code|= (huff_tree->code[i] >> (--len)) & 1;
2164
if (bits > 8 * sizeof(code))
2166
VOID(fflush(stdout));
2167
VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2168
bits, (uint) (8 * sizeof(code))));
2172
idx+= (uint) code & 1;
2175
VOID(fflush(stdout));
2176
VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2181
if (packed_tree[idx] & IS_OFFSET)
2182
idx+= packed_tree[idx] & ~IS_OFFSET;
2184
break; /* Hit a leaf. This contains the result value. */
2189
if (packed_tree[idx] != i)
2191
VOID(fflush(stdout));
2192
VOID(fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n",
2193
packed_tree[idx], i));
2197
} /*end for (codes)*/
2201
/* Write column values in case of distinct column value compression. */
2202
if (huff_tree->counts->tree_buff)
2204
for (i=0 ; i < int_length ; i++)
2206
write_bits((uint64_t) (uchar) huff_tree->counts->tree_buff[i], 8);
2208
VOID(printf("column_values[0x%04x]: 0x%02x\n",
2209
i, (uchar) huff_tree->counts->tree_buff[i]));
2216
my_afree((uchar*) packed_tree);
2219
VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
2226
static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2231
prev_offset= offset;
2233
'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2234
then there is no left child and, hence no right child either. This
2235
is a property of a binary tree. An element is either a node with two
2236
childs, or a leaf without childs.
2238
The current element is always a node with two childs. Go left first.
2240
if (!element->a.nod.left->a.leaf.null)
2242
/* Store the byte code or the index of the column value. */
2243
prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2249
Recursively traverse the tree to the left. Mark it as an offset to
2250
another tree node (in contrast to a byte code or column value index).
2252
prev_offset[0]= IS_OFFSET+2;
2253
offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2256
/* Now, check the right child. */
2257
if (!element->a.nod.right->a.leaf.null)
2259
/* Store the byte code or the index of the column value. */
2260
prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2266
Recursively traverse the tree to the right. Mark it as an offset to
2267
another tree node (in contrast to a byte code or column value index).
2269
uint temp=(uint) (offset-prev_offset-1);
2270
prev_offset[1]= IS_OFFSET+ temp;
2271
if (huff_tree->max_offset < temp)
2272
huff_tree->max_offset = temp;
2273
return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2277
/* Get number of bits neaded to represent value */
2279
static uint max_bit(register uint value)
2281
register uint power=1;
2289
static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2292
uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length,
2293
intervall,field_length,max_pack_length,pack_blob_length;
2294
my_off_t record_count;
2296
ulong length,pack_length;
2297
uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2298
HUFF_COUNTS *count,*end_count;
2300
MI_INFO *isam_file=mrg->file[0];
2301
uint pack_version= (uint) isam_file->s->pack.version;
2303
/* Allocate a buffer for the records (excluding blobs). */
2304
if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2307
end_count=huff_counts+isam_file->s->base.fields;
2308
min_record_length= (uint) ~0;
2309
max_record_length=0;
2312
Calculate the maximum number of bits required to pack the records.
2313
Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2314
The tree height determines the maximum number of bits per value.
2315
Some fields skip leading or trailing spaces or zeroes. The skipped
2316
number of bytes is encoded by 'length_bits' bits.
2317
Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2318
and varchar get a leading 0 bit.
2320
for (i=max_calc_length=0 ; i < isam_file->s->base.fields ; i++)
2322
if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2323
huff_counts[i].max_zero_fill=0;
2324
if (huff_counts[i].field_type == FIELD_CONSTANT ||
2325
huff_counts[i].field_type == FIELD_ZERO ||
2326
huff_counts[i].field_type == FIELD_CHECK)
2328
if (huff_counts[i].field_type == FIELD_INTERVALL)
2329
max_calc_length+=huff_counts[i].tree->height;
2330
else if (huff_counts[i].field_type == FIELD_BLOB ||
2331
huff_counts[i].field_type == FIELD_VARCHAR)
2332
max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2335
(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2336
huff_counts[i].tree->height+huff_counts[i].length_bits;
2338
max_calc_length= (max_calc_length + 7) / 8;
2339
pack_ref_length= calc_pack_length(pack_version, max_calc_length);
2341
/* 'max_blob_length' is the max length of all blobs of a record. */
2342
pack_blob_length= isam_file->s->base.blobs ?
2343
calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2344
max_pack_length=pack_ref_length+pack_blob_length;
2347
while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2349
ulong tot_blob_length=0;
2352
if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
2354
record_pos= (uchar*) file_buffer.pos;
2355
file_buffer.pos+=max_pack_length;
2356
for (start_pos=record, count= huff_counts; count < end_count ; count++)
2358
end_pos=start_pos+(field_length=count->field_length);
2361
/* Check if the column contains spaces only. */
2362
if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2364
for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2373
end_pos-=count->max_zero_fill;
2374
field_length-=count->max_zero_fill;
2376
switch (count->field_type) {
2377
case FIELD_SKIP_ZERO:
2378
if (!memcmp(start_pos,zero_string,field_length))
2387
for ( ; start_pos < end_pos ; start_pos++)
2389
write_bits(tree->code[(uchar) *start_pos],
2390
(uint) tree->code_len[(uchar) *start_pos]);
2393
case FIELD_SKIP_ENDSPACE:
2394
for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2395
length= (ulong) (end_pos - pos);
2396
if (count->pack_type & PACK_TYPE_SELECTED)
2398
if (length > count->min_space)
2401
write_bits(length,count->length_bits);
2411
write_bits(length,count->length_bits);
2413
/* Encode all significant bytes. */
2414
for ( ; start_pos < pos ; start_pos++)
2416
write_bits(tree->code[(uchar) *start_pos],
2417
(uint) tree->code_len[(uchar) *start_pos]);
2421
case FIELD_SKIP_PRESPACE:
2422
for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2423
length= (ulong) (pos - start_pos);
2424
if (count->pack_type & PACK_TYPE_SELECTED)
2426
if (length > count->min_space)
2429
write_bits(length,count->length_bits);
2439
write_bits(length,count->length_bits);
2441
/* Encode all significant bytes. */
2442
for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2444
write_bits(tree->code[(uchar) *start_pos],
2445
(uint) tree->code_len[(uchar) *start_pos]);
2448
case FIELD_CONSTANT:
2453
case FIELD_INTERVALL:
2455
pos=(uchar*) tree_search(&count->int_tree, start_pos,
2456
count->int_tree.custom_arg);
2457
intervall=(uint) (pos - count->tree_buff)/field_length;
2458
write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2463
ulong blob_length=_mi_calc_blob_length(field_length-
2464
portable_sizeof_char_ptr,
2466
/* Empty blobs are encoded with a single 1 bit. */
2473
uchar *blob,*blob_end;
2475
/* Write the blob length. */
2476
write_bits(blob_length,count->length_bits);
2477
memcpy(&blob, end_pos - portable_sizeof_char_ptr, sizeof(char*));
2478
blob_end=blob+blob_length;
2479
/* Encode the blob bytes. */
2480
for ( ; blob < blob_end ; blob++)
2482
write_bits(tree->code[(uchar) *blob],
2483
(uint) tree->code_len[(uchar) *blob]);
2485
tot_blob_length+=blob_length;
2492
uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2493
ulong col_length= (var_pack_length == 1 ?
2494
(uint) *(uchar*) start_pos :
2495
uint2korr(start_pos));
2496
/* Empty varchar are encoded with a single 1 bit. */
2499
write_bits(1,1); /* Empty varchar */
2503
uchar *end= start_pos + var_pack_length + col_length;
2505
/* Write the varchar length. */
2506
write_bits(col_length,count->length_bits);
2507
/* Encode the varchar bytes. */
2508
for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2510
write_bits(tree->code[(uchar) *start_pos],
2511
(uint) tree->code_len[(uchar) *start_pos]);
2518
case FIELD_enum_val_count:
2519
abort(); /* Impossible */
2521
start_pos+=count->max_zero_fill;
2524
length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
2525
pack_length= save_pack_length(pack_version, record_pos, length);
2526
if (pack_blob_length)
2527
pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2529
/* Correct file buffer if the header was smaller */
2530
if (pack_length != max_pack_length)
2532
memcpy(record_pos+pack_length,record_pos+max_pack_length,length);
2533
file_buffer.pos-= (max_pack_length-pack_length);
2535
if (length < (ulong) min_record_length)
2536
min_record_length=(uint) length;
2537
if (length > (ulong) max_record_length)
2538
max_record_length=(uint) length;
2540
if (write_loop && record_count % WRITE_COUNT == 0)
2542
VOID(printf("%lu\r", (ulong) record_count));
2543
VOID(fflush(stdout));
2546
else if (error != HA_ERR_RECORD_DELETED)
2549
if (error == HA_ERR_END_OF_FILE)
2553
VOID(fprintf(stderr, "%s: Got error %d reading records\n",
2554
my_progname, error));
2557
VOID(printf("wrote %s records.\n", llstr((int64_t) record_count, llbuf)));
2559
my_afree((uchar*) record);
2560
mrg->ref_length=max_pack_length;
2561
mrg->min_pack_length=max_record_length ? min_record_length : 0;
2562
mrg->max_pack_length=max_record_length;
2563
return(error || error_on_write || flush_buffer(~(ulong) 0));
2567
static char *make_new_name(char *new_name, char *old_name)
2569
return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2572
static char *make_old_name(char *new_name, char *old_name)
2574
return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2577
/* rutines for bit writing buffer */
2579
static void init_file_buffer(File file, bool read_buffer)
2581
file_buffer.file=file;
2582
file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2584
file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2585
file_buffer.pos_in_file=0;
2590
file_buffer.pos=file_buffer.end;
2595
file_buffer.pos=file_buffer.buffer;
2596
file_buffer.bits=BITS_SAVED;
2598
file_buffer.bitbucket= 0;
2602
static int flush_buffer(ulong neaded_length)
2607
file_buffer.end is 8 bytes lower than the real end of the buffer.
2608
This is done so that the end-of-buffer condition does not need to be
2609
checked for every byte (see write_bits()). Consequently,
2610
file_buffer.pos can become greater than file_buffer.end. The
2611
algorithms in the other functions ensure that there will never be
2612
more than 8 bytes written to the buffer without an end-of-buffer
2613
check. So the buffer cannot be overrun. But we need to check for the
2614
near-to-buffer-end condition to avoid a negative result, which is
2615
casted to unsigned and thus becomes giant.
2617
if ((file_buffer.pos < file_buffer.end) &&
2618
((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2620
length=(ulong) (file_buffer.pos-file_buffer.buffer);
2621
file_buffer.pos=file_buffer.buffer;
2622
file_buffer.pos_in_file+=length;
2625
if (error_on_write|| my_write(file_buffer.file,
2626
(const uchar*) file_buffer.buffer,
2628
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2634
if (neaded_length != ~(ulong) 0 &&
2635
(ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2638
neaded_length+=256; /* some margin */
2639
tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
2642
file_buffer.pos= ((uchar*) tmp +
2643
(ulong) (file_buffer.pos - file_buffer.buffer));
2644
file_buffer.buffer= (uchar*) tmp;
2645
file_buffer.end= (uchar*) (tmp+neaded_length-8);
2651
static void end_file_buffer(void)
2653
my_free((uchar*) file_buffer.buffer,MYF(0));
2656
/* output `bits` low bits of `value' */
2658
static void write_bits(register uint64_t value, register uint bits)
2660
assert(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2661
(bits == 8 * sizeof(value)));
2663
if ((file_buffer.bits-= (int) bits) >= 0)
2665
file_buffer.bitbucket|= value << file_buffer.bits;
2669
register uint64_t bit_buffer;
2670
bits= (uint) -file_buffer.bits;
2671
bit_buffer= (file_buffer.bitbucket |
2672
((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2673
#if BITS_SAVED == 64
2674
*file_buffer.pos++= (uchar) (bit_buffer >> 56);
2675
*file_buffer.pos++= (uchar) (bit_buffer >> 48);
2676
*file_buffer.pos++= (uchar) (bit_buffer >> 40);
2677
*file_buffer.pos++= (uchar) (bit_buffer >> 32);
2679
*file_buffer.pos++= (uchar) (bit_buffer >> 24);
2680
*file_buffer.pos++= (uchar) (bit_buffer >> 16);
2681
*file_buffer.pos++= (uchar) (bit_buffer >> 8);
2682
*file_buffer.pos++= (uchar) (bit_buffer);
2684
if (bits != 8 * sizeof(value))
2685
value&= (((uint64_t) 1) << bits) - 1;
2686
if (file_buffer.pos >= file_buffer.end)
2687
VOID(flush_buffer(~ (ulong) 0));
2688
file_buffer.bits=(int) (BITS_SAVED - bits);
2689
file_buffer.bitbucket= value << (BITS_SAVED - bits);
2694
/* Flush bits in bit_buffer to buffer */
2696
static void flush_bits(void)
2699
uint64_t bit_buffer;
2701
bits= file_buffer.bits & ~7;
2702
bit_buffer= file_buffer.bitbucket >> bits;
2703
bits= BITS_SAVED - bits;
2707
*file_buffer.pos++= (uchar) (bit_buffer >> bits);
2709
if (file_buffer.pos >= file_buffer.end)
2710
VOID(flush_buffer(~ (ulong) 0));
2711
file_buffer.bits= BITS_SAVED;
2712
file_buffer.bitbucket= 0;
2716
/****************************************************************************
2717
** functions to handle the joined files
2718
****************************************************************************/
2720
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
2723
MYISAM_SHARE *share=isam_file->s;
2724
uint options=mi_uint2korr(share->state.header.options);
2727
options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2728
mi_int2store(share->state.header.options,options);
2730
share->state.state.data_file_length=new_length;
2731
share->state.state.del=0;
2732
share->state.state.empty=0;
2733
share->state.dellink= HA_OFFSET_ERROR;
2734
share->state.split=(ha_rows) mrg->records;
2735
share->state.version=(ulong) time((time_t*) 0);
2736
if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
2739
Some indexes are disabled, cannot use current key_file_length value
2740
as an estimate of upper bound of index file size. Use packed data file
2743
share->state.state.key_file_length= new_length;
2746
If there are no disabled indexes, keep key_file_length value from
2747
original file so "myisamchk -rq" can use this value (this is necessary
2748
because index size cannot be easily calculated for fulltext keys)
2750
mi_clear_all_keys_active(share->state.key_map);
2751
for (key=0 ; key < share->base.keys ; key++)
2752
share->state.key_root[key]= HA_OFFSET_ERROR;
2753
for (key=0 ; key < share->state.header.max_block_size_index ; key++)
2754
share->state.key_del[key]= HA_OFFSET_ERROR;
2755
isam_file->state->checksum=crc; /* Save crc here */
2756
share->changed=1; /* Force write of header */
2757
share->state.open_count=0;
2758
share->global_changed=0;
2759
(void)ftruncate(share->kfile, share->base.keystart);
2760
if (share->base.keys)
2762
return(mi_state_info_write(share->kfile,&share->state,1+2));
2766
static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
2769
MI_STATE_INFO state;
2770
MI_INFO *isam_file=mrg->file[0];
2773
state= isam_file->s->state;
2774
options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
2775
HA_OPTION_READ_ONLY_DATA);
2776
mi_int2store(state.header.options,options);
2777
state.state.data_file_length=new_length;
2779
state.state.empty=0;
2780
state.state.records=state.split=(ha_rows) mrg->records;
2781
/* See comment above in save_state about key_file_length handling. */
2782
if (mrg->src_file_has_indexes_disabled)
2784
isam_file->s->state.state.key_file_length=
2785
max(isam_file->s->state.state.key_file_length, new_length);
2787
state.dellink= HA_OFFSET_ERROR;
2788
state.version=(ulong) time((time_t*) 0);
2789
mi_clear_all_keys_active(state.key_map);
2790
state.state.checksum=crc;
2791
if (isam_file->s->base.keys)
2793
state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
2794
return (mi_state_info_write(file,&state,1+2));
2798
/* reset for mrg_rrnd */
2800
static void mrg_reset(PACK_MRG_INFO *mrg)
2804
mi_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
2809
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
2817
isam_info= *(info->current=info->file);
2818
info->end=info->current+info->count;
2819
mi_reset(isam_info);
2820
mi_extra(isam_info, HA_EXTRA_CACHE, 0);
2821
filepos=isam_info->s->pack.header_length;
2825
isam_info= *info->current;
2826
filepos= isam_info->nextpos;
2831
isam_info->update&= HA_STATE_CHANGED;
2832
if (!(error=(*isam_info->s->read_rnd)(isam_info,(uchar*) buf,
2834
error != HA_ERR_END_OF_FILE)
2836
mi_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
2837
if (info->current+1 == info->end)
2838
return(HA_ERR_END_OF_FILE);
2840
isam_info= *info->current;
2841
filepos=isam_info->s->pack.header_length;
2842
mi_reset(isam_info);
2843
mi_extra(isam_info,HA_EXTRA_CACHE, 0);
2848
static int mrg_close(PACK_MRG_INFO *mrg)
2852
for (i=0 ; i < mrg->count ; i++)
2853
error|=mi_close(mrg->file[i]);
2855
my_free((uchar*) mrg->file,MYF(0));