1
/* Copyright (C) 2000-2006 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16
/* Pack MyISAM file */
19
#define USE_MY_FUNC /* We need at least my_malloc */
22
#include "myisamdef.h"
25
#include "mysys_err.h"
26
#ifndef __GNU_LIBRARY__
27
#define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
29
#include <my_getopt.h>
32
#if SIZEOF_LONG_LONG > 4
38
#define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
39
#define HEAD_LENGTH 32
40
#define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
42
#define DATA_TMP_EXT ".TMD"
43
#define OLD_EXT ".OLD"
44
#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
46
struct st_file_buffer {
48
uchar *buffer,*pos,*end;
55
struct st_huff_element;
57
typedef struct st_huff_counts {
58
uint field_length,max_zero_fill;
60
uint max_end_space,max_pre_space,length_bits,min_space;
62
enum en_fieldtype field_type;
63
struct st_huff_tree *tree; /* Tree for field */
65
my_off_t end_space[8];
66
my_off_t pre_space[8];
67
my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
68
TREE int_tree; /* Tree for detecting distinct column values. */
69
uchar *tree_buff; /* Column values, 'field_length' each. */
70
uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */
73
typedef struct st_huff_element HUFF_ELEMENT;
76
WARNING: It is crucial for the optimizations in calc_packed_length()
77
that 'count' is the first element of 'HUFF_ELEMENT'.
79
struct st_huff_element {
83
HUFF_ELEMENT *left,*right;
87
uint element_nr; /* Number of element */
93
typedef struct st_huff_tree {
94
HUFF_ELEMENT *root,*element_buffer;
98
my_off_t bytes_packed;
99
uint tree_pack_length;
100
uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
106
typedef struct st_isam_mrg {
107
MI_INFO **file,**current,**end;
110
uint min_pack_length; /* Theese is used by packed data */
111
uint max_pack_length;
113
uint max_blob_length;
115
/* true if at least one source file has at least one disabled index */
116
my_bool src_file_has_indexes_disabled;
120
extern int main(int argc,char * *argv);
121
static void get_options(int *argc,char ***argv);
122
static MI_INFO *open_isam_file(char *name,int mode);
123
static my_bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
124
static int compress(PACK_MRG_INFO *file,char *join_name);
125
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records);
126
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
128
HUFF_COUNTS *huff_counts,
130
static int compare_tree(void* cmp_arg __attribute__((unused)),
131
const uchar *s,const uchar *t);
132
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
133
static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
135
static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
136
uint max_space_length,my_off_t *space_counts,
137
my_off_t tot_space_count,
138
enum en_fieldtype field_type);
139
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
140
static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
141
static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
142
static int save_counts_in_queue(uchar *key,element_count count,
144
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
145
static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
146
static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
147
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
148
HUFF_ELEMENT *element,uint size,
150
static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
151
my_off_t tot_elements,my_off_t filelength);
152
static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
153
static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
154
static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
155
HUFF_ELEMENT *element,
157
static uint max_bit(uint value);
158
static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
159
static char *make_new_name(char *new_name,char *old_name);
160
static char *make_old_name(char *new_name,char *old_name);
161
static void init_file_buffer(File file,bool read_buffer);
162
static int flush_buffer(ulong neaded_length);
163
static void end_file_buffer(void);
164
static void write_bits(uint64_t value, uint bits);
165
static void flush_bits(void);
166
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
168
static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
170
static int mrg_close(PACK_MRG_INFO *mrg);
171
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
172
static void mrg_reset(PACK_MRG_INFO *mrg);
174
static int error_on_write=0,test_only=0,verbose=0,silent=0,
175
write_loop=0,force_pack=0, isamchk_neaded=0;
176
static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
177
static my_bool backup, opt_wait;
179
tree_buff_length is somewhat arbitrary. The bigger it is the better
180
the chance to win in terms of compression factor. On the other hand,
181
this table becomes part of the compressed file header. And its length
182
is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
184
static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
185
static char tmp_dir[FN_REFLEN]={0},*join_table;
186
static my_off_t intervall_length;
187
static ha_checksum glob_crc;
188
static struct st_file_buffer file_buffer;
190
static HUFF_COUNTS *global_count;
191
static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
192
static const char *load_default_groups[]= { "myisampack",0 };
194
/* The main program */
196
int main(int argc, char **argv)
203
load_defaults("my",load_default_groups,&argc,&argv);
205
get_options(&argc,&argv);
207
error=ok=isamchk_neaded=0;
209
{ /* Join files into one */
210
if (open_isam_files(&merge,argv,(uint) argc) ||
211
compress(&merge,join_table))
217
if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
221
merge.file= &isam_file;
225
if (compress(&merge,0))
231
if (ok && isamchk_neaded && !silent)
232
puts("Remember to run myisamchk -rq on compressed tables");
233
VOID(fflush(stdout));
234
VOID(fflush(stderr));
235
free_defaults(default_argv);
236
my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
239
return 0; /* No compiler warning */
243
enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
245
static struct my_option my_long_options[] =
248
{"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
249
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
251
{"backup", 'b', "Make a backup of the table as table_name.OLD.",
252
(char**) &backup, (char**) &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
253
{"character-sets-dir", OPT_CHARSETS_DIR_MP,
254
"Directory where character sets are.", (char**) &charsets_dir,
255
(char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
256
{"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
257
0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
259
"Force packing of table even if it gets bigger or if tempfile exists.",
260
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
262
"Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
263
(char**) &join_table, (char**) &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
265
{"help", '?', "Display this help and exit.",
266
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
267
{"silent", 's', "Be more silent.",
268
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
269
{"tmpdir", 'T', "Use temporary directory to store temporary table.",
270
0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
271
{"test", 't', "Don't pack table, only test packing it.",
272
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
273
{"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
274
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
275
{"version", 'V', "Output version information and exit.",
276
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
277
{"wait", 'w', "Wait and retry if table is in use.", (char**) &opt_wait,
278
(char**) &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
279
{ 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
282
#include <help_start.h>
284
static void print_version(void)
286
VOID(printf("%s Ver 1.23 for %s on %s\n",
287
my_progname, SYSTEM_TYPE, MACHINE_TYPE));
291
static void usage(void)
294
puts("Copyright (C) 2002 MySQL AB");
295
puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
296
puts("and you are welcome to modify and redistribute it under the GPL license\n");
298
puts("Pack a MyISAM-table to take much less space.");
299
puts("Keys are not updated, you must run myisamchk -rq on the datafile");
300
puts("afterwards to update the keys.");
301
puts("You should give the .MYI file as the filename argument.");
303
VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
304
my_print_help(my_long_options);
305
print_defaults("my", load_default_groups);
306
my_print_variables(my_long_options);
309
#include <help_end.h>
312
get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
320
setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
325
tmpfile_createflag= O_RDWR | O_TRUNC;
328
write_loop= verbose= 0;
333
/* Avoid to reset 'verbose' if it was already set > 1. */
338
length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
339
if (length != dirname_length(tmp_dir))
341
tmp_dir[length]=FN_LIBCHAR;
346
verbose++; /* Allow for selecting the level of verbosity. */
361
/* Initiates DEBUG - but no debugging here ! */
363
static void get_options(int *argc,char ***argv)
367
my_progname= argv[0][0];
368
if (isatty(fileno(stdout)))
371
if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
381
backup=0; /* Not needed */
388
static MI_INFO *open_isam_file(char *name,int mode)
393
if (!(isam_file=mi_open(name,mode,
394
(opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
395
HA_OPEN_ABORT_IF_LOCKED))))
397
VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
401
if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
405
VOID(fprintf(stderr, "%s is already compressed\n", name));
406
VOID(mi_close(isam_file));
410
puts("Recompressing already compressed table");
411
share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
413
if (! force_pack && share->state.state.records != 0 &&
414
(share->state.state.records <= 1 ||
415
share->state.state.data_file_length < 1024))
417
VOID(fprintf(stderr, "%s is too small to compress\n", name));
418
VOID(mi_close(isam_file));
421
VOID(mi_lock_database(isam_file,F_WRLCK));
426
static my_bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
431
mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
433
mrg->src_file_has_indexes_disabled= 0;
434
for (i=0; i < count ; i++)
436
if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
439
mrg->src_file_has_indexes_disabled|=
440
! mi_is_all_keys_active(mrg->file[i]->s->state.key_map,
441
mrg->file[i]->s->base.keys);
443
/* Check that files are identical */
444
for (j=0 ; j < count-1 ; j++)
446
MI_COLUMNDEF *m1,*m2,*end;
447
if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
448
mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
450
m1=mrg->file[j]->s->rec;
451
end=m1+mrg->file[j]->s->base.fields;
452
m2=mrg->file[j+1]->s->rec;
453
for ( ; m1 != end ; m1++,m2++)
455
if (m1->type != m2->type || m1->length != m2->length)
463
VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
464
my_progname, names[j], names[j+1]));
467
mi_close(mrg->file[i]);
468
my_free((uchar*) mrg->file,MYF(0));
473
static int compress(PACK_MRG_INFO *mrg,char *result_table)
476
File new_file,join_isam_file;
479
char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
480
uint i,header_length,fields,trees,used_trees;
481
my_off_t old_length,new_length,tot_elements;
482
HUFF_COUNTS *huff_counts;
483
HUFF_TREE *huff_trees;
485
isam_file=mrg->file[0]; /* Take this as an example */
487
new_file=join_isam_file= -1;
492
/* Create temporary or join file */
495
VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2));
497
VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2+4+16));
498
if (!test_only && result_table)
500
/* Make a new indexfile based on first file in list */
503
strmov(org_name,result_table); /* Fix error messages */
504
VOID(fn_format(new_name,result_table,"",MI_NAME_IEXT,2));
505
if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
508
length=(uint) share->base.keystart;
509
if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
511
if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
512
my_write(join_isam_file,buff,length,
513
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
515
my_free(buff,MYF(0));
518
my_free(buff,MYF(0));
519
VOID(fn_format(new_name,result_table,"",MI_NAME_DEXT,2));
521
else if (!tmp_dir[0])
522
VOID(make_new_name(new_name,org_name));
524
VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
526
(new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
529
/* Start calculating statistics */
532
for (i=0 ; i < mrg->count ; i++)
533
mrg->records+=mrg->file[i]->s->state.state.records;
535
if (write_loop || verbose)
537
VOID(printf("Compressing %s: (%lu records)\n",
538
result_table ? new_name : org_name, (ulong) mrg->records));
540
trees=fields=share->base.fields;
541
huff_counts=init_huff_count(isam_file,mrg->records);
545
Read the whole data file(s) for statistics.
547
if (write_loop || verbose)
548
VOID(printf("- Calculating statistics\n"));
549
if (get_statistic(mrg,huff_counts))
553
for (i=0; i < mrg->count ; i++)
554
old_length+= (mrg->file[i]->s->state.state.data_file_length -
555
mrg->file[i]->s->state.state.empty);
558
Create a global priority queue in preparation for making
559
temporary Huffman trees.
561
if (init_queue(&queue,256,0,0,compare_huff_elements,0))
565
Check each column if we should use pre-space-compress, end-space-
566
compress, empty-field-compress or zero-field-compress.
568
check_counts(huff_counts,fields,mrg->records);
571
Build a Huffman tree for each column.
573
huff_trees=make_huff_trees(huff_counts,trees);
576
If the packed lengths of combined columns is less then the sum of
577
the non-combined columns, then create common Huffman trees for them.
578
We do this only for byte compressed columns, not for distinct values
581
if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
585
Assign codes to all byte or column values.
587
if (make_huff_decode_table(huff_trees,fields))
590
/* Prepare a file buffer. */
591
init_file_buffer(new_file,0);
594
Reserve space in the target file for the fixed compressed file header.
596
file_buffer.pos_in_file=HEAD_LENGTH;
598
VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
601
Write field infos: field type, pack type, length bits, tree number.
603
write_field_info(huff_counts,fields,used_trees);
608
if (!(tot_elements=write_huff_tree(huff_trees,trees)))
612
Calculate the total length of the compression info header.
613
This includes the fixed compressed file header, the column compression
614
type descriptions, and the decode trees.
616
header_length=(uint) file_buffer.pos_in_file+
617
(uint) (file_buffer.pos-file_buffer.buffer);
620
Compress the source file into the target file.
622
if (write_loop || verbose)
623
VOID(printf("- Compressing file\n"));
624
error=compress_isam_file(mrg,huff_counts);
625
new_length=file_buffer.pos_in_file;
626
if (!error && !test_only)
628
uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
629
bzero(buff,sizeof(buff));
630
error=my_write(file_buffer.file,buff,sizeof(buff),
631
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
635
Write the fixed compressed file header.
638
error=write_header(mrg,header_length,used_trees,tot_elements,
641
/* Flush the file buffer. */
644
/* Display statistics. */
645
if (verbose && mrg->records)
646
VOID(printf("Min record length: %6d Max length: %6d "
647
"Mean total length: %6ld\n", mrg->min_pack_length,
648
mrg->max_pack_length, (ulong) (new_length/mrg->records)));
650
/* Close source and target file. */
653
error|=my_close(new_file,MYF(MY_WME));
656
error|=my_close(isam_file->dfile,MYF(MY_WME));
657
isam_file->dfile= -1; /* Tell mi_close file is closed */
662
free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
663
if (! test_only && ! error)
667
error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
673
if (my_rename(org_name,make_old_name(temp_name,isam_file->filename),
679
error=my_copy(new_name,org_name,MYF(MY_WME));
681
error=my_rename(new_name,org_name,MYF(MY_WME));
684
VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
686
VOID(my_delete(new_name,MYF(MY_WME)));
694
error=my_copy(new_name,org_name,
695
MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
697
VOID(my_delete(new_name,MYF(MY_WME)));
700
error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
703
error=save_state(isam_file,mrg,new_length,glob_crc);
706
error|=mrg_close(mrg);
707
if (join_isam_file >= 0)
708
error|=my_close(join_isam_file,MYF(MY_WME));
711
VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
712
VOID(my_delete(new_name,MYF(MY_WME)));
715
if (write_loop || verbose)
718
VOID(printf("%.4g%% \n",
719
(((int64_t) (old_length - new_length)) * 100.0 /
720
(int64_t) old_length)));
722
puts("Empty file saved in compressed format");
727
free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
729
VOID(my_close(new_file,MYF(0)));
730
if (join_isam_file >= 0)
731
VOID(my_close(join_isam_file,MYF(0)));
733
VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
737
/* Init a huff_count-struct for each field and init it */
739
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
742
register HUFF_COUNTS *count;
743
if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
745
MYF(MY_ZEROFILL | MY_WME))))
747
for (i=0 ; i < info->s->base.fields ; i++)
749
enum en_fieldtype type;
750
count[i].field_length=info->s->rec[i].length;
751
type= count[i].field_type= (enum en_fieldtype) info->s->rec[i].type;
752
if (type == FIELD_INTERVALL ||
753
type == FIELD_CONSTANT ||
756
if (count[i].field_length <= 8 &&
757
(type == FIELD_NORMAL ||
758
type == FIELD_SKIP_ZERO))
759
count[i].max_zero_fill= count[i].field_length;
761
For every column initialize a tree, which is used to detect distinct
762
column values. 'int_tree' works together with 'tree_buff' and
763
'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
764
This is accomplished by '-1' as the element size.
766
init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
768
if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
769
count[i].tree_pos=count[i].tree_buff =
770
my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
778
/* Free memory used by counts and trees */
780
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
781
HUFF_COUNTS *huff_counts,
788
for (i=0 ; i < trees ; i++)
790
if (huff_trees[i].element_buffer)
791
my_free((uchar*) huff_trees[i].element_buffer,MYF(0));
792
if (huff_trees[i].code)
793
my_free((uchar*) huff_trees[i].code,MYF(0));
795
my_free((uchar*) huff_trees,MYF(0));
799
for (i=0 ; i < fields ; i++)
801
if (huff_counts[i].tree_buff)
803
my_free((uchar*) huff_counts[i].tree_buff,MYF(0));
804
delete_tree(&huff_counts[i].int_tree);
807
my_free((uchar*) huff_counts,MYF(0));
809
delete_queue(&queue); /* This is safe to free */
813
/* Read through old file and gather some statistics */
815
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
819
ulong reclength,max_blob_length;
820
uchar *record,*pos,*next_pos,*end_pos,*start_pos;
821
ha_rows record_count;
822
my_bool static_row_size;
823
HUFF_COUNTS *count,*end_count;
824
TREE_ELEMENT *element;
826
reclength=mrg->file[0]->s->base.reclength;
827
record=(uchar*) my_alloca(reclength);
828
end_count=huff_counts+mrg->file[0]->s->base.fields;
829
record_count=0; glob_crc=0;
832
/* Check how to calculate checksum */
834
for (count=huff_counts ; count < end_count ; count++)
836
if (count->field_type == FIELD_BLOB ||
837
count->field_type == FIELD_VARCHAR)
845
while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
847
ulong tot_blob_length=0;
850
/* glob_crc is a checksum over all bytes of all records. */
852
glob_crc+=mi_static_checksum(mrg->file[0],record);
854
glob_crc+=mi_checksum(mrg->file[0],record);
856
/* Count the incidence of values separately for every column. */
857
for (pos=record,count=huff_counts ;
862
next_pos=end_pos=(start_pos=pos)+count->field_length;
865
Put the whole column value in a tree if there is room for it.
866
'int_tree' is used to quickly check for duplicate values.
867
'tree_buff' collects as many distinct column values as
868
possible. If the field length is > 1, it is tree_buff_length,
869
else 2 bytes. Each value is 'field_length' bytes big. If there
870
are more distinct column values than fit into the buffer, we
871
give up with this tree. BLOBs and VARCHARs do not have a
872
tree_buff as it can only be used with fixed length columns.
873
For the special case of field length == 1, we handle only the
874
case that there is only one distinct value in the table(s).
875
Otherwise, we can have a maximum of 256 distinct values. This
876
is then handled by the normal Huffman tree build.
878
Another limit for collecting distinct column values is the
879
number of values itself. Since we would need to build a
880
Huffman tree for the values, we are limited by the 'IS_OFFSET'
881
constant. This constant expresses a bit which is used to
882
determine if a tree element holds a final value or an offset
883
to a child element. Hence, all values and offsets need to be
884
smaller than 'IS_OFFSET'. A tree element is implemented with
885
two integer values, one for the left branch and one for the
886
right branch. For the extreme case that the first element
887
points to the last element, the number of integers in the tree
888
must be less or equal to IS_OFFSET. So the number of elements
889
must be less or equal to IS_OFFSET / 2.
891
WARNING: At first, we insert a pointer into the record buffer
892
as the key for the tree. If we got a new distinct value, which
893
is really inserted into the tree, instead of being counted
894
only, we will copy the column value from the record buffer to
895
'tree_buff' and adjust the key pointer of the tree accordingly.
897
if (count->tree_buff)
900
if (!(element=tree_insert(&count->int_tree,pos, 0,
901
count->int_tree.custom_arg)) ||
902
(element->count == 1 &&
903
(count->tree_buff + tree_buff_length <
904
count->tree_pos + count->field_length)) ||
905
(count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
906
(count->field_length == 1 &&
907
count->int_tree.elements_in_tree > 1))
909
delete_tree(&count->int_tree);
910
my_free(count->tree_buff,MYF(0));
916
If tree_insert() succeeds, it either creates a new element
917
or increments the counter of an existing element.
919
if (element->count == 1)
921
/* Copy the new column value into 'tree_buff'. */
922
memcpy(count->tree_pos,pos,(size_t) count->field_length);
923
/* Adjust the key pointer in the tree. */
924
tree_set_pointer(element,count->tree_pos);
925
/* Point behind the last column value so far. */
926
count->tree_pos+=count->field_length;
931
/* Save character counters and space-counts and zero-field-counts */
932
if (count->field_type == FIELD_NORMAL ||
933
count->field_type == FIELD_SKIP_ENDSPACE)
935
/* Ignore trailing space. */
936
for ( ; end_pos > pos ; end_pos--)
937
if (end_pos[-1] != ' ')
939
/* Empty fields are just counted. Go to the next record. */
942
count->empty_fields++;
943
count->max_zero_fill=0;
947
Count the total of all trailing spaces and the number of
948
short trailing spaces. Remember the longest trailing space.
950
length= (uint) (next_pos-end_pos);
951
count->tot_end_space+=length;
953
count->end_space[length]++;
954
if (count->max_end_space < length)
955
count->max_end_space = length;
958
if (count->field_type == FIELD_NORMAL ||
959
count->field_type == FIELD_SKIP_PRESPACE)
961
/* Ignore leading space. */
962
for (pos=start_pos; pos < end_pos ; pos++)
965
/* Empty fields are just counted. Go to the next record. */
968
count->empty_fields++;
969
count->max_zero_fill=0;
973
Count the total of all leading spaces and the number of
974
short leading spaces. Remember the longest leading space.
976
length= (uint) (pos-start_pos);
977
count->tot_pre_space+=length;
979
count->pre_space[length]++;
980
if (count->max_pre_space < length)
981
count->max_pre_space = length;
984
/* Calculate pos, end_pos, and max_length for variable length fields. */
985
if (count->field_type == FIELD_BLOB)
987
uint field_length=count->field_length -portable_sizeof_char_ptr;
988
ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
989
memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*));
990
end_pos=pos+blob_length;
991
tot_blob_length+=blob_length;
992
set_if_bigger(count->max_length,blob_length);
994
else if (count->field_type == FIELD_VARCHAR)
996
uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
997
length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
998
uint2korr(start_pos));
999
pos= start_pos+pack_length;
1000
end_pos= pos+length;
1001
set_if_bigger(count->max_length,length);
1004
/* Evaluate 'max_zero_fill' for short fields. */
1005
if (count->field_length <= 8 &&
1006
(count->field_type == FIELD_NORMAL ||
1007
count->field_type == FIELD_SKIP_ZERO))
1010
/* Zero fields are just counted. Go to the next record. */
1011
if (!memcmp((uchar*) start_pos,zero_string,count->field_length))
1013
count->zero_fields++;
1017
max_zero_fill starts with field_length. It is decreased every
1018
time a shorter "zero trailer" is found. It is set to zero when
1019
an empty field is found (see above). This suggests that the
1020
variable should be called 'min_zero_fill'.
1022
for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1024
if (i < count->max_zero_fill)
1025
count->max_zero_fill=i;
1028
/* Ignore zero fields and check fields. */
1029
if (count->field_type == FIELD_ZERO ||
1030
count->field_type == FIELD_CHECK)
1034
Count the incidence of every byte value in the
1035
significant field value.
1037
for ( ; pos < end_pos ; pos++)
1038
count->counts[(uchar) *pos]++;
1040
/* Step to next field. */
1043
if (tot_blob_length > max_blob_length)
1044
max_blob_length=tot_blob_length;
1046
if (write_loop && record_count % WRITE_COUNT == 0)
1048
VOID(printf("%lu\r", (ulong) record_count));
1049
VOID(fflush(stdout));
1052
else if (error != HA_ERR_RECORD_DELETED)
1054
VOID(fprintf(stderr, "Got error %d while reading rows", error));
1058
/* Step to next record. */
1062
VOID(printf(" \r"));
1063
VOID(fflush(stdout));
1067
VOID(printf("Found the following number of incidents "
1068
"of the byte codes:\n"));
1069
for (count= huff_counts ; count < end_count; count++)
1072
my_off_t total_count;
1076
VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
1077
if (count->tree_buff)
1080
VOID(printf("number of distinct values: %u\n",
1081
(uint) ((count->tree_pos - count->tree_buff) /
1082
count->field_length)));
1085
for (idx= 0; idx < 256; idx++)
1087
if (count->counts[idx])
1089
total_count+= count->counts[idx];
1091
VOID(printf("counts[0x%02x]: %12s\n", idx,
1092
llstr((int64_t) count->counts[idx], llbuf)));
1095
if ((verbose >= 2) && total_count)
1097
VOID(printf("total: %12s\n",
1098
llstr((int64_t) total_count, llbuf)));
1102
mrg->records=record_count;
1103
mrg->max_blob_length=max_blob_length;
1104
my_afree((uchar*) record);
1105
return(error != HA_ERR_END_OF_FILE);
1108
static int compare_huff_elements(void *not_used __attribute__((unused)),
1111
return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1112
(*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1);
1115
/* Check each tree if we should use pre-space-compress, end-space-
1116
compress, empty-field-compress or zero-field-compress */
1118
static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1121
uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1122
my_off_t old_length,new_length,length;
1124
bzero((uchar*) field_count,sizeof(field_count));
1125
space_fields=fill_zero_fields=0;
1127
for (; trees-- ; huff_counts++)
1129
if (huff_counts->field_type == FIELD_BLOB)
1131
huff_counts->length_bits=max_bit(huff_counts->max_length);
1134
else if (huff_counts->field_type == FIELD_VARCHAR)
1136
huff_counts->length_bits=max_bit(huff_counts->max_length);
1139
else if (huff_counts->field_type == FIELD_CHECK)
1141
huff_counts->bytes_packed=0;
1142
huff_counts->counts[0]=0;
1146
huff_counts->field_type=FIELD_NORMAL;
1147
huff_counts->pack_type=0;
1149
/* Check for zero-filled records (in this column), or zero records. */
1150
if (huff_counts->zero_fields || ! records)
1152
my_off_t old_space_count;
1154
If there are only zero filled records (in this column),
1155
or no records at all, we are done.
1157
if (huff_counts->zero_fields == records)
1159
huff_counts->field_type= FIELD_ZERO;
1160
huff_counts->bytes_packed=0;
1161
huff_counts->counts[0]=0;
1164
/* Remeber the number of significant spaces. */
1165
old_space_count=huff_counts->counts[' '];
1166
/* Add all leading and trailing spaces. */
1167
huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1168
huff_counts->tot_pre_space +
1169
huff_counts->empty_fields *
1170
huff_counts->field_length);
1171
/* Check, what the compressed length of this would be. */
1172
old_length=calc_packed_length(huff_counts,0)+records/8;
1173
/* Get the number of zero bytes. */
1174
length=huff_counts->zero_fields*huff_counts->field_length;
1175
/* Add it to the counts. */
1176
huff_counts->counts[0]+=length;
1177
/* Check, what the compressed length of this would be. */
1178
new_length=calc_packed_length(huff_counts,0);
1179
/* If the compression without the zeroes would be shorter, we are done. */
1180
if (old_length < new_length && huff_counts->field_length > 1)
1182
huff_counts->field_type=FIELD_SKIP_ZERO;
1183
huff_counts->counts[0]-=length;
1184
huff_counts->bytes_packed=old_length- records/8;
1187
/* Remove the insignificant spaces, but keep the zeroes. */
1188
huff_counts->counts[' ']=old_space_count;
1190
/* Check, what the compressed length of this column would be. */
1191
huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1194
If there are enough empty records (in this column),
1195
treating them specially may pay off.
1197
if (huff_counts->empty_fields)
1199
if (huff_counts->field_length > 2 &&
1200
huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1201
(1+max_bit(max(huff_counts->max_pre_space,
1202
huff_counts->max_end_space))) <
1203
records * max_bit(huff_counts->field_length))
1205
huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1209
length=huff_counts->empty_fields*huff_counts->field_length;
1210
if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1212
huff_counts->tot_end_space+=length;
1213
huff_counts->max_end_space=huff_counts->field_length;
1214
if (huff_counts->field_length < 8)
1215
huff_counts->end_space[huff_counts->field_length]+=
1216
huff_counts->empty_fields;
1218
if (huff_counts->tot_pre_space)
1220
huff_counts->tot_pre_space+=length;
1221
huff_counts->max_pre_space=huff_counts->field_length;
1222
if (huff_counts->field_length < 8)
1223
huff_counts->pre_space[huff_counts->field_length]+=
1224
huff_counts->empty_fields;
1230
If there are enough trailing spaces (in this column),
1231
treating them specially may pay off.
1233
if (huff_counts->tot_end_space)
1235
huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1236
if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1237
huff_counts->end_space,
1238
huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1240
huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1244
If there are enough leading spaces (in this column),
1245
treating them specially may pay off.
1247
if (huff_counts->tot_pre_space)
1249
if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1250
huff_counts->pre_space,
1251
huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1255
found_pack: /* Found field-packing */
1257
/* Test if we can use zero-fill */
1259
if (huff_counts->max_zero_fill &&
1260
(huff_counts->field_type == FIELD_NORMAL ||
1261
huff_counts->field_type == FIELD_SKIP_ZERO))
1263
huff_counts->counts[0]-=huff_counts->max_zero_fill*
1264
(huff_counts->field_type == FIELD_SKIP_ZERO ?
1265
records - huff_counts->zero_fields : records);
1266
huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1267
huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1270
/* Test if intervall-field is better */
1272
if (huff_counts->tree_buff)
1276
tree.element_buffer=0;
1277
if (!make_huff_tree(&tree,huff_counts) &&
1278
tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1280
if (tree.elements == 1)
1281
huff_counts->field_type=FIELD_CONSTANT;
1283
huff_counts->field_type=FIELD_INTERVALL;
1284
huff_counts->pack_type=0;
1288
my_free((uchar*) huff_counts->tree_buff,MYF(0));
1289
delete_tree(&huff_counts->int_tree);
1290
huff_counts->tree_buff=0;
1292
if (tree.element_buffer)
1293
my_free((uchar*) tree.element_buffer,MYF(0));
1295
if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1297
if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1299
field_count[huff_counts->field_type]++;
1302
VOID(printf("\nnormal: %3d empty-space: %3d "
1303
"empty-zero: %3d empty-fill: %3d\n"
1304
"pre-space: %3d end-space: %3d "
1305
"intervall-fields: %3d zero: %3d\n",
1306
field_count[FIELD_NORMAL],space_fields,
1307
field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1308
field_count[FIELD_SKIP_PRESPACE],
1309
field_count[FIELD_SKIP_ENDSPACE],
1310
field_count[FIELD_INTERVALL],
1311
field_count[FIELD_ZERO]));
1315
/* Test if we can use space-compression and empty-field-compression */
1318
test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1319
uint max_space_length, my_off_t *space_counts,
1320
my_off_t tot_space_count, enum en_fieldtype field_type)
1324
my_off_t space_count,min_space_count,min_pack,new_length,skip;
1326
length_bits=max_bit(max_space_length);
1328
/* Default no end_space-packing */
1329
space_count=huff_counts->counts[(uint) ' '];
1330
min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1331
min_pack=calc_packed_length(huff_counts,0);
1333
huff_counts->counts[(uint) ' ']=space_count;
1335
/* Test with allways space-count */
1336
new_length=huff_counts->bytes_packed+length_bits*records/8;
1337
if (new_length+1 < min_pack)
1340
min_pack=new_length;
1341
min_space_count=space_count;
1343
/* Test with length-flag */
1344
for (skip=0L, i=0 ; i < 8 ; i++)
1346
if (space_counts[i])
1349
huff_counts->counts[(uint) ' ']+=space_counts[i];
1350
skip+=huff_counts->pre_space[i];
1351
new_length=calc_packed_length(huff_counts,0)+
1352
(records+(records-skip)*(1+length_bits))/8;
1353
if (new_length < min_pack)
1356
min_pack=new_length;
1357
min_space_count=huff_counts->counts[(uint) ' '];
1362
huff_counts->counts[(uint) ' ']=min_space_count;
1363
huff_counts->bytes_packed=min_pack;
1366
return(0); /* No space-compress */
1367
case -1: /* Always space-count */
1368
huff_counts->field_type=field_type;
1369
huff_counts->min_space=0;
1370
huff_counts->length_bits=max_bit(max_space_length);
1373
huff_counts->field_type=field_type;
1374
huff_counts->min_space=(uint) min_pos;
1375
huff_counts->pack_type|=PACK_TYPE_SELECTED;
1376
huff_counts->length_bits=max_bit(max_space_length);
1379
return(1); /* Using space-compress */
1383
/* Make a huff_tree of each huff_count */
1385
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1388
HUFF_TREE *huff_tree;
1390
if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1391
MYF(MY_WME | MY_ZEROFILL))))
1394
for (tree=0 ; tree < trees ; tree++)
1396
if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1399
my_free((uchar*) huff_tree[tree].element_buffer,MYF(0));
1400
my_free((uchar*) huff_tree,MYF(0));
1408
Build a Huffman tree.
1412
huff_tree The Huffman tree.
1413
huff_counts The counts.
1416
Build a Huffman tree according to huff_counts->counts or
1417
huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1418
tree_buff_length of distinct column values. In that case, whole
1419
values can be Huffman encoded instead of single bytes.
1426
static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1428
uint i,found,bits_packed,first,last;
1429
my_off_t bytes_packed;
1430
HUFF_ELEMENT *a,*b,*new_huff_el;
1433
if (huff_counts->tree_buff)
1435
/* Calculate the number of distinct values in tree_buff. */
1436
found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1437
huff_counts->field_length;
1438
first=0; last=found-1;
1442
/* Count the number of byte codes found in the column. */
1443
for (i=found=0 ; i < 256 ; i++)
1445
if (huff_counts->counts[i])
1456
/* When using 'tree_buff' we can have more that 256 values. */
1457
if (queue.max_elements < found)
1459
delete_queue(&queue);
1460
if (init_queue(&queue,found,0,0,compare_huff_elements,0))
1464
/* Allocate or reallocate an element buffer for the Huffman tree. */
1465
if (!huff_tree->element_buffer)
1467
if (!(huff_tree->element_buffer=
1468
(HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1475
(HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1476
found*2*sizeof(HUFF_ELEMENT),
1479
huff_tree->element_buffer=temp;
1482
huff_counts->tree=huff_tree;
1483
huff_tree->counts=huff_counts;
1484
huff_tree->min_chr=first;
1485
huff_tree->max_chr=last;
1486
huff_tree->char_bits=max_bit(last-first);
1487
huff_tree->offset_bits=max_bit(found-1)+1;
1489
if (huff_counts->tree_buff)
1491
huff_tree->elements=0;
1492
huff_tree->tree_pack_length=(1+15+16+5+5+
1493
(huff_tree->char_bits+1)*found+
1494
(huff_tree->offset_bits+1)*
1496
(uint) (huff_tree->counts->tree_pos-
1497
huff_tree->counts->tree_buff);
1499
Put a HUFF_ELEMENT into the queue for every distinct column value.
1501
tree_walk() calls save_counts_in_queue() for every element in
1502
'int_tree'. This takes elements from the target trees element
1503
buffer and places references to them into the buffer of the
1504
priority queue. We insert in column value order, but the order is
1505
in fact irrelevant here. We will establish the correct order
1508
tree_walk(&huff_counts->int_tree,
1509
(int (*)(void*, element_count,void*)) save_counts_in_queue,
1510
(uchar*) huff_tree, left_root_right);
1514
huff_tree->elements=found;
1515
huff_tree->tree_pack_length=(9+9+5+5+
1516
(huff_tree->char_bits+1)*found+
1517
(huff_tree->offset_bits+1)*
1520
Put a HUFF_ELEMENT into the queue for every byte code found in the column.
1522
The elements are taken from the target trees element buffer.
1523
Instead of using queue_insert(), we just place references to the
1524
elements into the buffer of the priority queue. We insert in byte
1525
value order, but the order is in fact irrelevant here. We will
1526
establish the correct order later.
1528
for (i=first, found=0 ; i <= last ; i++)
1530
if (huff_counts->counts[i])
1532
new_huff_el=huff_tree->element_buffer+(found++);
1533
new_huff_el->count=huff_counts->counts[i];
1534
new_huff_el->a.leaf.null=0;
1535
new_huff_el->a.leaf.element_nr=i;
1536
queue.root[found]=(uchar*) new_huff_el;
1540
If there is only a single byte value in this field in all records,
1541
add a second element with zero incidence. This is required to enter
1542
the loop, which builds the Huffman tree.
1546
new_huff_el=huff_tree->element_buffer+(found++);
1547
new_huff_el->count=0;
1548
new_huff_el->a.leaf.null=0;
1550
new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1552
new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1553
queue.root[found]=(uchar*) new_huff_el;
1557
/* Make a queue from the queue buffer. */
1558
queue.elements=found;
1561
Make a priority queue from the queue. Construct its index so that we
1562
have a partially ordered tree.
1564
for (i=found/2 ; i > 0 ; i--)
1565
_downheap(&queue,i);
1567
/* The Huffman algorithm. */
1568
bytes_packed=0; bits_packed=0;
1569
for (i=1 ; i < found ; i++)
1572
Pop the top element from the queue (the one with the least incidence).
1573
Popping from a priority queue includes a re-ordering of the queue,
1574
to get the next least incidence element to the top.
1576
a=(HUFF_ELEMENT*) queue_remove(&queue,0);
1578
Copy the next least incidence element. The queue implementation
1579
reserves root[0] for temporary purposes. root[1] is the top.
1581
b=(HUFF_ELEMENT*) queue.root[1];
1582
/* Get a new element from the element buffer. */
1583
new_huff_el=huff_tree->element_buffer+found+i;
1584
/* The new element gets the sum of the two least incidence elements. */
1585
new_huff_el->count=a->count+b->count;
1587
The Huffman algorithm assigns another bit to the code for a byte
1588
every time that bytes incidence is combined (directly or indirectly)
1589
to a new element as one of the two least incidence elements.
1590
This means that one more bit per incidence of that byte is required
1591
in the resulting file. So we add the new combined incidence as the
1592
number of bits by which the result grows.
1594
bits_packed+=(uint) (new_huff_el->count & 7);
1595
bytes_packed+=new_huff_el->count/8;
1596
/* The new element points to its children, lesser in left. */
1597
new_huff_el->a.nod.left=a;
1598
new_huff_el->a.nod.right=b;
1600
Replace the copied top element by the new element and re-order the
1603
queue.root[1]=(uchar*) new_huff_el;
1604
queue_replaced(&queue);
1606
huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1607
huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1611
static int compare_tree(void* cmp_arg __attribute__((unused)),
1612
register const uchar *s, register const uchar *t)
1615
for (length=global_count->field_length; length-- ;)
1617
return (int) s[-1] - (int) t[-1];
1622
Organize distinct column values and their incidences into a priority queue.
1625
save_counts_in_queue()
1626
key The column value.
1627
count The incidence of this value.
1628
tree The Huffman tree to be built later.
1631
We use the element buffer of the targeted tree. The distinct column
1632
values are organized in a priority queue first. The Huffman
1633
algorithm will later organize the elements into a Huffman tree. For
1634
the time being, we just place references to the elements into the
1635
queue buffer. The buffer will later be organized into a priority
1642
static int save_counts_in_queue(uchar *key, element_count count,
1645
HUFF_ELEMENT *new_huff_el;
1647
new_huff_el=tree->element_buffer+(tree->elements++);
1648
new_huff_el->count=count;
1649
new_huff_el->a.leaf.null=0;
1650
new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1651
tree->counts->field_length;
1652
queue.root[tree->elements]=(uchar*) new_huff_el;
1658
Calculate length of file if given counts should be used.
1661
calc_packed_length()
1662
huff_counts The counts for a column of the table(s).
1663
add_tree_lenght If the decode tree length should be added.
1666
We need to follow the Huffman algorithm until we know, how many bits
1667
are required for each byte code. But we do not need the resulting
1668
Huffman tree. Hence, we can leave out some steps which are essential
1669
in make_huff_tree().
1672
Number of bytes required to compress this table column.
1675
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1676
uint add_tree_lenght)
1678
uint i,found,bits_packed,first,last;
1679
my_off_t bytes_packed;
1680
HUFF_ELEMENT element_buffer[256];
1683
WARNING: We use a small hack for efficiency: Instead of placing
1684
references to HUFF_ELEMENTs into the queue, we just insert
1685
references to the counts of the byte codes which appeared in this
1686
table column. During the Huffman algorithm they are successively
1687
replaced by references to HUFF_ELEMENTs. This works, because
1688
HUFF_ELEMENTs have the incidence count at their beginning.
1689
Regardless, wether the queue array contains references to counts of
1690
type my_off_t or references to HUFF_ELEMENTs which have the count of
1691
type my_off_t at their beginning, it always points to a count of the
1694
Instead of using queue_insert(), we just copy the references into
1695
the buffer of the priority queue. We insert in byte value order, but
1696
the order is in fact irrelevant here. We will establish the correct
1700
for (i=found=0 ; i < 256 ; i++)
1702
if (huff_counts->counts[i])
1707
/* We start with root[1], which is the queues top element. */
1708
queue.root[found]=(uchar*) &huff_counts->counts[i];
1712
return(0); /* Empty tree */
1714
If there is only a single byte value in this field in all records,
1715
add a second element with zero incidence. This is required to enter
1716
the loop, which follows the Huffman algorithm.
1719
queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1721
/* Make a queue from the queue buffer. */
1722
queue.elements=found;
1724
bytes_packed=0; bits_packed=0;
1725
/* Add the length of the coding table, which would become part of the file. */
1726
if (add_tree_lenght)
1727
bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1728
(max_bit(found-1)+1+1)*(found-2) +7)/8;
1731
Make a priority queue from the queue. Construct its index so that we
1732
have a partially ordered tree.
1734
for (i=(found+1)/2 ; i > 0 ; i--)
1735
_downheap(&queue,i);
1737
/* The Huffman algorithm. */
1738
for (i=0 ; i < found-1 ; i++)
1742
HUFF_ELEMENT *new_huff_el;
1745
Pop the top element from the queue (the one with the least
1746
incidence). Popping from a priority queue includes a re-ordering
1747
of the queue, to get the next least incidence element to the top.
1749
a= (my_off_t*) queue_remove(&queue, 0);
1751
Copy the next least incidence element. The queue implementation
1752
reserves root[0] for temporary purposes. root[1] is the top.
1754
b= (my_off_t*) queue.root[1];
1755
/* Create a new element in a local (automatic) buffer. */
1756
new_huff_el= element_buffer + i;
1757
/* The new element gets the sum of the two least incidence elements. */
1758
new_huff_el->count= *a + *b;
1760
The Huffman algorithm assigns another bit to the code for a byte
1761
every time that bytes incidence is combined (directly or indirectly)
1762
to a new element as one of the two least incidence elements.
1763
This means that one more bit per incidence of that byte is required
1764
in the resulting file. So we add the new combined incidence as the
1765
number of bits by which the result grows.
1767
bits_packed+=(uint) (new_huff_el->count & 7);
1768
bytes_packed+=new_huff_el->count/8;
1770
Replace the copied top element by the new element and re-order the
1771
queue. This successively replaces the references to counts by
1772
references to HUFF_ELEMENTs.
1774
queue.root[1]=(uchar*) new_huff_el;
1775
queue_replaced(&queue);
1777
return(bytes_packed+(bits_packed+7)/8);
1781
/* Remove trees that don't give any compression */
1783
static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1786
HUFF_COUNTS count,*i,*j,*last_count;
1788
last_count=huff_counts+trees;
1789
for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1791
if (!i->tree->tree_number)
1793
i->tree->tree_number= ++tree_number;
1795
continue; /* Don't join intervall */
1796
for (j=i+1 ; j < last_count ; j++)
1798
if (! j->tree->tree_number && ! j->tree_buff)
1800
for (k=0 ; k < 256 ; k++)
1801
count.counts[k]=i->counts[k]+j->counts[k];
1802
if (calc_packed_length(&count,1) <=
1803
i->tree->bytes_packed + j->tree->bytes_packed+
1804
i->tree->tree_pack_length+j->tree->tree_pack_length+
1807
memcpy_fixed((uchar*) i->counts,(uchar*) count.counts,
1808
sizeof(count.counts[0])*256);
1809
my_free((uchar*) j->tree->element_buffer,MYF(0));
1810
j->tree->element_buffer=0;
1812
bmove((uchar*) i->counts,(uchar*) count.counts,
1813
sizeof(count.counts[0])*256);
1814
if (make_huff_tree(i->tree,i))
1822
VOID(printf("Original trees: %d After join: %d\n", trees, tree_number));
1823
return tree_number; /* Return trees left */
1828
Fill in huff_tree encode tables.
1831
make_huff_decode_table()
1832
huff_tree An array of HUFF_TREE which are to be encoded.
1833
trees The number of HUFF_TREE in the array.
1840
static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1843
for ( ; trees-- ; huff_tree++)
1845
if (huff_tree->tree_number > 0)
1847
elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1848
if (!(huff_tree->code =
1849
(uint64_t*) my_malloc(elements*
1850
(sizeof(uint64_t) + sizeof(uchar)),
1851
MYF(MY_WME | MY_ZEROFILL))))
1853
huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1854
make_traverse_code_tree(huff_tree, huff_tree->root,
1855
8 * sizeof(uint64_t), 0LL);
1862
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1863
HUFF_ELEMENT *element,
1864
uint size, uint64_t code)
1867
if (!element->a.leaf.null)
1869
chr=element->a.leaf.element_nr;
1870
huff_tree->code_len[chr]= (uchar) (8 * sizeof(uint64_t) - size);
1871
huff_tree->code[chr]= (code >> size);
1872
if (huff_tree->height < 8 * sizeof(uint64_t) - size)
1873
huff_tree->height= 8 * sizeof(uint64_t) - size;
1878
make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1879
make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1880
code + (((uint64_t) 1) << size));
1887
Convert a value into binary digits.
1892
length The number of low order bits to convert.
1895
The result string is in static storage. It is reused on every call.
1896
So you cannot use it twice in one expression.
1899
A pointer to a static NUL-terminated string.
1902
static char *bindigits(uint64_t value, uint bits)
1904
static char digits[72];
1908
assert(idx < sizeof(digits));
1910
*(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
1917
Convert a value into hexadecimal digits.
1924
The result string is in static storage. It is reused on every call.
1925
So you cannot use it twice in one expression.
1928
A pointer to a static NUL-terminated string.
1931
static char *hexdigits(uint64_t value)
1933
static char digits[20];
1935
uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
1937
assert(idx < sizeof(digits));
1940
if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
1941
*(ptr - 1)+= 'a' - '9' - 1;
1948
/* Write header to new packed data file */
1950
static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
1951
my_off_t tot_elements,my_off_t filelength)
1953
uchar *buff= (uchar*) file_buffer.pos;
1955
bzero(buff,HEAD_LENGTH);
1956
memcpy_fixed(buff,myisam_pack_file_magic,4);
1957
int4store(buff+4,head_length);
1958
int4store(buff+8, mrg->min_pack_length);
1959
int4store(buff+12,mrg->max_pack_length);
1960
int4store(buff+16,tot_elements);
1961
int4store(buff+20,intervall_length);
1962
int2store(buff+24,trees);
1963
buff[26]=(char) mrg->ref_length;
1964
/* Save record pointer length */
1965
buff[27]= (uchar) mi_get_pointer_length((uint64_t) filelength,2);
1968
VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
1969
return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
1970
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
1973
/* Write fieldinfo to new packed file */
1975
static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
1978
uint huff_tree_bits;
1979
huff_tree_bits=max_bit(trees ? trees-1 : 0);
1984
VOID(printf("column types:\n"));
1985
VOID(printf("FIELD_NORMAL 0\n"));
1986
VOID(printf("FIELD_SKIP_ENDSPACE 1\n"));
1987
VOID(printf("FIELD_SKIP_PRESPACE 2\n"));
1988
VOID(printf("FIELD_SKIP_ZERO 3\n"));
1989
VOID(printf("FIELD_BLOB 4\n"));
1990
VOID(printf("FIELD_CONSTANT 5\n"));
1991
VOID(printf("FIELD_INTERVALL 6\n"));
1992
VOID(printf("FIELD_ZERO 7\n"));
1993
VOID(printf("FIELD_VARCHAR 8\n"));
1994
VOID(printf("FIELD_CHECK 9\n"));
1996
VOID(printf("pack type as a set of flags:\n"));
1997
VOID(printf("PACK_TYPE_SELECTED 1\n"));
1998
VOID(printf("PACK_TYPE_SPACE_FIELDS 2\n"));
1999
VOID(printf("PACK_TYPE_ZERO_FILL 4\n"));
2002
for (i=0 ; i++ < fields ; counts++)
2004
write_bits((uint64_t) (int) counts->field_type, 5);
2005
write_bits(counts->pack_type,6);
2006
if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2007
write_bits(counts->max_zero_fill,5);
2009
write_bits(counts->length_bits,5);
2010
write_bits((uint64_t) counts->tree->tree_number - 1, huff_tree_bits);
2012
VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
2013
"tree: %2u length: %4u\n", i , counts->field_type,
2014
counts->pack_type, counts->max_zero_fill, counts->length_bits,
2015
counts->tree->tree_number, counts->field_length));
2021
/* Write all huff_trees to new datafile. Return tot count of
2022
elements in all trees
2023
Returns 0 on error */
2025
static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2031
uint *packed_tree,*offset,length;
2034
/* Find the highest number of elements in the trees. */
2035
for (i=length=0 ; i < trees ; i++)
2036
if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2037
length=huff_tree[i].elements;
2039
Allocate a buffer for packing a decode tree. Two numbers per element
2040
(left child and right child).
2042
if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2044
my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2052
for (elements=0; trees-- ; huff_tree++)
2054
/* Skip columns that have been joined with other columns. */
2055
if (huff_tree->tree_number == 0)
2056
continue; /* Deleted tree */
2060
/* Count the total number of elements (byte codes or column values). */
2061
elements+=huff_tree->elements;
2062
huff_tree->max_offset=2;
2063
/* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2064
if (huff_tree->elements <= 1)
2067
offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2069
/* This should be the same as 'length' above. */
2070
huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2073
Since we check this during collecting the distinct column values,
2074
this should never happen.
2076
if (huff_tree->max_offset >= IS_OFFSET)
2077
{ /* This should be impossible */
2078
VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2079
huff_tree->max_offset));
2080
my_afree((uchar*) packed_tree);
2084
if (!huff_tree->counts->tree_buff)
2086
/* We do a byte compression on this column. Mark with bit 0. */
2088
write_bits(huff_tree->min_chr,8);
2089
write_bits(huff_tree->elements,9);
2090
write_bits(huff_tree->char_bits,5);
2091
write_bits(huff_tree->offset_bits,5);
2096
int_length=(uint) (huff_tree->counts->tree_pos -
2097
huff_tree->counts->tree_buff);
2098
/* We have distinct column values for this column. Mark with bit 1. */
2100
write_bits(huff_tree->elements,15);
2101
write_bits(int_length,16);
2102
write_bits(huff_tree->char_bits,5);
2103
write_bits(huff_tree->offset_bits,5);
2104
intervall_length+=int_length;
2107
VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
2108
"%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
2109
huff_tree->char_bits, huff_tree->offset_bits,
2110
huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2111
huff_tree->counts->tree_buff ? int_length :
2112
huff_tree->min_chr, huff_tree->height));
2114
/* Check that the code tree length matches the element count. */
2115
length=(uint) (offset-packed_tree);
2116
if (length != huff_tree->elements*2-2)
2118
VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2119
length, huff_tree->elements * 2 - 2));
2124
for (i=0 ; i < length ; i++)
2126
if (packed_tree[i] & IS_OFFSET)
2127
write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2128
huff_tree->offset_bits+1);
2130
write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2132
VOID(printf("tree[0x%04x]: %s0x%04x\n",
2133
i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2134
(packed_tree[i] & IS_OFFSET) ?
2135
packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2140
Display coding tables and check their correctness.
2142
codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2143
for (i= 0; i < codes; i++)
2150
if (! (len= huff_tree->code_len[i]))
2153
VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
2154
hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2155
bindigits(huff_tree->code[i], huff_tree->code_len[i])));
2157
/* Check that the encode table decodes correctly. */
2165
VOID(fflush(stdout));
2166
VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2167
hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
2172
code|= (huff_tree->code[i] >> (--len)) & 1;
2174
if (bits > 8 * sizeof(code))
2176
VOID(fflush(stdout));
2177
VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2178
bits, (uint) (8 * sizeof(code))));
2182
idx+= (uint) code & 1;
2185
VOID(fflush(stdout));
2186
VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2191
if (packed_tree[idx] & IS_OFFSET)
2192
idx+= packed_tree[idx] & ~IS_OFFSET;
2194
break; /* Hit a leaf. This contains the result value. */
2199
if (packed_tree[idx] != i)
2201
VOID(fflush(stdout));
2202
VOID(fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n",
2203
packed_tree[idx], i));
2207
} /*end for (codes)*/
2211
/* Write column values in case of distinct column value compression. */
2212
if (huff_tree->counts->tree_buff)
2214
for (i=0 ; i < int_length ; i++)
2216
write_bits((uint64_t) (uchar) huff_tree->counts->tree_buff[i], 8);
2218
VOID(printf("column_values[0x%04x]: 0x%02x\n",
2219
i, (uchar) huff_tree->counts->tree_buff[i]));
2226
my_afree((uchar*) packed_tree);
2229
VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
2236
static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2241
prev_offset= offset;
2243
'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2244
then there is no left child and, hence no right child either. This
2245
is a property of a binary tree. An element is either a node with two
2246
childs, or a leaf without childs.
2248
The current element is always a node with two childs. Go left first.
2250
if (!element->a.nod.left->a.leaf.null)
2252
/* Store the byte code or the index of the column value. */
2253
prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2259
Recursively traverse the tree to the left. Mark it as an offset to
2260
another tree node (in contrast to a byte code or column value index).
2262
prev_offset[0]= IS_OFFSET+2;
2263
offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2266
/* Now, check the right child. */
2267
if (!element->a.nod.right->a.leaf.null)
2269
/* Store the byte code or the index of the column value. */
2270
prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2276
Recursively traverse the tree to the right. Mark it as an offset to
2277
another tree node (in contrast to a byte code or column value index).
2279
uint temp=(uint) (offset-prev_offset-1);
2280
prev_offset[1]= IS_OFFSET+ temp;
2281
if (huff_tree->max_offset < temp)
2282
huff_tree->max_offset = temp;
2283
return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2287
/* Get number of bits neaded to represent value */
2289
static uint max_bit(register uint value)
2291
register uint power=1;
2299
static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2302
uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length,
2303
intervall,field_length,max_pack_length,pack_blob_length;
2304
my_off_t record_count;
2306
ulong length,pack_length;
2307
uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2308
HUFF_COUNTS *count,*end_count;
2310
MI_INFO *isam_file=mrg->file[0];
2311
uint pack_version= (uint) isam_file->s->pack.version;
2313
/* Allocate a buffer for the records (excluding blobs). */
2314
if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2317
end_count=huff_counts+isam_file->s->base.fields;
2318
min_record_length= (uint) ~0;
2319
max_record_length=0;
2322
Calculate the maximum number of bits required to pack the records.
2323
Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2324
The tree height determines the maximum number of bits per value.
2325
Some fields skip leading or trailing spaces or zeroes. The skipped
2326
number of bytes is encoded by 'length_bits' bits.
2327
Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2328
and varchar get a leading 0 bit.
2330
for (i=max_calc_length=0 ; i < isam_file->s->base.fields ; i++)
2332
if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2333
huff_counts[i].max_zero_fill=0;
2334
if (huff_counts[i].field_type == FIELD_CONSTANT ||
2335
huff_counts[i].field_type == FIELD_ZERO ||
2336
huff_counts[i].field_type == FIELD_CHECK)
2338
if (huff_counts[i].field_type == FIELD_INTERVALL)
2339
max_calc_length+=huff_counts[i].tree->height;
2340
else if (huff_counts[i].field_type == FIELD_BLOB ||
2341
huff_counts[i].field_type == FIELD_VARCHAR)
2342
max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2345
(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2346
huff_counts[i].tree->height+huff_counts[i].length_bits;
2348
max_calc_length= (max_calc_length + 7) / 8;
2349
pack_ref_length= calc_pack_length(pack_version, max_calc_length);
2351
/* 'max_blob_length' is the max length of all blobs of a record. */
2352
pack_blob_length= isam_file->s->base.blobs ?
2353
calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2354
max_pack_length=pack_ref_length+pack_blob_length;
2357
while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2359
ulong tot_blob_length=0;
2362
if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
2364
record_pos= (uchar*) file_buffer.pos;
2365
file_buffer.pos+=max_pack_length;
2366
for (start_pos=record, count= huff_counts; count < end_count ; count++)
2368
end_pos=start_pos+(field_length=count->field_length);
2371
/* Check if the column contains spaces only. */
2372
if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2374
for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2383
end_pos-=count->max_zero_fill;
2384
field_length-=count->max_zero_fill;
2386
switch (count->field_type) {
2387
case FIELD_SKIP_ZERO:
2388
if (!memcmp((uchar*) start_pos,zero_string,field_length))
2397
for ( ; start_pos < end_pos ; start_pos++)
2399
write_bits(tree->code[(uchar) *start_pos],
2400
(uint) tree->code_len[(uchar) *start_pos]);
2403
case FIELD_SKIP_ENDSPACE:
2404
for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2405
length= (ulong) (end_pos - pos);
2406
if (count->pack_type & PACK_TYPE_SELECTED)
2408
if (length > count->min_space)
2411
write_bits(length,count->length_bits);
2421
write_bits(length,count->length_bits);
2423
/* Encode all significant bytes. */
2424
for ( ; start_pos < pos ; start_pos++)
2426
write_bits(tree->code[(uchar) *start_pos],
2427
(uint) tree->code_len[(uchar) *start_pos]);
2431
case FIELD_SKIP_PRESPACE:
2432
for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2433
length= (ulong) (pos - start_pos);
2434
if (count->pack_type & PACK_TYPE_SELECTED)
2436
if (length > count->min_space)
2439
write_bits(length,count->length_bits);
2449
write_bits(length,count->length_bits);
2451
/* Encode all significant bytes. */
2452
for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2454
write_bits(tree->code[(uchar) *start_pos],
2455
(uint) tree->code_len[(uchar) *start_pos]);
2458
case FIELD_CONSTANT:
2463
case FIELD_INTERVALL:
2465
pos=(uchar*) tree_search(&count->int_tree, start_pos,
2466
count->int_tree.custom_arg);
2467
intervall=(uint) (pos - count->tree_buff)/field_length;
2468
write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2473
ulong blob_length=_mi_calc_blob_length(field_length-
2474
portable_sizeof_char_ptr,
2476
/* Empty blobs are encoded with a single 1 bit. */
2483
uchar *blob,*blob_end;
2485
/* Write the blob length. */
2486
write_bits(blob_length,count->length_bits);
2487
memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
2489
blob_end=blob+blob_length;
2490
/* Encode the blob bytes. */
2491
for ( ; blob < blob_end ; blob++)
2493
write_bits(tree->code[(uchar) *blob],
2494
(uint) tree->code_len[(uchar) *blob]);
2496
tot_blob_length+=blob_length;
2503
uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2504
ulong col_length= (var_pack_length == 1 ?
2505
(uint) *(uchar*) start_pos :
2506
uint2korr(start_pos));
2507
/* Empty varchar are encoded with a single 1 bit. */
2510
write_bits(1,1); /* Empty varchar */
2514
uchar *end= start_pos + var_pack_length + col_length;
2516
/* Write the varchar length. */
2517
write_bits(col_length,count->length_bits);
2518
/* Encode the varchar bytes. */
2519
for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2521
write_bits(tree->code[(uchar) *start_pos],
2522
(uint) tree->code_len[(uchar) *start_pos]);
2529
case FIELD_enum_val_count:
2530
abort(); /* Impossible */
2532
start_pos+=count->max_zero_fill;
2535
length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
2536
pack_length= save_pack_length(pack_version, record_pos, length);
2537
if (pack_blob_length)
2538
pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2540
/* Correct file buffer if the header was smaller */
2541
if (pack_length != max_pack_length)
2543
bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2544
file_buffer.pos-= (max_pack_length-pack_length);
2546
if (length < (ulong) min_record_length)
2547
min_record_length=(uint) length;
2548
if (length > (ulong) max_record_length)
2549
max_record_length=(uint) length;
2551
if (write_loop && record_count % WRITE_COUNT == 0)
2553
VOID(printf("%lu\r", (ulong) record_count));
2554
VOID(fflush(stdout));
2557
else if (error != HA_ERR_RECORD_DELETED)
2560
if (error == HA_ERR_END_OF_FILE)
2564
VOID(fprintf(stderr, "%s: Got error %d reading records\n",
2565
my_progname, error));
2568
VOID(printf("wrote %s records.\n", llstr((int64_t) record_count, llbuf)));
2570
my_afree((uchar*) record);
2571
mrg->ref_length=max_pack_length;
2572
mrg->min_pack_length=max_record_length ? min_record_length : 0;
2573
mrg->max_pack_length=max_record_length;
2574
return(error || error_on_write || flush_buffer(~(ulong) 0));
2578
static char *make_new_name(char *new_name, char *old_name)
2580
return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2583
static char *make_old_name(char *new_name, char *old_name)
2585
return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2588
/* rutines for bit writing buffer */
2590
static void init_file_buffer(File file, bool read_buffer)
2592
file_buffer.file=file;
2593
file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2595
file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2596
file_buffer.pos_in_file=0;
2601
file_buffer.pos=file_buffer.end;
2606
file_buffer.pos=file_buffer.buffer;
2607
file_buffer.bits=BITS_SAVED;
2609
file_buffer.bitbucket= 0;
2613
static int flush_buffer(ulong neaded_length)
2618
file_buffer.end is 8 bytes lower than the real end of the buffer.
2619
This is done so that the end-of-buffer condition does not need to be
2620
checked for every byte (see write_bits()). Consequently,
2621
file_buffer.pos can become greater than file_buffer.end. The
2622
algorithms in the other functions ensure that there will never be
2623
more than 8 bytes written to the buffer without an end-of-buffer
2624
check. So the buffer cannot be overrun. But we need to check for the
2625
near-to-buffer-end condition to avoid a negative result, which is
2626
casted to unsigned and thus becomes giant.
2628
if ((file_buffer.pos < file_buffer.end) &&
2629
((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2631
length=(ulong) (file_buffer.pos-file_buffer.buffer);
2632
file_buffer.pos=file_buffer.buffer;
2633
file_buffer.pos_in_file+=length;
2636
if (error_on_write|| my_write(file_buffer.file,
2637
(const uchar*) file_buffer.buffer,
2639
MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2645
if (neaded_length != ~(ulong) 0 &&
2646
(ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2649
neaded_length+=256; /* some margin */
2650
tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
2653
file_buffer.pos= ((uchar*) tmp +
2654
(ulong) (file_buffer.pos - file_buffer.buffer));
2655
file_buffer.buffer= (uchar*) tmp;
2656
file_buffer.end= (uchar*) (tmp+neaded_length-8);
2662
static void end_file_buffer(void)
2664
my_free((uchar*) file_buffer.buffer,MYF(0));
2667
/* output `bits` low bits of `value' */
2669
static void write_bits(register uint64_t value, register uint bits)
2671
assert(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2672
(bits == 8 * sizeof(value)));
2674
if ((file_buffer.bits-= (int) bits) >= 0)
2676
file_buffer.bitbucket|= value << file_buffer.bits;
2680
register uint64_t bit_buffer;
2681
bits= (uint) -file_buffer.bits;
2682
bit_buffer= (file_buffer.bitbucket |
2683
((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2684
#if BITS_SAVED == 64
2685
*file_buffer.pos++= (uchar) (bit_buffer >> 56);
2686
*file_buffer.pos++= (uchar) (bit_buffer >> 48);
2687
*file_buffer.pos++= (uchar) (bit_buffer >> 40);
2688
*file_buffer.pos++= (uchar) (bit_buffer >> 32);
2690
*file_buffer.pos++= (uchar) (bit_buffer >> 24);
2691
*file_buffer.pos++= (uchar) (bit_buffer >> 16);
2692
*file_buffer.pos++= (uchar) (bit_buffer >> 8);
2693
*file_buffer.pos++= (uchar) (bit_buffer);
2695
if (bits != 8 * sizeof(value))
2696
value&= (((uint64_t) 1) << bits) - 1;
2697
if (file_buffer.pos >= file_buffer.end)
2698
VOID(flush_buffer(~ (ulong) 0));
2699
file_buffer.bits=(int) (BITS_SAVED - bits);
2700
file_buffer.bitbucket= value << (BITS_SAVED - bits);
2705
/* Flush bits in bit_buffer to buffer */
2707
static void flush_bits(void)
2710
uint64_t bit_buffer;
2712
bits= file_buffer.bits & ~7;
2713
bit_buffer= file_buffer.bitbucket >> bits;
2714
bits= BITS_SAVED - bits;
2718
*file_buffer.pos++= (uchar) (bit_buffer >> bits);
2720
if (file_buffer.pos >= file_buffer.end)
2721
VOID(flush_buffer(~ (ulong) 0));
2722
file_buffer.bits= BITS_SAVED;
2723
file_buffer.bitbucket= 0;
2727
/****************************************************************************
2728
** functions to handle the joined files
2729
****************************************************************************/
2731
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
2734
MYISAM_SHARE *share=isam_file->s;
2735
uint options=mi_uint2korr(share->state.header.options);
2738
options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2739
mi_int2store(share->state.header.options,options);
2741
share->state.state.data_file_length=new_length;
2742
share->state.state.del=0;
2743
share->state.state.empty=0;
2744
share->state.dellink= HA_OFFSET_ERROR;
2745
share->state.split=(ha_rows) mrg->records;
2746
share->state.version=(ulong) time((time_t*) 0);
2747
if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
2750
Some indexes are disabled, cannot use current key_file_length value
2751
as an estimate of upper bound of index file size. Use packed data file
2754
share->state.state.key_file_length= new_length;
2757
If there are no disabled indexes, keep key_file_length value from
2758
original file so "myisamchk -rq" can use this value (this is necessary
2759
because index size cannot be easily calculated for fulltext keys)
2761
mi_clear_all_keys_active(share->state.key_map);
2762
for (key=0 ; key < share->base.keys ; key++)
2763
share->state.key_root[key]= HA_OFFSET_ERROR;
2764
for (key=0 ; key < share->state.header.max_block_size_index ; key++)
2765
share->state.key_del[key]= HA_OFFSET_ERROR;
2766
isam_file->state->checksum=crc; /* Save crc here */
2767
share->changed=1; /* Force write of header */
2768
share->state.open_count=0;
2769
share->global_changed=0;
2770
(void)ftruncate(share->kfile, share->base.keystart);
2771
if (share->base.keys)
2773
return(mi_state_info_write(share->kfile,&share->state,1+2));
2777
static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
2780
MI_STATE_INFO state;
2781
MI_INFO *isam_file=mrg->file[0];
2784
state= isam_file->s->state;
2785
options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
2786
HA_OPTION_READ_ONLY_DATA);
2787
mi_int2store(state.header.options,options);
2788
state.state.data_file_length=new_length;
2790
state.state.empty=0;
2791
state.state.records=state.split=(ha_rows) mrg->records;
2792
/* See comment above in save_state about key_file_length handling. */
2793
if (mrg->src_file_has_indexes_disabled)
2795
isam_file->s->state.state.key_file_length=
2796
max(isam_file->s->state.state.key_file_length, new_length);
2798
state.dellink= HA_OFFSET_ERROR;
2799
state.version=(ulong) time((time_t*) 0);
2800
mi_clear_all_keys_active(state.key_map);
2801
state.state.checksum=crc;
2802
if (isam_file->s->base.keys)
2804
state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
2805
return (mi_state_info_write(file,&state,1+2));
2809
/* reset for mrg_rrnd */
2811
static void mrg_reset(PACK_MRG_INFO *mrg)
2815
mi_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
2820
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
2828
isam_info= *(info->current=info->file);
2829
info->end=info->current+info->count;
2830
mi_reset(isam_info);
2831
mi_extra(isam_info, HA_EXTRA_CACHE, 0);
2832
filepos=isam_info->s->pack.header_length;
2836
isam_info= *info->current;
2837
filepos= isam_info->nextpos;
2842
isam_info->update&= HA_STATE_CHANGED;
2843
if (!(error=(*isam_info->s->read_rnd)(isam_info,(uchar*) buf,
2845
error != HA_ERR_END_OF_FILE)
2847
mi_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
2848
if (info->current+1 == info->end)
2849
return(HA_ERR_END_OF_FILE);
2851
isam_info= *info->current;
2852
filepos=isam_info->s->pack.header_length;
2853
mi_reset(isam_info);
2854
mi_extra(isam_info,HA_EXTRA_CACHE, 0);
2859
static int mrg_close(PACK_MRG_INFO *mrg)
2863
for (i=0 ; i < mrg->count ; i++)
2864
error|=mi_close(mrg->file[i]);
2866
my_free((uchar*) mrg->file,MYF(0));