~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000-2006 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
14
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
15
16
/* Pack MyISAM file */
17
18
#ifndef USE_MY_FUNC
19
#define USE_MY_FUNC			/* We need at least my_malloc */
20
#endif
21
22
#include "myisamdef.h"
23
#include <queues.h>
24
#include <my_tree.h>
25
#include "mysys_err.h"
26
#ifndef __GNU_LIBRARY__
27
#define __GNU_LIBRARY__			/* Skip warnings in getopt.h */
28
#endif
29
#include <my_getopt.h>
30
#include <assert.h>
31
32
#if SIZEOF_LONG_LONG > 4
33
#define BITS_SAVED 64
34
#else
35
#define BITS_SAVED 32
36
#endif
37
38
#define IS_OFFSET ((uint) 32768)	/* Bit if offset or char in tree */
39
#define HEAD_LENGTH	32
40
#define ALLOWED_JOIN_DIFF	256	/* Diff allowed to join trees */
41
42
#define DATA_TMP_EXT		".TMD"
43
#define OLD_EXT			".OLD"
44
#define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
45
46
struct st_file_buffer {
47
  File file;
48
  uchar *buffer,*pos,*end;
49
  my_off_t pos_in_file;
50
  int bits;
151 by Brian Aker
Ulonglong to uint64_t
51
  uint64_t bitbucket;
1 by brian
clean slate
52
};
53
54
struct st_huff_tree;
55
struct st_huff_element;
56
57
typedef struct st_huff_counts {
58
  uint	field_length,max_zero_fill;
59
  uint	pack_type;
60
  uint	max_end_space,max_pre_space,length_bits,min_space;
61
  ulong max_length;
62
  enum en_fieldtype field_type;
63
  struct st_huff_tree *tree;		/* Tree for field */
64
  my_off_t counts[256];
65
  my_off_t end_space[8];
66
  my_off_t pre_space[8];
67
  my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
68
  TREE int_tree;        /* Tree for detecting distinct column values. */
69
  uchar *tree_buff;      /* Column values, 'field_length' each. */
70
  uchar *tree_pos;       /* Points to end of column values in 'tree_buff'. */
71
} HUFF_COUNTS;
72
73
typedef struct st_huff_element HUFF_ELEMENT;
74
75
/*
76
  WARNING: It is crucial for the optimizations in calc_packed_length()
77
  that 'count' is the first element of 'HUFF_ELEMENT'.
78
*/
79
struct st_huff_element {
80
  my_off_t count;
81
  union un_element {
82
    struct st_nod {
83
      HUFF_ELEMENT *left,*right;
84
    } nod;
85
    struct st_leaf {
86
      HUFF_ELEMENT *null;
87
      uint	element_nr;		/* Number of element */
88
    } leaf;
89
  } a;
90
};
91
92
93
typedef struct st_huff_tree {
94
  HUFF_ELEMENT *root,*element_buffer;
95
  HUFF_COUNTS *counts;
96
  uint tree_number;
97
  uint elements;
98
  my_off_t bytes_packed;
99
  uint tree_pack_length;
100
  uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
151 by Brian Aker
Ulonglong to uint64_t
101
  uint64_t *code;
1 by brian
clean slate
102
  uchar *code_len;
103
} HUFF_TREE;
104
105
106
typedef struct st_isam_mrg {
107
  MI_INFO **file,**current,**end;
108
  uint free_file;
109
  uint count;
110
  uint	min_pack_length;		/* Theese is used by packed data */
111
  uint	max_pack_length;
112
  uint	ref_length;
113
  uint	max_blob_length;
114
  my_off_t records;
115
  /* true if at least one source file has at least one disabled index */
116
  my_bool src_file_has_indexes_disabled;
117
} PACK_MRG_INFO;
118
119
120
extern int main(int argc,char * *argv);
121
static void get_options(int *argc,char ***argv);
122
static MI_INFO *open_isam_file(char *name,int mode);
123
static my_bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
124
static int compress(PACK_MRG_INFO *file,char *join_name);
125
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records);
126
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
127
					   uint trees,
128
					   HUFF_COUNTS *huff_counts,
129
					   uint fields);
130
static int compare_tree(void* cmp_arg __attribute__((unused)),
131
			const uchar *s,const uchar *t);
132
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
133
static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
134
			 my_off_t records);
135
static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
136
			       uint max_space_length,my_off_t *space_counts,
137
			       my_off_t tot_space_count,
138
			       enum en_fieldtype field_type);
139
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
140
static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
141
static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
142
static int save_counts_in_queue(uchar *key,element_count count,
143
				    HUFF_TREE *tree);
144
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
145
static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
146
static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
147
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
148
				    HUFF_ELEMENT *element,uint size,
151 by Brian Aker
Ulonglong to uint64_t
149
				    uint64_t code);
1 by brian
clean slate
150
static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
151
			my_off_t tot_elements,my_off_t filelength);
152
static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
153
static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
154
static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
155
				       HUFF_ELEMENT *element,
156
				       uint *offset);
157
static uint max_bit(uint value);
158
static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
159
static char *make_new_name(char *new_name,char *old_name);
160
static char *make_old_name(char *new_name,char *old_name);
154 by Brian Aker
Removed oddball types in my_global.h
161
static void init_file_buffer(File file,bool read_buffer);
1 by brian
clean slate
162
static int flush_buffer(ulong neaded_length);
163
static void end_file_buffer(void);
151 by Brian Aker
Ulonglong to uint64_t
164
static void write_bits(uint64_t value, uint bits);
1 by brian
clean slate
165
static void flush_bits(void);
166
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
167
		      ha_checksum crc);
168
static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
169
			  ha_checksum crc);
170
static int mrg_close(PACK_MRG_INFO *mrg);
171
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
172
static void mrg_reset(PACK_MRG_INFO *mrg);
173
174
static int error_on_write=0,test_only=0,verbose=0,silent=0,
175
	   write_loop=0,force_pack=0, isamchk_neaded=0;
176
static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
177
static my_bool backup, opt_wait;
178
/*
179
  tree_buff_length is somewhat arbitrary. The bigger it is the better
180
  the chance to win in terms of compression factor. On the other hand,
181
  this table becomes part of the compressed file header. And its length
182
  is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
183
*/
184
static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
185
static char tmp_dir[FN_REFLEN]={0},*join_table;
186
static my_off_t intervall_length;
187
static ha_checksum glob_crc;
188
static struct st_file_buffer file_buffer;
189
static QUEUE queue;
190
static HUFF_COUNTS *global_count;
191
static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
192
static const char *load_default_groups[]= { "myisampack",0 };
193
194
	/* The main program */
195
196
int main(int argc, char **argv)
197
{
198
  int error,ok;
199
  PACK_MRG_INFO merge;
200
  char **default_argv;
201
  MY_INIT(argv[0]);
202
203
  load_defaults("my",load_default_groups,&argc,&argv);
204
  default_argv= argv;
205
  get_options(&argc,&argv);
206
207
  error=ok=isamchk_neaded=0;
208
  if (join_table)
209
  {						/* Join files into one */
210
    if (open_isam_files(&merge,argv,(uint) argc) ||
211
	compress(&merge,join_table))
212
      error=1;
213
  }
214
  else while (argc--)
215
  {
216
    MI_INFO *isam_file;
217
    if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
218
      error=1;
219
    else
220
    {
221
      merge.file= &isam_file;
222
      merge.current=0;
223
      merge.free_file=0;
224
      merge.count=1;
225
      if (compress(&merge,0))
226
	error=1;
227
      else
228
	ok=1;
229
    }
230
  }
231
  if (ok && isamchk_neaded && !silent)
232
    puts("Remember to run myisamchk -rq on compressed tables");
233
  VOID(fflush(stdout));
234
  VOID(fflush(stderr));
235
  free_defaults(default_argv);
236
  my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
237
  exit(error ? 2 : 0);
238
#ifndef _lint
239
  return 0;					/* No compiler warning */
240
#endif
241
}
242
243
enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
244
245
static struct my_option my_long_options[] =
246
{
247
  {"backup", 'b', "Make a backup of the table as table_name.OLD.",
77.1.77 by Monty Taylor
A crapton more warning cleanups (I turned on more warnings)
248
   (char**) &backup, (char**) &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
1 by brian
clean slate
249
  {"character-sets-dir", OPT_CHARSETS_DIR_MP,
77.1.77 by Monty Taylor
A crapton more warning cleanups (I turned on more warnings)
250
   "Directory where character sets are.", (char**) &charsets_dir,
251
   (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
1 by brian
clean slate
252
  {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
253
   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
254
  {"force", 'f',
255
   "Force packing of table even if it gets bigger or if tempfile exists.",
256
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
257
  {"join", 'j',
258
   "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
77.1.77 by Monty Taylor
A crapton more warning cleanups (I turned on more warnings)
259
   (char**) &join_table, (char**) &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
1 by brian
clean slate
260
   0, 0, 0},
261
  {"help", '?', "Display this help and exit.",
262
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
263
  {"silent", 's', "Be more silent.",
264
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
265
  {"tmpdir", 'T', "Use temporary directory to store temporary table.",
266
   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
267
  {"test", 't', "Don't pack table, only test packing it.",
268
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
269
  {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
270
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
271
  {"version", 'V', "Output version information and exit.",
272
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
77.1.77 by Monty Taylor
A crapton more warning cleanups (I turned on more warnings)
273
  {"wait", 'w', "Wait and retry if table is in use.", (char**) &opt_wait,
274
   (char**) &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
1 by brian
clean slate
275
  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
276
};
277
278
static void print_version(void)
279
{
280
  VOID(printf("%s Ver 1.23 for %s on %s\n",
281
              my_progname, SYSTEM_TYPE, MACHINE_TYPE));
282
}
283
284
285
static void usage(void)
286
{
287
  print_version();
288
  puts("Copyright (C) 2002 MySQL AB");
289
  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
290
  puts("and you are welcome to modify and redistribute it under the GPL license\n");
291
292
  puts("Pack a MyISAM-table to take much less space.");
293
  puts("Keys are not updated, you must run myisamchk -rq on the datafile");
294
  puts("afterwards to update the keys.");
295
  puts("You should give the .MYI file as the filename argument.");
296
297
  VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
298
  my_print_help(my_long_options);
299
  print_defaults("my", load_default_groups);
300
  my_print_variables(my_long_options);
301
}
302
77.1.86 by Monty Taylor
A few remaining my_bool -> bool cleanups.
303
static bool
1 by brian
clean slate
304
get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
305
	       char *argument)
306
{
307
  uint length;
308
309
  switch(optid) {
310
#ifdef __NETWARE__
311
  case OPT_AUTO_CLOSE:
312
    setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
313
    break;
314
#endif
315
  case 'f':
316
    force_pack= 1;
317
    tmpfile_createflag= O_RDWR | O_TRUNC;
318
    break;
319
  case 's':
320
    write_loop= verbose= 0;
321
    silent= 1;
322
    break;
323
  case 't':
324
    test_only= 1;
325
    /* Avoid to reset 'verbose' if it was already set > 1. */
326
    if (! verbose)
327
      verbose= 1;
328
    break;
329
  case 'T':
330
    length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
331
    if (length != dirname_length(tmp_dir))
332
    {
333
      tmp_dir[length]=FN_LIBCHAR;
334
      tmp_dir[length+1]=0;
335
    }
336
    break;
337
  case 'v':
338
    verbose++; /* Allow for selecting the level of verbosity. */
339
    silent= 0;
340
    break;
341
  case 'V':
342
    print_version();
343
    exit(0);
344
  case 'I':
345
  case '?':
346
    usage();
347
    exit(0);
348
  }
349
  return 0;
350
}
351
352
	/* reads options */
353
	/* Initiates DEBUG - but no debugging here ! */
354
355
static void get_options(int *argc,char ***argv)
356
{
357
  int ho_error;
358
359
  my_progname= argv[0][0];
360
  if (isatty(fileno(stdout)))
361
    write_loop=1;
362
363
  if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
364
    exit(ho_error);
365
366
  if (!*argc)
367
  {
368
    usage();
369
    exit(1);
370
  }
371
  if (join_table)
372
  {
373
    backup=0;					/* Not needed */
374
    tmp_dir[0]=0;
375
  }
376
  return;
377
}
378
379
380
static MI_INFO *open_isam_file(char *name,int mode)
381
{
382
  MI_INFO *isam_file;
383
  MYISAM_SHARE *share;
384
385
  if (!(isam_file=mi_open(name,mode,
386
			  (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
387
			   HA_OPEN_ABORT_IF_LOCKED))))
388
  {
389
    VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
390
    return(0);
1 by brian
clean slate
391
  }
392
  share=isam_file->s;
393
  if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
394
  {
395
    if (!force_pack)
396
    {
397
      VOID(fprintf(stderr, "%s is already compressed\n", name));
398
      VOID(mi_close(isam_file));
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
399
      return(0);
1 by brian
clean slate
400
    }
401
    if (verbose)
402
      puts("Recompressing already compressed table");
403
    share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
404
  }
405
  if (! force_pack && share->state.state.records != 0 &&
406
      (share->state.state.records <= 1 ||
407
       share->state.state.data_file_length < 1024))
408
  {
409
    VOID(fprintf(stderr, "%s is too small to compress\n", name));
410
    VOID(mi_close(isam_file));
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
411
    return(0);
1 by brian
clean slate
412
  }
413
  VOID(mi_lock_database(isam_file,F_WRLCK));
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
414
  return(isam_file);
1 by brian
clean slate
415
}
416
417
418
static my_bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
419
{
420
  uint i,j;
421
  mrg->count=0;
422
  mrg->current=0;
423
  mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
424
  mrg->free_file=1;
425
  mrg->src_file_has_indexes_disabled= 0;
426
  for (i=0; i < count ; i++)
427
  {
428
    if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
429
      goto error;
430
431
    mrg->src_file_has_indexes_disabled|=
432
      ! mi_is_all_keys_active(mrg->file[i]->s->state.key_map,
433
                              mrg->file[i]->s->base.keys);
434
  }
435
  /* Check that files are identical */
436
  for (j=0 ; j < count-1 ; j++)
437
  {
438
    MI_COLUMNDEF *m1,*m2,*end;
439
    if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
440
	mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
441
      goto diff_file;
442
    m1=mrg->file[j]->s->rec;
443
    end=m1+mrg->file[j]->s->base.fields;
444
    m2=mrg->file[j+1]->s->rec;
445
    for ( ; m1 != end ; m1++,m2++)
446
    {
447
      if (m1->type != m2->type || m1->length != m2->length)
448
	goto diff_file;
449
    }
450
  }
451
  mrg->count=count;
452
  return 0;
453
454
 diff_file:
455
  VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
456
               my_progname, names[j], names[j+1]));
457
 error:
458
  while (i--)
459
    mi_close(mrg->file[i]);
460
  my_free((uchar*) mrg->file,MYF(0));
461
  return 1;
462
}
463
464
465
static int compress(PACK_MRG_INFO *mrg,char *result_table)
466
{
467
  int error;
468
  File new_file,join_isam_file;
469
  MI_INFO *isam_file;
470
  MYISAM_SHARE *share;
471
  char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
472
  uint i,header_length,fields,trees,used_trees;
473
  my_off_t old_length,new_length,tot_elements;
474
  HUFF_COUNTS *huff_counts;
475
  HUFF_TREE *huff_trees;
476
477
  isam_file=mrg->file[0];			/* Take this as an example */
478
  share=isam_file->s;
479
  new_file=join_isam_file= -1;
480
  trees=fields=0;
481
  huff_trees=0;
482
  huff_counts=0;
483
484
  /* Create temporary or join file */
485
486
  if (backup)
487
    VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2));
488
  else
489
    VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2+4+16));
490
  if (!test_only && result_table)
491
  {
492
    /* Make a new indexfile based on first file in list */
493
    uint length;
494
    uchar *buff;
495
    strmov(org_name,result_table);		/* Fix error messages */
496
    VOID(fn_format(new_name,result_table,"",MI_NAME_IEXT,2));
497
    if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
498
	< 0)
499
      goto err;
500
    length=(uint) share->base.keystart;
501
    if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
502
      goto err;
503
    if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
504
	my_write(join_isam_file,buff,length,
505
		 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
506
    {
507
      my_free(buff,MYF(0));
508
      goto err;
509
    }
510
    my_free(buff,MYF(0));
511
    VOID(fn_format(new_name,result_table,"",MI_NAME_DEXT,2));
512
  }
513
  else if (!tmp_dir[0])
514
    VOID(make_new_name(new_name,org_name));
515
  else
516
    VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
517
  if (!test_only &&
518
      (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
519
    goto err;
520
521
  /* Start calculating statistics */
522
523
  mrg->records=0;
524
  for (i=0 ; i < mrg->count ; i++)
525
    mrg->records+=mrg->file[i]->s->state.state.records;
526
527
  if (write_loop || verbose)
528
  {
529
    VOID(printf("Compressing %s: (%lu records)\n",
530
                result_table ? new_name : org_name, (ulong) mrg->records));
531
  }
532
  trees=fields=share->base.fields;
533
  huff_counts=init_huff_count(isam_file,mrg->records);
534
  QUICK_SAFEMALLOC;
535
536
  /*
537
    Read the whole data file(s) for statistics.
538
  */
539
  if (write_loop || verbose)
540
    VOID(printf("- Calculating statistics\n"));
541
  if (get_statistic(mrg,huff_counts))
542
    goto err;
543
  NORMAL_SAFEMALLOC;
544
  old_length=0;
545
  for (i=0; i < mrg->count ; i++)
546
    old_length+= (mrg->file[i]->s->state.state.data_file_length -
547
		  mrg->file[i]->s->state.state.empty);
548
549
  /*
550
    Create a global priority queue in preparation for making 
551
    temporary Huffman trees.
552
  */
553
  if (init_queue(&queue,256,0,0,compare_huff_elements,0))
554
    goto err;
555
556
  /*
557
    Check each column if we should use pre-space-compress, end-space-
558
    compress, empty-field-compress or zero-field-compress.
559
  */
560
  check_counts(huff_counts,fields,mrg->records);
561
562
  /*
563
    Build a Huffman tree for each column.
564
  */
565
  huff_trees=make_huff_trees(huff_counts,trees);
566
567
  /*
568
    If the packed lengths of combined columns is less then the sum of
569
    the non-combined columns, then create common Huffman trees for them.
570
    We do this only for byte compressed columns, not for distinct values
571
    compressed columns.
572
  */
573
  if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
574
    goto err;
575
576
  /*
577
    Assign codes to all byte or column values.
578
  */
579
  if (make_huff_decode_table(huff_trees,fields))
580
    goto err;
581
582
  /* Prepare a file buffer. */
583
  init_file_buffer(new_file,0);
584
585
  /*
586
    Reserve space in the target file for the fixed compressed file header.
587
  */
588
  file_buffer.pos_in_file=HEAD_LENGTH;
589
  if (! test_only)
590
    VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
591
592
  /*
593
    Write field infos: field type, pack type, length bits, tree number.
594
  */
595
  write_field_info(huff_counts,fields,used_trees);
596
597
  /*
598
    Write decode trees.
599
  */
600
  if (!(tot_elements=write_huff_tree(huff_trees,trees)))
601
    goto err;
602
603
  /*
604
    Calculate the total length of the compression info header.
605
    This includes the fixed compressed file header, the column compression
606
    type descriptions, and the decode trees.
607
  */
608
  header_length=(uint) file_buffer.pos_in_file+
609
    (uint) (file_buffer.pos-file_buffer.buffer);
610
611
  /*
612
    Compress the source file into the target file.
613
  */
614
  if (write_loop || verbose)
615
    VOID(printf("- Compressing file\n"));
616
  error=compress_isam_file(mrg,huff_counts);
617
  new_length=file_buffer.pos_in_file;
618
  if (!error && !test_only)
619
  {
620
    uchar buff[MEMMAP_EXTRA_MARGIN];		/* End marginal for memmap */
212.6.1 by Mats Kindahl
Replacing all bzero() calls with memset() calls and removing the bzero.c file.
621
    memset(buff, 0, sizeof(buff));
1 by brian
clean slate
622
    error=my_write(file_buffer.file,buff,sizeof(buff),
623
		   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
624
  }
625
626
  /*
627
    Write the fixed compressed file header.
628
  */
629
  if (!error)
630
    error=write_header(mrg,header_length,used_trees,tot_elements,
631
		       new_length);
632
633
  /* Flush the file buffer. */
634
  end_file_buffer();
635
636
  /* Display statistics. */
637
  if (verbose && mrg->records)
638
    VOID(printf("Min record length: %6d   Max length: %6d   "
639
                "Mean total length: %6ld\n", mrg->min_pack_length,
640
                mrg->max_pack_length, (ulong) (new_length/mrg->records)));
641
642
  /* Close source and target file. */
643
  if (!test_only)
644
  {
645
    error|=my_close(new_file,MYF(MY_WME));
646
    if (!result_table)
647
    {
648
      error|=my_close(isam_file->dfile,MYF(MY_WME));
649
      isam_file->dfile= -1;		/* Tell mi_close file is closed */
650
    }
651
  }
652
653
  /* Cleanup. */
654
  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
655
  if (! test_only && ! error)
656
  {
657
    if (result_table)
658
    {
659
      error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
660
    }
661
    else
662
    {
663
      if (backup)
664
      {
665
	if (my_rename(org_name,make_old_name(temp_name,isam_file->filename),
666
		      MYF(MY_WME)))
667
	  error=1;
668
	else
669
	{
670
	  if (tmp_dir[0])
671
	    error=my_copy(new_name,org_name,MYF(MY_WME));
672
	  else
673
	    error=my_rename(new_name,org_name,MYF(MY_WME));
674
	  if (!error)
675
          {
676
	    VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
677
            if (tmp_dir[0])
678
              VOID(my_delete(new_name,MYF(MY_WME)));
679
          }
680
	}
681
      }
682
      else
683
      {
684
	if (tmp_dir[0])
685
        {
686
	  error=my_copy(new_name,org_name,
687
			MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
688
          if (!error)
689
            VOID(my_delete(new_name,MYF(MY_WME)));
690
        }
691
	else
692
	  error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
693
      }
694
      if (! error)
695
	error=save_state(isam_file,mrg,new_length,glob_crc);
696
    }
697
  }
698
  error|=mrg_close(mrg);
699
  if (join_isam_file >= 0)
700
    error|=my_close(join_isam_file,MYF(MY_WME));
701
  if (error)
702
  {
703
    VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
704
    VOID(my_delete(new_name,MYF(MY_WME)));
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
705
    return(-1);
1 by brian
clean slate
706
  }
707
  if (write_loop || verbose)
708
  {
709
    if (old_length)
710
      VOID(printf("%.4g%%     \n",
152 by Brian Aker
longlong replacement
711
                  (((int64_t) (old_length - new_length)) * 100.0 /
712
                   (int64_t) old_length)));
1 by brian
clean slate
713
    else
714
      puts("Empty file saved in compressed format");
715
  }
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
716
  return(0);
1 by brian
clean slate
717
718
 err:
719
  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
720
  if (new_file >= 0)
721
    VOID(my_close(new_file,MYF(0)));
722
  if (join_isam_file >= 0)
723
    VOID(my_close(join_isam_file,MYF(0)));
724
  mrg_close(mrg);
725
  VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
726
  return(-1);
1 by brian
clean slate
727
}
728
729
	/* Init a huff_count-struct for each field and init it */
730
731
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
732
{
733
  register uint i;
734
  register HUFF_COUNTS *count;
735
  if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
736
					sizeof(HUFF_COUNTS),
737
					MYF(MY_ZEROFILL | MY_WME))))
738
  {
739
    for (i=0 ; i < info->s->base.fields ; i++)
740
    {
741
      enum en_fieldtype type;
742
      count[i].field_length=info->s->rec[i].length;
743
      type= count[i].field_type= (enum en_fieldtype) info->s->rec[i].type;
744
      if (type == FIELD_INTERVALL ||
745
	  type == FIELD_CONSTANT ||
746
	  type == FIELD_ZERO)
747
	type = FIELD_NORMAL;
748
      if (count[i].field_length <= 8 &&
749
	  (type == FIELD_NORMAL ||
750
	   type == FIELD_SKIP_ZERO))
751
	count[i].max_zero_fill= count[i].field_length;
752
      /*
753
        For every column initialize a tree, which is used to detect distinct
754
        column values. 'int_tree' works together with 'tree_buff' and
755
        'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
756
        This is accomplished by '-1' as the element size.
757
      */
758
      init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
759
		NULL);
760
      if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
761
	count[i].tree_pos=count[i].tree_buff =
762
	  my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
763
		    MYF(MY_WME));
764
    }
765
  }
766
  return count;
767
}
768
769
770
	/* Free memory used by counts and trees */
771
772
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
773
					   HUFF_COUNTS *huff_counts,
774
					   uint fields)
775
{
776
  register uint i;
777
778
  if (huff_trees)
779
  {
780
    for (i=0 ; i < trees ; i++)
781
    {
782
      if (huff_trees[i].element_buffer)
783
	my_free((uchar*) huff_trees[i].element_buffer,MYF(0));
784
      if (huff_trees[i].code)
785
	my_free((uchar*) huff_trees[i].code,MYF(0));
786
    }
787
    my_free((uchar*) huff_trees,MYF(0));
788
  }
789
  if (huff_counts)
790
  {
791
    for (i=0 ; i < fields ; i++)
792
    {
793
      if (huff_counts[i].tree_buff)
794
      {
795
	my_free((uchar*) huff_counts[i].tree_buff,MYF(0));
796
	delete_tree(&huff_counts[i].int_tree);
797
      }
798
    }
799
    my_free((uchar*) huff_counts,MYF(0));
800
  }
801
  delete_queue(&queue);		/* This is safe to free */
802
  return;
803
}
804
805
	/* Read through old file and gather some statistics */
806
807
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
808
{
809
  int error;
810
  uint length;
811
  ulong reclength,max_blob_length;
812
  uchar *record,*pos,*next_pos,*end_pos,*start_pos;
813
  ha_rows record_count;
814
  my_bool static_row_size;
815
  HUFF_COUNTS *count,*end_count;
816
  TREE_ELEMENT *element;
817
818
  reclength=mrg->file[0]->s->base.reclength;
819
  record=(uchar*) my_alloca(reclength);
820
  end_count=huff_counts+mrg->file[0]->s->base.fields;
821
  record_count=0; glob_crc=0;
822
  max_blob_length=0;
823
824
  /* Check how to calculate checksum */
825
  static_row_size=1;
826
  for (count=huff_counts ; count < end_count ; count++)
827
  {
828
    if (count->field_type == FIELD_BLOB ||
829
        count->field_type == FIELD_VARCHAR)
830
    {
831
      static_row_size=0;
832
      break;
833
    }
834
  }
835
836
  mrg_reset(mrg);
837
  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
838
  {
839
    ulong tot_blob_length=0;
840
    if (! error)
841
    {
842
      /* glob_crc is a checksum over all bytes of all records. */
843
      if (static_row_size)
844
	glob_crc+=mi_static_checksum(mrg->file[0],record);
845
      else
846
	glob_crc+=mi_checksum(mrg->file[0],record);
847
848
      /* Count the incidence of values separately for every column. */
849
      for (pos=record,count=huff_counts ;
850
	   count < end_count ;
851
	   count++,
852
	   pos=next_pos)
853
      {
854
	next_pos=end_pos=(start_pos=pos)+count->field_length;
855
856
	/*
857
          Put the whole column value in a tree if there is room for it.
858
          'int_tree' is used to quickly check for duplicate values.
859
          'tree_buff' collects as many distinct column values as
860
          possible. If the field length is > 1, it is tree_buff_length,
861
          else 2 bytes. Each value is 'field_length' bytes big. If there
862
          are more distinct column values than fit into the buffer, we
863
          give up with this tree. BLOBs and VARCHARs do not have a
864
          tree_buff as it can only be used with fixed length columns.
865
          For the special case of field length == 1, we handle only the
866
          case that there is only one distinct value in the table(s).
867
          Otherwise, we can have a maximum of 256 distinct values. This
868
          is then handled by the normal Huffman tree build.
869
870
          Another limit for collecting distinct column values is the
871
          number of values itself. Since we would need to build a
872
          Huffman tree for the values, we are limited by the 'IS_OFFSET'
873
          constant. This constant expresses a bit which is used to
874
          determine if a tree element holds a final value or an offset
875
          to a child element. Hence, all values and offsets need to be
876
          smaller than 'IS_OFFSET'. A tree element is implemented with
877
          two integer values, one for the left branch and one for the
878
          right branch. For the extreme case that the first element
879
          points to the last element, the number of integers in the tree
880
          must be less or equal to IS_OFFSET. So the number of elements
881
          must be less or equal to IS_OFFSET / 2.
882
883
          WARNING: At first, we insert a pointer into the record buffer
884
          as the key for the tree. If we got a new distinct value, which
885
          is really inserted into the tree, instead of being counted
886
          only, we will copy the column value from the record buffer to
887
          'tree_buff' and adjust the key pointer of the tree accordingly.
888
        */
889
	if (count->tree_buff)
890
	{
891
	  global_count=count;
892
	  if (!(element=tree_insert(&count->int_tree,pos, 0, 
893
				    count->int_tree.custom_arg)) ||
894
	      (element->count == 1 &&
895
	       (count->tree_buff + tree_buff_length <
896
                count->tree_pos + count->field_length)) ||
897
              (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
898
	      (count->field_length == 1 &&
899
	       count->int_tree.elements_in_tree > 1))
900
	  {
901
	    delete_tree(&count->int_tree);
902
	    my_free(count->tree_buff,MYF(0));
903
	    count->tree_buff=0;
904
	  }
905
	  else
906
	  {
907
            /*
908
              If tree_insert() succeeds, it either creates a new element
909
              or increments the counter of an existing element.
910
            */
911
	    if (element->count == 1)
912
	    {
913
              /* Copy the new column value into 'tree_buff'. */
914
	      memcpy(count->tree_pos,pos,(size_t) count->field_length);
915
              /* Adjust the key pointer in the tree. */
916
	      tree_set_pointer(element,count->tree_pos);
917
              /* Point behind the last column value so far. */
918
	      count->tree_pos+=count->field_length;
919
	    }
920
	  }
921
	}
922
923
	/* Save character counters and space-counts and zero-field-counts */
924
	if (count->field_type == FIELD_NORMAL ||
925
	    count->field_type == FIELD_SKIP_ENDSPACE)
926
	{
927
          /* Ignore trailing space. */
928
	  for ( ; end_pos > pos ; end_pos--)
929
	    if (end_pos[-1] != ' ')
930
	      break;
931
          /* Empty fields are just counted. Go to the next record. */
932
	  if (end_pos == pos)
933
	  {
934
	    count->empty_fields++;
935
	    count->max_zero_fill=0;
936
	    continue;
937
	  }
938
          /*
939
            Count the total of all trailing spaces and the number of
940
            short trailing spaces. Remember the longest trailing space.
941
          */
942
	  length= (uint) (next_pos-end_pos);
943
	  count->tot_end_space+=length;
944
	  if (length < 8)
945
	    count->end_space[length]++;
946
	  if (count->max_end_space < length)
947
	    count->max_end_space = length;
948
	}
949
950
	if (count->field_type == FIELD_NORMAL ||
951
	    count->field_type == FIELD_SKIP_PRESPACE)
952
	{
953
          /* Ignore leading space. */
954
	  for (pos=start_pos; pos < end_pos ; pos++)
955
	    if (pos[0] != ' ')
956
	      break;
957
          /* Empty fields are just counted. Go to the next record. */
958
	  if (end_pos == pos)
959
	  {
960
	    count->empty_fields++;
961
	    count->max_zero_fill=0;
962
	    continue;
963
	  }
964
          /*
965
            Count the total of all leading spaces and the number of
966
            short leading spaces. Remember the longest leading space.
967
          */
968
	  length= (uint) (pos-start_pos);
969
	  count->tot_pre_space+=length;
970
	  if (length < 8)
971
	    count->pre_space[length]++;
972
	  if (count->max_pre_space < length)
973
	    count->max_pre_space = length;
974
	}
975
976
        /* Calculate pos, end_pos, and max_length for variable length fields. */
977
	if (count->field_type == FIELD_BLOB)
978
	{
979
	  uint field_length=count->field_length -portable_sizeof_char_ptr;
980
	  ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
981
	  memcpy_fixed((char*) &pos,  start_pos+field_length,sizeof(char*));
982
	  end_pos=pos+blob_length;
983
	  tot_blob_length+=blob_length;
984
	  set_if_bigger(count->max_length,blob_length);
985
	}
986
	else if (count->field_type == FIELD_VARCHAR)
987
	{
988
          uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
989
	  length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
990
                   uint2korr(start_pos));
991
	  pos= start_pos+pack_length;
992
	  end_pos= pos+length;
993
	  set_if_bigger(count->max_length,length);
994
	}
995
996
        /* Evaluate 'max_zero_fill' for short fields. */
997
	if (count->field_length <= 8 &&
998
	    (count->field_type == FIELD_NORMAL ||
999
	     count->field_type == FIELD_SKIP_ZERO))
1000
	{
1001
	  uint i;
1002
          /* Zero fields are just counted. Go to the next record. */
1003
	  if (!memcmp((uchar*) start_pos,zero_string,count->field_length))
1004
	  {
1005
	    count->zero_fields++;
1006
	    continue;
1007
	  }
1008
          /*
1009
            max_zero_fill starts with field_length. It is decreased every
1010
            time a shorter "zero trailer" is found. It is set to zero when
1011
            an empty field is found (see above). This suggests that the
1012
            variable should be called 'min_zero_fill'.
1013
          */
1014
	  for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1015
	       i++) ;
1016
	  if (i < count->max_zero_fill)
1017
	    count->max_zero_fill=i;
1018
	}
1019
1020
        /* Ignore zero fields and check fields. */
1021
	if (count->field_type == FIELD_ZERO ||
1022
	    count->field_type == FIELD_CHECK)
1023
	  continue;
1024
1025
        /*
1026
          Count the incidence of every byte value in the
1027
          significant field value.
1028
        */
1029
	for ( ; pos < end_pos ; pos++)
1030
	  count->counts[(uchar) *pos]++;
1031
1032
        /* Step to next field. */
1033
      }
1034
1035
      if (tot_blob_length > max_blob_length)
1036
	max_blob_length=tot_blob_length;
1037
      record_count++;
1038
      if (write_loop && record_count % WRITE_COUNT == 0)
1039
      {
1040
	VOID(printf("%lu\r", (ulong) record_count));
1041
        VOID(fflush(stdout));
1042
      }
1043
    }
1044
    else if (error != HA_ERR_RECORD_DELETED)
1045
    {
1046
      VOID(fprintf(stderr, "Got error %d while reading rows", error));
1047
      break;
1048
    }
1049
1050
    /* Step to next record. */
1051
  }
1052
  if (write_loop)
1053
  {
1054
    VOID(printf("            \r"));
1055
    VOID(fflush(stdout));
1056
  }
1057
1058
  if (verbose >= 2)
1059
    VOID(printf("Found the following number of incidents "
1060
                "of the byte codes:\n"));
1061
  for (count= huff_counts ; count < end_count; count++)
1062
  {
1063
    uint      idx;
1064
    my_off_t  total_count;
1065
    char      llbuf[32];
1066
1067
    if (verbose >= 2)
1068
      VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
1069
    if (count->tree_buff)
1070
    {
1071
      if (verbose >= 2)
1072
        VOID(printf("number of distinct values: %u\n",
1073
                    (uint) ((count->tree_pos - count->tree_buff) /
1074
                            count->field_length)));
1075
    }
1076
    total_count= 0;
1077
    for (idx= 0; idx < 256; idx++)
1078
    {
1079
      if (count->counts[idx])
1080
      {
1081
        total_count+= count->counts[idx];
1082
        if (verbose >= 2)
1083
          VOID(printf("counts[0x%02x]: %12s\n", idx,
152 by Brian Aker
longlong replacement
1084
                      llstr((int64_t) count->counts[idx], llbuf)));
1 by brian
clean slate
1085
      }
1086
    }
1087
    if ((verbose >= 2) && total_count)
1088
    {
1089
      VOID(printf("total:        %12s\n",
152 by Brian Aker
longlong replacement
1090
                  llstr((int64_t) total_count, llbuf)));
1 by brian
clean slate
1091
    }
1092
  }
1093
1094
  mrg->records=record_count;
1095
  mrg->max_blob_length=max_blob_length;
1096
  my_afree((uchar*) record);
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
1097
  return(error != HA_ERR_END_OF_FILE);
1 by brian
clean slate
1098
}
1099
1100
static int compare_huff_elements(void *not_used __attribute__((unused)),
1101
				 uchar *a, uchar *b)
1102
{
1103
  return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1104
    (*((my_off_t*) a) == *((my_off_t*) b)  ? 0 : 1);
1105
}
1106
1107
	/* Check each tree if we should use pre-space-compress, end-space-
1108
	   compress, empty-field-compress or zero-field-compress */
1109
1110
static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1111
			 my_off_t records)
1112
{
1113
  uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1114
  my_off_t old_length,new_length,length;
1115
212.6.1 by Mats Kindahl
Replacing all bzero() calls with memset() calls and removing the bzero.c file.
1116
  memset((uchar*) field_count, 0, sizeof(field_count));
1 by brian
clean slate
1117
  space_fields=fill_zero_fields=0;
1118
1119
  for (; trees-- ; huff_counts++)
1120
  {
1121
    if (huff_counts->field_type == FIELD_BLOB)
1122
    {
1123
      huff_counts->length_bits=max_bit(huff_counts->max_length);
1124
      goto found_pack;
1125
    }
1126
    else if (huff_counts->field_type == FIELD_VARCHAR)
1127
    {
1128
      huff_counts->length_bits=max_bit(huff_counts->max_length);
1129
      goto found_pack;
1130
    }
1131
    else if (huff_counts->field_type == FIELD_CHECK)
1132
    {
1133
      huff_counts->bytes_packed=0;
1134
      huff_counts->counts[0]=0;
1135
      goto found_pack;
1136
    }
1137
1138
    huff_counts->field_type=FIELD_NORMAL;
1139
    huff_counts->pack_type=0;
1140
1141
    /* Check for zero-filled records (in this column), or zero records. */
1142
    if (huff_counts->zero_fields || ! records)
1143
    {
1144
      my_off_t old_space_count;
1145
      /*
1146
        If there are only zero filled records (in this column),
1147
        or no records at all, we are done.
1148
      */
1149
      if (huff_counts->zero_fields == records)
1150
      {
1151
	huff_counts->field_type= FIELD_ZERO;
1152
	huff_counts->bytes_packed=0;
1153
	huff_counts->counts[0]=0;
1154
	goto found_pack;
1155
      }
1156
      /* Remeber the number of significant spaces. */
1157
      old_space_count=huff_counts->counts[' '];
1158
      /* Add all leading and trailing spaces. */
1159
      huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1160
                                  huff_counts->tot_pre_space +
1161
                                  huff_counts->empty_fields *
1162
                                  huff_counts->field_length);
1163
      /* Check, what the compressed length of this would be. */
1164
      old_length=calc_packed_length(huff_counts,0)+records/8;
1165
      /* Get the number of zero bytes. */
1166
      length=huff_counts->zero_fields*huff_counts->field_length;
1167
      /* Add it to the counts. */
1168
      huff_counts->counts[0]+=length;
1169
      /* Check, what the compressed length of this would be. */
1170
      new_length=calc_packed_length(huff_counts,0);
1171
      /* If the compression without the zeroes would be shorter, we are done. */
1172
      if (old_length < new_length && huff_counts->field_length > 1)
1173
      {
1174
	huff_counts->field_type=FIELD_SKIP_ZERO;
1175
	huff_counts->counts[0]-=length;
1176
	huff_counts->bytes_packed=old_length- records/8;
1177
	goto found_pack;
1178
      }
1179
      /* Remove the insignificant spaces, but keep the zeroes. */
1180
      huff_counts->counts[' ']=old_space_count;
1181
    }
1182
    /* Check, what the compressed length of this column would be. */
1183
    huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1184
1185
    /*
1186
      If there are enough empty records (in this column),
1187
      treating them specially may pay off.
1188
    */
1189
    if (huff_counts->empty_fields)
1190
    {
1191
      if (huff_counts->field_length > 2 &&
1192
	  huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1193
	  (1+max_bit(max(huff_counts->max_pre_space,
1194
			 huff_counts->max_end_space))) <
1195
	  records * max_bit(huff_counts->field_length))
1196
      {
1197
	huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1198
      }
1199
      else
1200
      {
1201
	length=huff_counts->empty_fields*huff_counts->field_length;
1202
	if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1203
	{
1204
	  huff_counts->tot_end_space+=length;
1205
	  huff_counts->max_end_space=huff_counts->field_length;
1206
	  if (huff_counts->field_length < 8)
1207
	    huff_counts->end_space[huff_counts->field_length]+=
1208
	      huff_counts->empty_fields;
1209
	}
1210
	if (huff_counts->tot_pre_space)
1211
	{
1212
	  huff_counts->tot_pre_space+=length;
1213
	  huff_counts->max_pre_space=huff_counts->field_length;
1214
	  if (huff_counts->field_length < 8)
1215
	    huff_counts->pre_space[huff_counts->field_length]+=
1216
	      huff_counts->empty_fields;
1217
	}
1218
      }
1219
    }
1220
1221
    /*
1222
      If there are enough trailing spaces (in this column),
1223
      treating them specially may pay off.
1224
    */
1225
    if (huff_counts->tot_end_space)
1226
    {
1227
      huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1228
      if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1229
			      huff_counts->end_space,
1230
			      huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1231
	goto found_pack;
1232
      huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1233
    }
1234
1235
    /*
1236
      If there are enough leading spaces (in this column),
1237
      treating them specially may pay off.
1238
    */
1239
    if (huff_counts->tot_pre_space)
1240
    {
1241
      if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1242
			      huff_counts->pre_space,
1243
			      huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1244
	goto found_pack;
1245
    }
1246
1247
  found_pack:			/* Found field-packing */
1248
1249
    /* Test if we can use zero-fill */
1250
1251
    if (huff_counts->max_zero_fill &&
1252
	(huff_counts->field_type == FIELD_NORMAL ||
1253
	 huff_counts->field_type == FIELD_SKIP_ZERO))
1254
    {
1255
      huff_counts->counts[0]-=huff_counts->max_zero_fill*
1256
	(huff_counts->field_type == FIELD_SKIP_ZERO ?
1257
	 records - huff_counts->zero_fields : records);
1258
      huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1259
      huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1260
    }
1261
1262
    /* Test if intervall-field is better */
1263
1264
    if (huff_counts->tree_buff)
1265
    {
1266
      HUFF_TREE tree;
1267
1268
      tree.element_buffer=0;
1269
      if (!make_huff_tree(&tree,huff_counts) &&
1270
	  tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1271
      {
1272
	if (tree.elements == 1)
1273
	  huff_counts->field_type=FIELD_CONSTANT;
1274
	else
1275
	  huff_counts->field_type=FIELD_INTERVALL;
1276
	huff_counts->pack_type=0;
1277
      }
1278
      else
1279
      {
1280
	my_free((uchar*) huff_counts->tree_buff,MYF(0));
1281
	delete_tree(&huff_counts->int_tree);
1282
	huff_counts->tree_buff=0;
1283
      }
1284
      if (tree.element_buffer)
1285
	my_free((uchar*) tree.element_buffer,MYF(0));
1286
    }
1287
    if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1288
      space_fields++;
1289
    if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1290
      fill_zero_fields++;
1291
    field_count[huff_counts->field_type]++;
1292
  }
1293
  if (verbose)
1294
    VOID(printf("\nnormal:    %3d  empty-space:     %3d  "
1295
                "empty-zero:       %3d  empty-fill: %3d\n"
1296
                "pre-space: %3d  end-space:       %3d  "
1297
                "intervall-fields: %3d  zero:       %3d\n",
1298
                field_count[FIELD_NORMAL],space_fields,
1299
                field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1300
                field_count[FIELD_SKIP_PRESPACE],
1301
                field_count[FIELD_SKIP_ENDSPACE],
1302
                field_count[FIELD_INTERVALL],
1303
                field_count[FIELD_ZERO]));
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
1304
  return;
1 by brian
clean slate
1305
}
1306
1307
	/* Test if we can use space-compression and empty-field-compression */
1308
1309
static int
1310
test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1311
		    uint max_space_length, my_off_t *space_counts,
1312
		    my_off_t tot_space_count, enum en_fieldtype field_type)
1313
{
1314
  int min_pos;
1315
  uint length_bits,i;
1316
  my_off_t space_count,min_space_count,min_pack,new_length,skip;
1317
1318
  length_bits=max_bit(max_space_length);
1319
1320
		/* Default no end_space-packing */
1321
  space_count=huff_counts->counts[(uint) ' '];
1322
  min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1323
  min_pack=calc_packed_length(huff_counts,0);
1324
  min_pos= -2;
1325
  huff_counts->counts[(uint) ' ']=space_count;
1326
1327
	/* Test with allways space-count */
1328
  new_length=huff_counts->bytes_packed+length_bits*records/8;
1329
  if (new_length+1 < min_pack)
1330
  {
1331
    min_pos= -1;
1332
    min_pack=new_length;
1333
    min_space_count=space_count;
1334
  }
1335
	/* Test with length-flag */
1336
  for (skip=0L, i=0 ; i < 8 ; i++)
1337
  {
1338
    if (space_counts[i])
1339
    {
1340
      if (i)
1341
	huff_counts->counts[(uint) ' ']+=space_counts[i];
1342
      skip+=huff_counts->pre_space[i];
1343
      new_length=calc_packed_length(huff_counts,0)+
1344
	(records+(records-skip)*(1+length_bits))/8;
1345
      if (new_length < min_pack)
1346
      {
1347
	min_pos=(int) i;
1348
	min_pack=new_length;
1349
	min_space_count=huff_counts->counts[(uint) ' '];
1350
      }
1351
    }
1352
  }
1353
1354
  huff_counts->counts[(uint) ' ']=min_space_count;
1355
  huff_counts->bytes_packed=min_pack;
1356
  switch (min_pos) {
1357
  case -2:
1358
    return(0);				/* No space-compress */
1359
  case -1:				/* Always space-count */
1360
    huff_counts->field_type=field_type;
1361
    huff_counts->min_space=0;
1362
    huff_counts->length_bits=max_bit(max_space_length);
1363
    break;
1364
  default:
1365
    huff_counts->field_type=field_type;
1366
    huff_counts->min_space=(uint) min_pos;
1367
    huff_counts->pack_type|=PACK_TYPE_SELECTED;
1368
    huff_counts->length_bits=max_bit(max_space_length);
1369
    break;
1370
  }
1371
  return(1);				/* Using space-compress */
1372
}
1373
1374
1375
	/* Make a huff_tree of each huff_count */
1376
1377
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1378
{
1379
  uint tree;
1380
  HUFF_TREE *huff_tree;
1381
1382
  if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1383
					 MYF(MY_WME | MY_ZEROFILL))))
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
1384
    return(0);
1 by brian
clean slate
1385
1386
  for (tree=0 ; tree < trees ; tree++)
1387
  {
1388
    if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1389
    {
1390
      while (tree--)
1391
	my_free((uchar*) huff_tree[tree].element_buffer,MYF(0));
1392
      my_free((uchar*) huff_tree,MYF(0));
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
1393
      return(0);
1 by brian
clean slate
1394
    }
1395
  }
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
1396
  return(huff_tree);
1 by brian
clean slate
1397
}
1398
1399
/*
1400
  Build a Huffman tree.
1401
1402
  SYNOPSIS
1403
    make_huff_tree()
1404
    huff_tree                   The Huffman tree.
1405
    huff_counts                 The counts.
1406
1407
  DESCRIPTION
1408
    Build a Huffman tree according to huff_counts->counts or
1409
    huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1410
    tree_buff_length of distinct column values. In that case, whole
1411
    values can be Huffman encoded instead of single bytes.
1412
1413
  RETURN
1414
    0           OK
1415
    != 0        Error
1416
*/
1417
1418
static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1419
{
1420
  uint i,found,bits_packed,first,last;
1421
  my_off_t bytes_packed;
1422
  HUFF_ELEMENT *a,*b,*new_huff_el;
1423
1424
  first=last=0;
1425
  if (huff_counts->tree_buff)
1426
  {
1427
    /* Calculate the number of distinct values in tree_buff. */
1428
    found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1429
      huff_counts->field_length;
1430
    first=0; last=found-1;
1431
  }
1432
  else
1433
  {
1434
    /* Count the number of byte codes found in the column. */
1435
    for (i=found=0 ; i < 256 ; i++)
1436
    {
1437
      if (huff_counts->counts[i])
1438
      {
1439
	if (! found++)
1440
	  first=i;
1441
	last=i;
1442
      }
1443
    }
1444
    if (found < 2)
1445
      found=2;
1446
  }
1447
1448
  /* When using 'tree_buff' we can have more that 256 values. */
1449
  if (queue.max_elements < found)
1450
  {
1451
    delete_queue(&queue);
1452
    if (init_queue(&queue,found,0,0,compare_huff_elements,0))
1453
      return -1;
1454
  }
1455
1456
  /* Allocate or reallocate an element buffer for the Huffman tree. */
1457
  if (!huff_tree->element_buffer)
1458
  {
1459
    if (!(huff_tree->element_buffer=
1460
	 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1461
      return 1;
1462
  }
1463
  else
1464
  {
1465
    HUFF_ELEMENT *temp;
1466
    if (!(temp=
1467
	  (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1468
				     found*2*sizeof(HUFF_ELEMENT),
1469
				     MYF(MY_WME))))
1470
      return 1;
1471
    huff_tree->element_buffer=temp;
1472
  }
1473
1474
  huff_counts->tree=huff_tree;
1475
  huff_tree->counts=huff_counts;
1476
  huff_tree->min_chr=first;
1477
  huff_tree->max_chr=last;
1478
  huff_tree->char_bits=max_bit(last-first);
1479
  huff_tree->offset_bits=max_bit(found-1)+1;
1480
1481
  if (huff_counts->tree_buff)
1482
  {
1483
    huff_tree->elements=0;
1484
    huff_tree->tree_pack_length=(1+15+16+5+5+
1485
				 (huff_tree->char_bits+1)*found+
1486
				 (huff_tree->offset_bits+1)*
1487
				 (found-2)+7)/8 +
1488
				   (uint) (huff_tree->counts->tree_pos-
1489
					   huff_tree->counts->tree_buff);
1490
    /*
1491
      Put a HUFF_ELEMENT into the queue for every distinct column value.
1492
1493
      tree_walk() calls save_counts_in_queue() for every element in
1494
      'int_tree'. This takes elements from the target trees element
1495
      buffer and places references to them into the buffer of the
1496
      priority queue. We insert in column value order, but the order is
1497
      in fact irrelevant here. We will establish the correct order
1498
      later.
1499
    */
1500
    tree_walk(&huff_counts->int_tree,
1501
	      (int (*)(void*, element_count,void*)) save_counts_in_queue,
1502
	      (uchar*) huff_tree, left_root_right);
1503
  }
1504
  else
1505
  {
1506
    huff_tree->elements=found;
1507
    huff_tree->tree_pack_length=(9+9+5+5+
1508
				 (huff_tree->char_bits+1)*found+
1509
				 (huff_tree->offset_bits+1)*
1510
				 (found-2)+7)/8;
1511
    /*
1512
      Put a HUFF_ELEMENT into the queue for every byte code found in the column.
1513
1514
      The elements are taken from the target trees element buffer.
1515
      Instead of using queue_insert(), we just place references to the
1516
      elements into the buffer of the priority queue. We insert in byte
1517
      value order, but the order is in fact irrelevant here. We will
1518
      establish the correct order later.
1519
    */
1520
    for (i=first, found=0 ; i <= last ; i++)
1521
    {
1522
      if (huff_counts->counts[i])
1523
      {
1524
	new_huff_el=huff_tree->element_buffer+(found++);
1525
	new_huff_el->count=huff_counts->counts[i];
1526
	new_huff_el->a.leaf.null=0;
1527
	new_huff_el->a.leaf.element_nr=i;
1528
	queue.root[found]=(uchar*) new_huff_el;
1529
      }
1530
    }
1531
    /*
1532
      If there is only a single byte value in this field in all records,
1533
      add a second element with zero incidence. This is required to enter
1534
      the loop, which builds the Huffman tree.
1535
    */
1536
    while (found < 2)
1537
    {
1538
      new_huff_el=huff_tree->element_buffer+(found++);
1539
      new_huff_el->count=0;
1540
      new_huff_el->a.leaf.null=0;
1541
      if (last)
1542
	new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1543
      else
1544
	new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1545
      queue.root[found]=(uchar*) new_huff_el;
1546
    }
1547
  }
1548
1549
  /* Make a queue from the queue buffer. */
1550
  queue.elements=found;
1551
1552
  /*
1553
    Make a priority queue from the queue. Construct its index so that we
1554
    have a partially ordered tree.
1555
  */
1556
  for (i=found/2 ; i > 0 ; i--)
1557
    _downheap(&queue,i);
1558
1559
  /* The Huffman algorithm. */
1560
  bytes_packed=0; bits_packed=0;
1561
  for (i=1 ; i < found ; i++)
1562
  {
1563
    /*
1564
      Pop the top element from the queue (the one with the least incidence).
1565
      Popping from a priority queue includes a re-ordering of the queue,
1566
      to get the next least incidence element to the top.
1567
    */
1568
    a=(HUFF_ELEMENT*) queue_remove(&queue,0);
1569
    /*
1570
      Copy the next least incidence element. The queue implementation
1571
      reserves root[0] for temporary purposes. root[1] is the top.
1572
    */
1573
    b=(HUFF_ELEMENT*) queue.root[1];
1574
    /* Get a new element from the element buffer. */
1575
    new_huff_el=huff_tree->element_buffer+found+i;
1576
    /* The new element gets the sum of the two least incidence elements. */
1577
    new_huff_el->count=a->count+b->count;
1578
    /*
1579
      The Huffman algorithm assigns another bit to the code for a byte
1580
      every time that bytes incidence is combined (directly or indirectly)
1581
      to a new element as one of the two least incidence elements.
1582
      This means that one more bit per incidence of that byte is required
1583
      in the resulting file. So we add the new combined incidence as the
1584
      number of bits by which the result grows.
1585
    */
1586
    bits_packed+=(uint) (new_huff_el->count & 7);
1587
    bytes_packed+=new_huff_el->count/8;
1588
    /* The new element points to its children, lesser in left.  */
1589
    new_huff_el->a.nod.left=a;
1590
    new_huff_el->a.nod.right=b;
1591
    /*
1592
      Replace the copied top element by the new element and re-order the
1593
      queue.
1594
    */
1595
    queue.root[1]=(uchar*) new_huff_el;
1596
    queue_replaced(&queue);
1597
  }
1598
  huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1599
  huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1600
  return 0;
1601
}
1602
1603
static int compare_tree(void* cmp_arg __attribute__((unused)),
1604
			register const uchar *s, register const uchar *t)
1605
{
1606
  uint length;
1607
  for (length=global_count->field_length; length-- ;)
1608
    if (*s++ != *t++)
1609
      return (int) s[-1] - (int) t[-1];
1610
  return 0;
1611
}
1612
1613
/*
1614
  Organize distinct column values and their incidences into a priority queue.
1615
1616
  SYNOPSIS
1617
    save_counts_in_queue()
1618
    key                         The column value.
1619
    count                       The incidence of this value.
1620
    tree                        The Huffman tree to be built later.
1621
1622
  DESCRIPTION
1623
    We use the element buffer of the targeted tree. The distinct column
1624
    values are organized in a priority queue first. The Huffman
1625
    algorithm will later organize the elements into a Huffman tree. For
1626
    the time being, we just place references to the elements into the
1627
    queue buffer. The buffer will later be organized into a priority
1628
    queue.
1629
1630
  RETURN
1631
    0
1632
 */
1633
1634
static int save_counts_in_queue(uchar *key, element_count count,
1635
				HUFF_TREE *tree)
1636
{
1637
  HUFF_ELEMENT *new_huff_el;
1638
1639
  new_huff_el=tree->element_buffer+(tree->elements++);
1640
  new_huff_el->count=count;
1641
  new_huff_el->a.leaf.null=0;
1642
  new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1643
    tree->counts->field_length;
1644
  queue.root[tree->elements]=(uchar*) new_huff_el;
1645
  return 0;
1646
}
1647
1648
1649
/*
1650
  Calculate length of file if given counts should be used.
1651
1652
  SYNOPSIS
1653
    calc_packed_length()
1654
    huff_counts                 The counts for a column of the table(s).
1655
    add_tree_lenght             If the decode tree length should be added.
1656
1657
  DESCRIPTION
1658
    We need to follow the Huffman algorithm until we know, how many bits
1659
    are required for each byte code. But we do not need the resulting
1660
    Huffman tree. Hence, we can leave out some steps which are essential
1661
    in make_huff_tree().
1662
1663
  RETURN
1664
    Number of bytes required to compress this table column.
1665
*/
1666
1667
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1668
				   uint add_tree_lenght)
1669
{
1670
  uint i,found,bits_packed,first,last;
1671
  my_off_t bytes_packed;
1672
  HUFF_ELEMENT element_buffer[256];
1673
1674
  /* 
1675
    WARNING: We use a small hack for efficiency: Instead of placing
1676
    references to HUFF_ELEMENTs into the queue, we just insert
1677
    references to the counts of the byte codes which appeared in this
1678
    table column. During the Huffman algorithm they are successively
1679
    replaced by references to HUFF_ELEMENTs. This works, because
1680
    HUFF_ELEMENTs have the incidence count at their beginning.
1681
    Regardless, wether the queue array contains references to counts of
1682
    type my_off_t or references to HUFF_ELEMENTs which have the count of
1683
    type my_off_t at their beginning, it always points to a count of the
1684
    same type.
1685
1686
    Instead of using queue_insert(), we just copy the references into
1687
    the buffer of the priority queue. We insert in byte value order, but
1688
    the order is in fact irrelevant here. We will establish the correct
1689
    order later.
1690
  */
1691
  first=last=0;
1692
  for (i=found=0 ; i < 256 ; i++)
1693
  {
1694
    if (huff_counts->counts[i])
1695
    {
1696
      if (! found++)
1697
	first=i;
1698
      last=i;
1699
      /* We start with root[1], which is the queues top element. */
1700
      queue.root[found]=(uchar*) &huff_counts->counts[i];
1701
    }
1702
  }
1703
  if (!found)
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
1704
    return(0);			/* Empty tree */
1 by brian
clean slate
1705
  /*
1706
    If there is only a single byte value in this field in all records,
1707
    add a second element with zero incidence. This is required to enter
1708
    the loop, which follows the Huffman algorithm.
1709
  */
1710
  if (found < 2)
1711
    queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1712
1713
  /* Make a queue from the queue buffer. */
1714
  queue.elements=found;
1715
1716
  bytes_packed=0; bits_packed=0;
1717
  /* Add the length of the coding table, which would become part of the file. */
1718
  if (add_tree_lenght)
1719
    bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1720
		  (max_bit(found-1)+1+1)*(found-2) +7)/8;
1721
1722
  /*
1723
    Make a priority queue from the queue. Construct its index so that we
1724
    have a partially ordered tree.
1725
  */
1726
  for (i=(found+1)/2 ; i > 0 ; i--)
1727
    _downheap(&queue,i);
1728
1729
  /* The Huffman algorithm. */
1730
  for (i=0 ; i < found-1 ; i++)
1731
  {
1732
    my_off_t        *a;
1733
    my_off_t        *b;
1734
    HUFF_ELEMENT    *new_huff_el;
1735
1736
    /*
1737
      Pop the top element from the queue (the one with the least
1738
      incidence). Popping from a priority queue includes a re-ordering
1739
      of the queue, to get the next least incidence element to the top.
1740
    */
1741
    a= (my_off_t*) queue_remove(&queue, 0);
1742
    /*
1743
      Copy the next least incidence element. The queue implementation
1744
      reserves root[0] for temporary purposes. root[1] is the top.
1745
    */
1746
    b= (my_off_t*) queue.root[1];
1747
    /* Create a new element in a local (automatic) buffer. */
1748
    new_huff_el= element_buffer + i;
1749
    /* The new element gets the sum of the two least incidence elements. */
1750
    new_huff_el->count= *a + *b;
1751
    /*
1752
      The Huffman algorithm assigns another bit to the code for a byte
1753
      every time that bytes incidence is combined (directly or indirectly)
1754
      to a new element as one of the two least incidence elements.
1755
      This means that one more bit per incidence of that byte is required
1756
      in the resulting file. So we add the new combined incidence as the
1757
      number of bits by which the result grows.
1758
    */
1759
    bits_packed+=(uint) (new_huff_el->count & 7);
1760
    bytes_packed+=new_huff_el->count/8;
1761
    /*
1762
      Replace the copied top element by the new element and re-order the
1763
      queue. This successively replaces the references to counts by
1764
      references to HUFF_ELEMENTs.
1765
    */
1766
    queue.root[1]=(uchar*) new_huff_el;
1767
    queue_replaced(&queue);
1768
  }
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
1769
  return(bytes_packed+(bits_packed+7)/8);
1 by brian
clean slate
1770
}
1771
1772
1773
	/* Remove trees that don't give any compression */
1774
1775
static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1776
{
1777
  uint k,tree_number;
1778
  HUFF_COUNTS count,*i,*j,*last_count;
1779
1780
  last_count=huff_counts+trees;
1781
  for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1782
  {
1783
    if (!i->tree->tree_number)
1784
    {
1785
      i->tree->tree_number= ++tree_number;
1786
      if (i->tree_buff)
1787
	continue;			/* Don't join intervall */
1788
      for (j=i+1 ; j < last_count ; j++)
1789
      {
1790
	if (! j->tree->tree_number && ! j->tree_buff)
1791
	{
1792
	  for (k=0 ; k < 256 ; k++)
1793
	    count.counts[k]=i->counts[k]+j->counts[k];
1794
	  if (calc_packed_length(&count,1) <=
1795
	      i->tree->bytes_packed + j->tree->bytes_packed+
1796
	      i->tree->tree_pack_length+j->tree->tree_pack_length+
1797
	      ALLOWED_JOIN_DIFF)
1798
	  {
1799
	    memcpy_fixed((uchar*) i->counts,(uchar*) count.counts,
1800
			 sizeof(count.counts[0])*256);
1801
	    my_free((uchar*) j->tree->element_buffer,MYF(0));
1802
	    j->tree->element_buffer=0;
1803
	    j->tree=i->tree;
1804
	    bmove((uchar*) i->counts,(uchar*) count.counts,
1805
		  sizeof(count.counts[0])*256);
1806
	    if (make_huff_tree(i->tree,i))
1807
	      return (uint) -1;
1808
	  }
1809
	}
1810
      }
1811
    }
1812
  }
1813
  if (verbose)
1814
    VOID(printf("Original trees:  %d  After join: %d\n", trees, tree_number));
1815
  return tree_number;			/* Return trees left */
1816
}
1817
1818
1819
/*
1820
  Fill in huff_tree encode tables.
1821
1822
  SYNOPSIS
1823
    make_huff_decode_table()
1824
    huff_tree               An array of HUFF_TREE which are to be encoded.
1825
    trees                   The number of HUFF_TREE in the array.
1826
1827
  RETURN
1828
    0           success
1829
    != 0        error
1830
*/
1831
1832
static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1833
{
1834
  uint elements;
1835
  for ( ; trees-- ; huff_tree++)
1836
  {
1837
    if (huff_tree->tree_number > 0)
1838
    {
1839
      elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1840
      if (!(huff_tree->code =
151 by Brian Aker
Ulonglong to uint64_t
1841
            (uint64_t*) my_malloc(elements*
1842
                                   (sizeof(uint64_t) + sizeof(uchar)),
1 by brian
clean slate
1843
                                   MYF(MY_WME | MY_ZEROFILL))))
1844
	return 1;
1845
      huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1846
      make_traverse_code_tree(huff_tree, huff_tree->root,
151 by Brian Aker
Ulonglong to uint64_t
1847
                              8 * sizeof(uint64_t), 0LL);
1 by brian
clean slate
1848
    }
1849
  }
1850
  return 0;
1851
}
1852
1853
1854
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1855
				    HUFF_ELEMENT *element,
151 by Brian Aker
Ulonglong to uint64_t
1856
				    uint size, uint64_t code)
1 by brian
clean slate
1857
{
1858
  uint chr;
1859
  if (!element->a.leaf.null)
1860
  {
1861
    chr=element->a.leaf.element_nr;
151 by Brian Aker
Ulonglong to uint64_t
1862
    huff_tree->code_len[chr]= (uchar) (8 * sizeof(uint64_t) - size);
1 by brian
clean slate
1863
    huff_tree->code[chr]= (code >> size);
151 by Brian Aker
Ulonglong to uint64_t
1864
    if (huff_tree->height < 8 * sizeof(uint64_t) - size)
1865
        huff_tree->height= 8 * sizeof(uint64_t) - size;
1 by brian
clean slate
1866
  }
1867
  else
1868
  {
1869
    size--;
1870
    make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1871
    make_traverse_code_tree(huff_tree, element->a.nod.right, size,
151 by Brian Aker
Ulonglong to uint64_t
1872
			    code + (((uint64_t) 1) << size));
1 by brian
clean slate
1873
  }
1874
  return;
1875
}
1876
1877
1878
/*
1879
  Convert a value into binary digits.
1880
1881
  SYNOPSIS
1882
    bindigits()
1883
    value                       The value.
1884
    length                      The number of low order bits to convert.
1885
1886
  NOTE
1887
    The result string is in static storage. It is reused on every call.
1888
    So you cannot use it twice in one expression.
1889
1890
  RETURN
1891
    A pointer to a static NUL-terminated string.
1892
 */
1893
151 by Brian Aker
Ulonglong to uint64_t
1894
static char *bindigits(uint64_t value, uint bits)
1 by brian
clean slate
1895
{
1896
  static char digits[72];
1897
  char *ptr= digits;
1898
  uint idx= bits;
1899
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
1900
  assert(idx < sizeof(digits));
1 by brian
clean slate
1901
  while (idx)
1902
    *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
1903
  *ptr= '\0';
1904
  return digits;
1905
}
1906
1907
1908
/*
1909
  Convert a value into hexadecimal digits.
1910
1911
  SYNOPSIS
1912
    hexdigits()
1913
    value                       The value.
1914
1915
  NOTE
1916
    The result string is in static storage. It is reused on every call.
1917
    So you cannot use it twice in one expression.
1918
1919
  RETURN
1920
    A pointer to a static NUL-terminated string.
1921
 */
1922
151 by Brian Aker
Ulonglong to uint64_t
1923
static char *hexdigits(uint64_t value)
1 by brian
clean slate
1924
{
1925
  static char digits[20];
1926
  char *ptr= digits;
1927
  uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
1928
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
1929
  assert(idx < sizeof(digits));
1 by brian
clean slate
1930
  while (idx)
1931
  {
1932
    if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
1933
      *(ptr - 1)+= 'a' - '9' - 1;
1934
  }
1935
  *ptr= '\0';
1936
  return digits;
1937
}
1938
1939
1940
	/* Write header to new packed data file */
1941
1942
static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
1943
			my_off_t tot_elements,my_off_t filelength)
1944
{
1945
  uchar *buff= (uchar*) file_buffer.pos;
1946
212.6.1 by Mats Kindahl
Replacing all bzero() calls with memset() calls and removing the bzero.c file.
1947
  memset(buff, 0, HEAD_LENGTH);
1 by brian
clean slate
1948
  memcpy_fixed(buff,myisam_pack_file_magic,4);
1949
  int4store(buff+4,head_length);
1950
  int4store(buff+8, mrg->min_pack_length);
1951
  int4store(buff+12,mrg->max_pack_length);
1952
  int4store(buff+16,tot_elements);
1953
  int4store(buff+20,intervall_length);
1954
  int2store(buff+24,trees);
1955
  buff[26]=(char) mrg->ref_length;
1956
	/* Save record pointer length */
151 by Brian Aker
Ulonglong to uint64_t
1957
  buff[27]= (uchar) mi_get_pointer_length((uint64_t) filelength,2);
1 by brian
clean slate
1958
  if (test_only)
1959
    return 0;
1960
  VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
1961
  return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
1962
		  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
1963
}
1964
1965
	/* Write fieldinfo to new packed file */
1966
1967
static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
1968
{
1969
  register uint i;
1970
  uint huff_tree_bits;
1971
  huff_tree_bits=max_bit(trees ? trees-1 : 0);
1972
1973
  if (verbose >= 2)
1974
  {
1975
    VOID(printf("\n"));
1976
    VOID(printf("column types:\n"));
1977
    VOID(printf("FIELD_NORMAL          0\n"));
1978
    VOID(printf("FIELD_SKIP_ENDSPACE   1\n"));
1979
    VOID(printf("FIELD_SKIP_PRESPACE   2\n"));
1980
    VOID(printf("FIELD_SKIP_ZERO       3\n"));
1981
    VOID(printf("FIELD_BLOB            4\n"));
1982
    VOID(printf("FIELD_CONSTANT        5\n"));
1983
    VOID(printf("FIELD_INTERVALL       6\n"));
1984
    VOID(printf("FIELD_ZERO            7\n"));
1985
    VOID(printf("FIELD_VARCHAR         8\n"));
1986
    VOID(printf("FIELD_CHECK           9\n"));
1987
    VOID(printf("\n"));
1988
    VOID(printf("pack type as a set of flags:\n"));
1989
    VOID(printf("PACK_TYPE_SELECTED      1\n"));
1990
    VOID(printf("PACK_TYPE_SPACE_FIELDS  2\n"));
1991
    VOID(printf("PACK_TYPE_ZERO_FILL     4\n"));
1992
    VOID(printf("\n"));
1993
  }
1994
  for (i=0 ; i++ < fields ; counts++)
1995
  {
151 by Brian Aker
Ulonglong to uint64_t
1996
    write_bits((uint64_t) (int) counts->field_type, 5);
1 by brian
clean slate
1997
    write_bits(counts->pack_type,6);
1998
    if (counts->pack_type & PACK_TYPE_ZERO_FILL)
1999
      write_bits(counts->max_zero_fill,5);
2000
    else
2001
      write_bits(counts->length_bits,5);
151 by Brian Aker
Ulonglong to uint64_t
2002
    write_bits((uint64_t) counts->tree->tree_number - 1, huff_tree_bits);
1 by brian
clean slate
2003
    if (verbose >= 2)
2004
      VOID(printf("column: %3u  type: %2u  pack: %2u  zero: %4u  lbits: %2u  "
2005
                  "tree: %2u  length: %4u\n", i , counts->field_type,
2006
                  counts->pack_type, counts->max_zero_fill, counts->length_bits,
2007
                  counts->tree->tree_number, counts->field_length));
2008
  }
2009
  flush_bits();
2010
  return;
2011
}
2012
2013
	/* Write all huff_trees to new datafile. Return tot count of
2014
	   elements in all trees
2015
	   Returns 0 on error */
2016
2017
static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2018
{
2019
  uint i,int_length;
2020
  uint tree_no;
2021
  uint codes;
2022
  uint errors= 0;
2023
  uint *packed_tree,*offset,length;
2024
  my_off_t elements;
2025
2026
  /* Find the highest number of elements in the trees. */
2027
  for (i=length=0 ; i < trees ; i++)
2028
    if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2029
      length=huff_tree[i].elements;
2030
  /*
2031
    Allocate a buffer for packing a decode tree. Two numbers per element
2032
    (left child and right child).
2033
  */
2034
  if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2035
  {
2036
    my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2037
    return 0;
2038
  }
2039
2040
  if (verbose >= 2)
2041
    VOID(printf("\n"));
2042
  tree_no= 0;
2043
  intervall_length=0;
2044
  for (elements=0; trees-- ; huff_tree++)
2045
  {
2046
    /* Skip columns that have been joined with other columns. */
2047
    if (huff_tree->tree_number == 0)
2048
      continue;				/* Deleted tree */
2049
    tree_no++;
2050
    if (verbose >= 3)
2051
      VOID(printf("\n"));
2052
    /* Count the total number of elements (byte codes or column values). */
2053
    elements+=huff_tree->elements;
2054
    huff_tree->max_offset=2;
2055
    /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2056
    if (huff_tree->elements <= 1)
2057
      offset=packed_tree;
2058
    else
2059
      offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2060
2061
    /* This should be the same as 'length' above. */
2062
    huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2063
2064
    /*
2065
      Since we check this during collecting the distinct column values,
2066
      this should never happen.
2067
    */
2068
    if (huff_tree->max_offset >= IS_OFFSET)
2069
    {				/* This should be impossible */
2070
      VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2071
                   huff_tree->max_offset));
2072
      my_afree((uchar*) packed_tree);
2073
      return 0;
2074
    }
2075
2076
    if (!huff_tree->counts->tree_buff)
2077
    {
2078
      /* We do a byte compression on this column. Mark with bit 0. */
2079
      write_bits(0,1);
2080
      write_bits(huff_tree->min_chr,8);
2081
      write_bits(huff_tree->elements,9);
2082
      write_bits(huff_tree->char_bits,5);
2083
      write_bits(huff_tree->offset_bits,5);
2084
      int_length=0;
2085
    }
2086
    else
2087
    {
2088
      int_length=(uint) (huff_tree->counts->tree_pos -
2089
			 huff_tree->counts->tree_buff);
2090
      /* We have distinct column values for this column. Mark with bit 1. */
2091
      write_bits(1,1);
2092
      write_bits(huff_tree->elements,15);
2093
      write_bits(int_length,16);
2094
      write_bits(huff_tree->char_bits,5);
2095
      write_bits(huff_tree->offset_bits,5);
2096
      intervall_length+=int_length;
2097
    }
2098
    if (verbose >= 2)
2099
      VOID(printf("tree: %2u  elements: %4u  char_bits: %2u  offset_bits: %2u  "
2100
                  "%s: %5u  codelen: %2u\n", tree_no, huff_tree->elements,
2101
                  huff_tree->char_bits, huff_tree->offset_bits,
2102
                  huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2103
                  huff_tree->counts->tree_buff ? int_length :
2104
                  huff_tree->min_chr, huff_tree->height));
2105
2106
    /* Check that the code tree length matches the element count. */
2107
    length=(uint) (offset-packed_tree);
2108
    if (length != huff_tree->elements*2-2)
2109
    {
2110
      VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2111
                   length, huff_tree->elements * 2 - 2));
2112
      errors++;
2113
      break;
2114
    }
2115
2116
    for (i=0 ; i < length ; i++)
2117
    {
2118
      if (packed_tree[i] & IS_OFFSET)
2119
	write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2120
		   huff_tree->offset_bits+1);
2121
      else
2122
	write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2123
      if (verbose >= 3)
2124
        VOID(printf("tree[0x%04x]: %s0x%04x\n",
2125
                    i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2126
                    (packed_tree[i] & IS_OFFSET) ?
2127
                    packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2128
    }
2129
    flush_bits();
2130
2131
    /*
2132
      Display coding tables and check their correctness.
2133
    */
2134
    codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2135
    for (i= 0; i < codes; i++)
2136
    {
151 by Brian Aker
Ulonglong to uint64_t
2137
      uint64_t code;
1 by brian
clean slate
2138
      uint bits;
2139
      uint len;
2140
      uint idx;
2141
2142
      if (! (len= huff_tree->code_len[i]))
2143
        continue;
2144
      if (verbose >= 3)
2145
        VOID(printf("code[0x%04x]:      0x%s  bits: %2u  bin: %s\n", i,
2146
                    hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2147
                    bindigits(huff_tree->code[i], huff_tree->code_len[i])));
2148
2149
      /* Check that the encode table decodes correctly. */
2150
      code= 0;
2151
      bits= 0;
2152
      idx= 0;
2153
      for (;;)
2154
      {
2155
        if (! len)
2156
        {
2157
          VOID(fflush(stdout));
2158
          VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2159
                       hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
2160
          errors++;
2161
          break;
2162
        }
2163
        code<<= 1;
2164
        code|= (huff_tree->code[i] >> (--len)) & 1;
2165
        bits++;
2166
        if (bits > 8 * sizeof(code))
2167
        {
2168
          VOID(fflush(stdout));
2169
          VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2170
                       bits, (uint) (8 * sizeof(code))));
2171
          errors++;
2172
          break;
2173
        }
2174
        idx+= (uint) code & 1;
2175
        if (idx >= length)
2176
        {
2177
          VOID(fflush(stdout));
2178
          VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2179
                       idx, length));
2180
          errors++;
2181
          break;
2182
        }
2183
        if (packed_tree[idx] & IS_OFFSET)
2184
          idx+= packed_tree[idx] & ~IS_OFFSET;
2185
        else
2186
          break; /* Hit a leaf. This contains the result value. */
2187
      }
2188
      if (errors)
2189
        break;
2190
2191
      if (packed_tree[idx] != i)
2192
      {
2193
        VOID(fflush(stdout));
2194
        VOID(fprintf(stderr, "error: decoded value 0x%04x  should be: 0x%04x\n",
2195
                     packed_tree[idx], i));
2196
        errors++;
2197
        break;
2198
      }
2199
    } /*end for (codes)*/
2200
    if (errors)
2201
      break;
2202
2203
    /* Write column values in case of distinct column value compression. */
2204
    if (huff_tree->counts->tree_buff)
2205
    {
2206
      for (i=0 ; i < int_length ; i++)
2207
      {
151 by Brian Aker
Ulonglong to uint64_t
2208
 	write_bits((uint64_t) (uchar) huff_tree->counts->tree_buff[i], 8);
1 by brian
clean slate
2209
        if (verbose >= 3)
2210
          VOID(printf("column_values[0x%04x]: 0x%02x\n",
2211
                      i, (uchar) huff_tree->counts->tree_buff[i]));
2212
      }
2213
    }
2214
    flush_bits();
2215
  }
2216
  if (verbose >= 2)
2217
    VOID(printf("\n"));
2218
  my_afree((uchar*) packed_tree);
2219
  if (errors)
2220
  {
2221
    VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
2222
    return 0;
2223
  }
2224
  return elements;
2225
}
2226
2227
2228
static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2229
				   uint *offset)
2230
{
2231
  uint *prev_offset;
2232
2233
  prev_offset= offset;
2234
  /*
2235
    'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2236
    then there is no left child and, hence no right child either. This
2237
    is a property of a binary tree. An element is either a node with two
2238
    childs, or a leaf without childs.
2239
2240
    The current element is always a node with two childs. Go left first.
2241
  */
2242
  if (!element->a.nod.left->a.leaf.null)
2243
  {
2244
    /* Store the byte code or the index of the column value. */
2245
    prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2246
    offset+=2;
2247
  }
2248
  else
2249
  {
2250
    /*
2251
      Recursively traverse the tree to the left. Mark it as an offset to
2252
      another tree node (in contrast to a byte code or column value index).
2253
    */
2254
    prev_offset[0]= IS_OFFSET+2;
2255
    offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2256
  }
2257
2258
  /* Now, check the right child. */
2259
  if (!element->a.nod.right->a.leaf.null)
2260
  {
2261
    /* Store the byte code or the index of the column value. */
2262
    prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2263
    return offset;
2264
  }
2265
  else
2266
  {
2267
    /*
2268
      Recursively traverse the tree to the right. Mark it as an offset to
2269
      another tree node (in contrast to a byte code or column value index).
2270
    */
2271
    uint temp=(uint) (offset-prev_offset-1);
2272
    prev_offset[1]= IS_OFFSET+ temp;
2273
    if (huff_tree->max_offset < temp)
2274
      huff_tree->max_offset = temp;
2275
    return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2276
  }
2277
}
2278
2279
	/* Get number of bits neaded to represent value */
2280
2281
static uint max_bit(register uint value)
2282
{
2283
  register uint power=1;
2284
2285
  while ((value>>=1))
2286
    power++;
2287
  return (power);
2288
}
2289
2290
2291
static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2292
{
2293
  int error;
2294
  uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length,
2295
    intervall,field_length,max_pack_length,pack_blob_length;
2296
  my_off_t record_count;
2297
  char llbuf[32];
2298
  ulong length,pack_length;
2299
  uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2300
  HUFF_COUNTS *count,*end_count;
2301
  HUFF_TREE *tree;
2302
  MI_INFO *isam_file=mrg->file[0];
2303
  uint pack_version= (uint) isam_file->s->pack.version;
2304
2305
  /* Allocate a buffer for the records (excluding blobs). */
2306
  if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2307
    return -1;
2308
2309
  end_count=huff_counts+isam_file->s->base.fields;
2310
  min_record_length= (uint) ~0;
2311
  max_record_length=0;
2312
2313
  /*
2314
    Calculate the maximum number of bits required to pack the records.
2315
    Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2316
    The tree height determines the maximum number of bits per value.
2317
    Some fields skip leading or trailing spaces or zeroes. The skipped
2318
    number of bytes is encoded by 'length_bits' bits.
2319
    Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2320
    and varchar get a leading 0 bit.
2321
  */
2322
  for (i=max_calc_length=0 ; i < isam_file->s->base.fields ; i++)
2323
  {
2324
    if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2325
      huff_counts[i].max_zero_fill=0;
2326
    if (huff_counts[i].field_type == FIELD_CONSTANT ||
2327
	huff_counts[i].field_type == FIELD_ZERO ||
2328
	huff_counts[i].field_type == FIELD_CHECK)
2329
      continue;
2330
    if (huff_counts[i].field_type == FIELD_INTERVALL)
2331
      max_calc_length+=huff_counts[i].tree->height;
2332
    else if (huff_counts[i].field_type == FIELD_BLOB ||
2333
	     huff_counts[i].field_type == FIELD_VARCHAR)
2334
      max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2335
    else
2336
      max_calc_length+=
2337
	(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2338
	  huff_counts[i].tree->height+huff_counts[i].length_bits;
2339
  }
2340
  max_calc_length= (max_calc_length + 7) / 8;
2341
  pack_ref_length= calc_pack_length(pack_version, max_calc_length);
2342
  record_count=0;
2343
  /* 'max_blob_length' is the max length of all blobs of a record. */
2344
  pack_blob_length= isam_file->s->base.blobs ?
2345
                    calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2346
  max_pack_length=pack_ref_length+pack_blob_length;
2347
2348
  mrg_reset(mrg);
2349
  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2350
  {
2351
    ulong tot_blob_length=0;
2352
    if (! error)
2353
    {
2354
      if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
2355
	break;
2356
      record_pos= (uchar*) file_buffer.pos;
2357
      file_buffer.pos+=max_pack_length;
2358
      for (start_pos=record, count= huff_counts; count < end_count ; count++)
2359
      {
2360
	end_pos=start_pos+(field_length=count->field_length);
2361
	tree=count->tree;
2362
2363
        /* Check if the column contains spaces only. */
2364
	if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2365
	{
2366
	  for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2367
	  if (pos == end_pos)
2368
	  {
2369
	    write_bits(1,1);
2370
	    start_pos=end_pos;
2371
	    continue;
2372
	  }
2373
	  write_bits(0,1);
2374
	}
2375
	end_pos-=count->max_zero_fill;
2376
	field_length-=count->max_zero_fill;
2377
2378
	switch (count->field_type) {
2379
	case FIELD_SKIP_ZERO:
2380
	  if (!memcmp((uchar*) start_pos,zero_string,field_length))
2381
	  {
2382
	    write_bits(1,1);
2383
	    start_pos=end_pos;
2384
	    break;
2385
	  }
2386
	  write_bits(0,1);
2387
	  /* Fall through */
2388
	case FIELD_NORMAL:
2389
	  for ( ; start_pos < end_pos ; start_pos++)
2390
          {
2391
	    write_bits(tree->code[(uchar) *start_pos],
2392
		       (uint) tree->code_len[(uchar) *start_pos]);
2393
          }
2394
	  break;
2395
	case FIELD_SKIP_ENDSPACE:
2396
	  for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2397
	  length= (ulong) (end_pos - pos);
2398
	  if (count->pack_type & PACK_TYPE_SELECTED)
2399
	  {
2400
	    if (length > count->min_space)
2401
	    {
2402
	      write_bits(1,1);
2403
	      write_bits(length,count->length_bits);
2404
	    }
2405
	    else
2406
	    {
2407
	      write_bits(0,1);
2408
	      pos=end_pos;
2409
	    }
2410
	  }
2411
	  else
2412
          {
2413
	    write_bits(length,count->length_bits);
2414
          }
2415
          /* Encode all significant bytes. */
2416
	  for ( ; start_pos < pos ; start_pos++)
2417
          {
2418
	    write_bits(tree->code[(uchar) *start_pos],
2419
		       (uint) tree->code_len[(uchar) *start_pos]);
2420
          }
2421
	  start_pos=end_pos;
2422
	  break;
2423
	case FIELD_SKIP_PRESPACE:
2424
	  for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2425
          length= (ulong) (pos - start_pos);
2426
	  if (count->pack_type & PACK_TYPE_SELECTED)
2427
	  {
2428
	    if (length > count->min_space)
2429
	    {
2430
	      write_bits(1,1);
2431
	      write_bits(length,count->length_bits);
2432
	    }
2433
	    else
2434
	    {
2435
	      pos=start_pos;
2436
	      write_bits(0,1);
2437
	    }
2438
	  }
2439
	  else
2440
          {
2441
	    write_bits(length,count->length_bits);
2442
          }
2443
          /* Encode all significant bytes. */
2444
	  for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2445
          {
2446
	    write_bits(tree->code[(uchar) *start_pos],
2447
		       (uint) tree->code_len[(uchar) *start_pos]);
2448
          }
2449
	  break;
2450
	case FIELD_CONSTANT:
2451
	case FIELD_ZERO:
2452
	case FIELD_CHECK:
2453
	  start_pos=end_pos;
2454
	  break;
2455
	case FIELD_INTERVALL:
2456
	  global_count=count;
2457
	  pos=(uchar*) tree_search(&count->int_tree, start_pos,
2458
				  count->int_tree.custom_arg);
2459
	  intervall=(uint) (pos - count->tree_buff)/field_length;
2460
	  write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2461
	  start_pos=end_pos;
2462
	  break;
2463
	case FIELD_BLOB:
2464
	{
2465
	  ulong blob_length=_mi_calc_blob_length(field_length-
2466
						 portable_sizeof_char_ptr,
2467
						 start_pos);
2468
          /* Empty blobs are encoded with a single 1 bit. */
2469
	  if (!blob_length)
2470
	  {
2471
            write_bits(1,1);
2472
	  }
2473
	  else
2474
	  {
2475
	    uchar *blob,*blob_end;
2476
	    write_bits(0,1);
2477
            /* Write the blob length. */
2478
	    write_bits(blob_length,count->length_bits);
2479
	    memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
2480
			 sizeof(char*));
2481
	    blob_end=blob+blob_length;
2482
            /* Encode the blob bytes. */
2483
	    for ( ; blob < blob_end ; blob++)
2484
            {
2485
	      write_bits(tree->code[(uchar) *blob],
2486
			 (uint) tree->code_len[(uchar) *blob]);
2487
            }
2488
	    tot_blob_length+=blob_length;
2489
	  }
2490
	  start_pos= end_pos;
2491
	  break;
2492
	}
2493
	case FIELD_VARCHAR:
2494
	{
2495
          uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2496
          ulong col_length= (var_pack_length == 1 ?
2497
                             (uint) *(uchar*) start_pos :
2498
                             uint2korr(start_pos));
2499
          /* Empty varchar are encoded with a single 1 bit. */
2500
	  if (!col_length)
2501
	  {
2502
	    write_bits(1,1);			/* Empty varchar */
2503
	  }
2504
	  else
2505
	  {
2506
	    uchar *end= start_pos + var_pack_length + col_length;
2507
	    write_bits(0,1);
2508
            /* Write the varchar length. */
2509
	    write_bits(col_length,count->length_bits);
2510
            /* Encode the varchar bytes. */
2511
	    for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2512
            {
2513
	      write_bits(tree->code[(uchar) *start_pos],
2514
			 (uint) tree->code_len[(uchar) *start_pos]);
2515
            }
2516
	  }
2517
	  start_pos= end_pos;
2518
	  break;
2519
	}
2520
	case FIELD_LAST:
2521
        case FIELD_enum_val_count:
2522
	  abort();				/* Impossible */
2523
	}
2524
	start_pos+=count->max_zero_fill;
2525
      }
2526
      flush_bits();
2527
      length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
2528
      pack_length= save_pack_length(pack_version, record_pos, length);
2529
      if (pack_blob_length)
2530
	pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2531
	                               tot_blob_length);
2532
      /* Correct file buffer if the header was smaller */
2533
      if (pack_length != max_pack_length)
2534
      {
2535
	bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2536
	file_buffer.pos-= (max_pack_length-pack_length);
2537
      }
2538
      if (length < (ulong) min_record_length)
2539
	min_record_length=(uint) length;
2540
      if (length > (ulong) max_record_length)
2541
	max_record_length=(uint) length;
2542
      record_count++;
2543
      if (write_loop && record_count % WRITE_COUNT == 0)
2544
      {
2545
	VOID(printf("%lu\r", (ulong) record_count));
2546
        VOID(fflush(stdout));
2547
      }
2548
    }
2549
    else if (error != HA_ERR_RECORD_DELETED)
2550
      break;
2551
  }
2552
  if (error == HA_ERR_END_OF_FILE)
2553
    error=0;
2554
  else
2555
  {
2556
    VOID(fprintf(stderr, "%s: Got error %d reading records\n",
2557
                 my_progname, error));
2558
  }
2559
  if (verbose >= 2)
152 by Brian Aker
longlong replacement
2560
    VOID(printf("wrote %s records.\n", llstr((int64_t) record_count, llbuf)));
1 by brian
clean slate
2561
2562
  my_afree((uchar*) record);
2563
  mrg->ref_length=max_pack_length;
2564
  mrg->min_pack_length=max_record_length ? min_record_length : 0;
2565
  mrg->max_pack_length=max_record_length;
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
2566
  return(error || error_on_write || flush_buffer(~(ulong) 0));
1 by brian
clean slate
2567
}
2568
2569
2570
static char *make_new_name(char *new_name, char *old_name)
2571
{
2572
  return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2573
}
2574
2575
static char *make_old_name(char *new_name, char *old_name)
2576
{
2577
  return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2578
}
2579
2580
	/* rutines for bit writing buffer */
2581
154 by Brian Aker
Removed oddball types in my_global.h
2582
static void init_file_buffer(File file, bool read_buffer)
1 by brian
clean slate
2583
{
2584
  file_buffer.file=file;
2585
  file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2586
					 MYF(MY_WME));
2587
  file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2588
  file_buffer.pos_in_file=0;
2589
  error_on_write=0;
2590
  if (read_buffer)
2591
  {
2592
2593
    file_buffer.pos=file_buffer.end;
2594
    file_buffer.bits=0;
2595
  }
2596
  else
2597
  {
2598
    file_buffer.pos=file_buffer.buffer;
2599
    file_buffer.bits=BITS_SAVED;
2600
  }
2601
  file_buffer.bitbucket= 0;
2602
}
2603
2604
2605
static int flush_buffer(ulong neaded_length)
2606
{
2607
  ulong length;
2608
2609
  /*
2610
    file_buffer.end is 8 bytes lower than the real end of the buffer.
2611
    This is done so that the end-of-buffer condition does not need to be
2612
    checked for every byte (see write_bits()). Consequently,
2613
    file_buffer.pos can become greater than file_buffer.end. The
2614
    algorithms in the other functions ensure that there will never be
2615
    more than 8 bytes written to the buffer without an end-of-buffer
2616
    check. So the buffer cannot be overrun. But we need to check for the
2617
    near-to-buffer-end condition to avoid a negative result, which is
2618
    casted to unsigned and thus becomes giant.
2619
  */
2620
  if ((file_buffer.pos < file_buffer.end) &&
2621
      ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2622
    return 0;
2623
  length=(ulong) (file_buffer.pos-file_buffer.buffer);
2624
  file_buffer.pos=file_buffer.buffer;
2625
  file_buffer.pos_in_file+=length;
2626
  if (test_only)
2627
    return 0;
2628
  if (error_on_write|| my_write(file_buffer.file,
2629
				(const uchar*) file_buffer.buffer,
2630
				length,
2631
				MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2632
  {
2633
    error_on_write=1;
2634
    return 1;
2635
  }
2636
2637
  if (neaded_length != ~(ulong) 0 &&
2638
      (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2639
  {
2640
    char *tmp;
2641
    neaded_length+=256;				/* some margin */
2642
    tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
2643
    if (!tmp)
2644
      return 1;
2645
    file_buffer.pos= ((uchar*) tmp +
2646
                      (ulong) (file_buffer.pos - file_buffer.buffer));
2647
    file_buffer.buffer= (uchar*) tmp;
2648
    file_buffer.end= (uchar*) (tmp+neaded_length-8);
2649
  }
2650
  return 0;
2651
}
2652
2653
2654
static void end_file_buffer(void)
2655
{
2656
  my_free((uchar*) file_buffer.buffer,MYF(0));
2657
}
2658
2659
	/* output `bits` low bits of `value' */
2660
151 by Brian Aker
Ulonglong to uint64_t
2661
static void write_bits(register uint64_t value, register uint bits)
1 by brian
clean slate
2662
{
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
2663
  assert(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
1 by brian
clean slate
2664
              (bits == 8 * sizeof(value)));
2665
2666
  if ((file_buffer.bits-= (int) bits) >= 0)
2667
  {
2668
    file_buffer.bitbucket|= value << file_buffer.bits;
2669
  }
2670
  else
2671
  {
151 by Brian Aker
Ulonglong to uint64_t
2672
    register uint64_t bit_buffer;
1 by brian
clean slate
2673
    bits= (uint) -file_buffer.bits;
2674
    bit_buffer= (file_buffer.bitbucket |
2675
                 ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2676
#if BITS_SAVED == 64
2677
    *file_buffer.pos++= (uchar) (bit_buffer >> 56);
2678
    *file_buffer.pos++= (uchar) (bit_buffer >> 48);
2679
    *file_buffer.pos++= (uchar) (bit_buffer >> 40);
2680
    *file_buffer.pos++= (uchar) (bit_buffer >> 32);
2681
#endif
2682
    *file_buffer.pos++= (uchar) (bit_buffer >> 24);
2683
    *file_buffer.pos++= (uchar) (bit_buffer >> 16);
2684
    *file_buffer.pos++= (uchar) (bit_buffer >> 8);
2685
    *file_buffer.pos++= (uchar) (bit_buffer);
2686
2687
    if (bits != 8 * sizeof(value))
151 by Brian Aker
Ulonglong to uint64_t
2688
      value&= (((uint64_t) 1) << bits) - 1;
1 by brian
clean slate
2689
    if (file_buffer.pos >= file_buffer.end)
2690
      VOID(flush_buffer(~ (ulong) 0));
2691
    file_buffer.bits=(int) (BITS_SAVED - bits);
2692
    file_buffer.bitbucket= value << (BITS_SAVED - bits);
2693
  }
2694
  return;
2695
}
2696
2697
	/* Flush bits in bit_buffer to buffer */
2698
2699
static void flush_bits(void)
2700
{
2701
  int bits;
151 by Brian Aker
Ulonglong to uint64_t
2702
  uint64_t bit_buffer;
1 by brian
clean slate
2703
2704
  bits= file_buffer.bits & ~7;
2705
  bit_buffer= file_buffer.bitbucket >> bits;
2706
  bits= BITS_SAVED - bits;
2707
  while (bits > 0)
2708
  {
2709
    bits-= 8;
2710
    *file_buffer.pos++= (uchar) (bit_buffer >> bits);
2711
  }
2712
  if (file_buffer.pos >= file_buffer.end)
2713
    VOID(flush_buffer(~ (ulong) 0));
2714
  file_buffer.bits= BITS_SAVED;
2715
  file_buffer.bitbucket= 0;
2716
}
2717
2718
2719
/****************************************************************************
2720
** functions to handle the joined files
2721
****************************************************************************/
2722
2723
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
2724
		      ha_checksum crc)
2725
{
2726
  MYISAM_SHARE *share=isam_file->s;
2727
  uint options=mi_uint2korr(share->state.header.options);
2728
  uint key;
2729
2730
  options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2731
  mi_int2store(share->state.header.options,options);
2732
2733
  share->state.state.data_file_length=new_length;
2734
  share->state.state.del=0;
2735
  share->state.state.empty=0;
2736
  share->state.dellink= HA_OFFSET_ERROR;
2737
  share->state.split=(ha_rows) mrg->records;
2738
  share->state.version=(ulong) time((time_t*) 0);
2739
  if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
2740
  {
2741
    /*
2742
      Some indexes are disabled, cannot use current key_file_length value
2743
      as an estimate of upper bound of index file size. Use packed data file 
2744
      size instead.
2745
    */
2746
    share->state.state.key_file_length= new_length;
2747
  }
2748
  /*
2749
    If there are no disabled indexes, keep key_file_length value from 
2750
    original file so "myisamchk -rq" can use this value (this is necessary 
2751
    because index size cannot be easily calculated for fulltext keys)
2752
  */
2753
  mi_clear_all_keys_active(share->state.key_map);
2754
  for (key=0 ; key < share->base.keys ; key++)
2755
    share->state.key_root[key]= HA_OFFSET_ERROR;
2756
  for (key=0 ; key < share->state.header.max_block_size_index ; key++)
2757
    share->state.key_del[key]= HA_OFFSET_ERROR;
2758
  isam_file->state->checksum=crc;       /* Save crc here */
2759
  share->changed=1;			/* Force write of header */
2760
  share->state.open_count=0;
2761
  share->global_changed=0;
30 by Brian Aker
Large file and ftruncate() support
2762
  (void)ftruncate(share->kfile, share->base.keystart);
1 by brian
clean slate
2763
  if (share->base.keys)
2764
    isamchk_neaded=1;
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
2765
  return(mi_state_info_write(share->kfile,&share->state,1+2));
1 by brian
clean slate
2766
}
2767
2768
2769
static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
2770
			  ha_checksum crc)
2771
{
2772
  MI_STATE_INFO state;
2773
  MI_INFO *isam_file=mrg->file[0];
2774
  uint options;
2775
2776
  state= isam_file->s->state;
2777
  options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
2778
	    HA_OPTION_READ_ONLY_DATA);
2779
  mi_int2store(state.header.options,options);
2780
  state.state.data_file_length=new_length;
2781
  state.state.del=0;
2782
  state.state.empty=0;
2783
  state.state.records=state.split=(ha_rows) mrg->records;
2784
  /* See comment above in save_state about key_file_length handling. */
2785
  if (mrg->src_file_has_indexes_disabled)
2786
  {
2787
    isam_file->s->state.state.key_file_length=
2788
      max(isam_file->s->state.state.key_file_length, new_length);
2789
  }
2790
  state.dellink= HA_OFFSET_ERROR;
2791
  state.version=(ulong) time((time_t*) 0);
2792
  mi_clear_all_keys_active(state.key_map);
2793
  state.state.checksum=crc;
2794
  if (isam_file->s->base.keys)
2795
    isamchk_neaded=1;
2796
  state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
51.1.84 by Jay Pipes
1st round removing DBUG_XXX symbols. Round 2 is custom debugging code layer...
2797
  return (mi_state_info_write(file,&state,1+2));
1 by brian
clean slate
2798
}
2799
2800
2801
/* reset for mrg_rrnd */
2802
2803
static void mrg_reset(PACK_MRG_INFO *mrg)
2804
{
2805
  if (mrg->current)
2806
  {
2807
    mi_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
2808
    mrg->current=0;
2809
  }
2810
}
2811
2812
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
2813
{
2814
  int error;
2815
  MI_INFO *isam_info;
2816
  my_off_t filepos;
2817
2818
  if (!info->current)
2819
  {
2820
    isam_info= *(info->current=info->file);
2821
    info->end=info->current+info->count;
2822
    mi_reset(isam_info);
2823
    mi_extra(isam_info, HA_EXTRA_CACHE, 0);
2824
    filepos=isam_info->s->pack.header_length;
2825
  }
2826
  else
2827
  {
2828
    isam_info= *info->current;
2829
    filepos= isam_info->nextpos;
2830
  }
2831
2832
  for (;;)
2833
  {
2834
    isam_info->update&= HA_STATE_CHANGED;
2835
    if (!(error=(*isam_info->s->read_rnd)(isam_info,(uchar*) buf,
2836
					  filepos, 1)) ||
2837
	error != HA_ERR_END_OF_FILE)
2838
      return (error);
2839
    mi_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
2840
    if (info->current+1 == info->end)
2841
      return(HA_ERR_END_OF_FILE);
2842
    info->current++;
2843
    isam_info= *info->current;
2844
    filepos=isam_info->s->pack.header_length;
2845
    mi_reset(isam_info);
2846
    mi_extra(isam_info,HA_EXTRA_CACHE, 0);
2847
  }
2848
}
2849
2850
2851
static int mrg_close(PACK_MRG_INFO *mrg)
2852
{
2853
  uint i;
2854
  int error=0;
2855
  for (i=0 ; i < mrg->count ; i++)
2856
    error|=mi_close(mrg->file[i]);
2857
  if (mrg->free_file)
2858
    my_free((uchar*) mrg->file,MYF(0));
2859
  return error;
2860
}