~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000-2006 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
14
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
15
16
/* Pack MyISAM file */
17
18
#ifndef USE_MY_FUNC
19
#define USE_MY_FUNC			/* We need at least my_malloc */
20
#endif
21
22
#include "myisamdef.h"
23
#include <queues.h>
24
#include <my_tree.h>
25
#include "mysys_err.h"
26
#ifdef MSDOS
27
#include <io.h>
28
#endif
29
#ifndef __GNU_LIBRARY__
30
#define __GNU_LIBRARY__			/* Skip warnings in getopt.h */
31
#endif
32
#include <my_getopt.h>
33
#include <assert.h>
34
35
#if SIZEOF_LONG_LONG > 4
36
#define BITS_SAVED 64
37
#else
38
#define BITS_SAVED 32
39
#endif
40
41
#define IS_OFFSET ((uint) 32768)	/* Bit if offset or char in tree */
42
#define HEAD_LENGTH	32
43
#define ALLOWED_JOIN_DIFF	256	/* Diff allowed to join trees */
44
45
#define DATA_TMP_EXT		".TMD"
46
#define OLD_EXT			".OLD"
47
#define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
48
49
struct st_file_buffer {
50
  File file;
51
  uchar *buffer,*pos,*end;
52
  my_off_t pos_in_file;
53
  int bits;
54
  ulonglong bitbucket;
55
};
56
57
struct st_huff_tree;
58
struct st_huff_element;
59
60
typedef struct st_huff_counts {
61
  uint	field_length,max_zero_fill;
62
  uint	pack_type;
63
  uint	max_end_space,max_pre_space,length_bits,min_space;
64
  ulong max_length;
65
  enum en_fieldtype field_type;
66
  struct st_huff_tree *tree;		/* Tree for field */
67
  my_off_t counts[256];
68
  my_off_t end_space[8];
69
  my_off_t pre_space[8];
70
  my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
71
  TREE int_tree;        /* Tree for detecting distinct column values. */
72
  uchar *tree_buff;      /* Column values, 'field_length' each. */
73
  uchar *tree_pos;       /* Points to end of column values in 'tree_buff'. */
74
} HUFF_COUNTS;
75
76
typedef struct st_huff_element HUFF_ELEMENT;
77
78
/*
79
  WARNING: It is crucial for the optimizations in calc_packed_length()
80
  that 'count' is the first element of 'HUFF_ELEMENT'.
81
*/
82
struct st_huff_element {
83
  my_off_t count;
84
  union un_element {
85
    struct st_nod {
86
      HUFF_ELEMENT *left,*right;
87
    } nod;
88
    struct st_leaf {
89
      HUFF_ELEMENT *null;
90
      uint	element_nr;		/* Number of element */
91
    } leaf;
92
  } a;
93
};
94
95
96
typedef struct st_huff_tree {
97
  HUFF_ELEMENT *root,*element_buffer;
98
  HUFF_COUNTS *counts;
99
  uint tree_number;
100
  uint elements;
101
  my_off_t bytes_packed;
102
  uint tree_pack_length;
103
  uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
104
  ulonglong *code;
105
  uchar *code_len;
106
} HUFF_TREE;
107
108
109
typedef struct st_isam_mrg {
110
  MI_INFO **file,**current,**end;
111
  uint free_file;
112
  uint count;
113
  uint	min_pack_length;		/* Theese is used by packed data */
114
  uint	max_pack_length;
115
  uint	ref_length;
116
  uint	max_blob_length;
117
  my_off_t records;
118
  /* true if at least one source file has at least one disabled index */
119
  my_bool src_file_has_indexes_disabled;
120
} PACK_MRG_INFO;
121
122
123
extern int main(int argc,char * *argv);
124
static void get_options(int *argc,char ***argv);
125
static MI_INFO *open_isam_file(char *name,int mode);
126
static my_bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
127
static int compress(PACK_MRG_INFO *file,char *join_name);
128
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records);
129
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
130
					   uint trees,
131
					   HUFF_COUNTS *huff_counts,
132
					   uint fields);
133
static int compare_tree(void* cmp_arg __attribute__((unused)),
134
			const uchar *s,const uchar *t);
135
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
136
static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
137
			 my_off_t records);
138
static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
139
			       uint max_space_length,my_off_t *space_counts,
140
			       my_off_t tot_space_count,
141
			       enum en_fieldtype field_type);
142
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
143
static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
144
static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
145
static int save_counts_in_queue(uchar *key,element_count count,
146
				    HUFF_TREE *tree);
147
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
148
static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
149
static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
150
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
151
				    HUFF_ELEMENT *element,uint size,
152
				    ulonglong code);
153
static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
154
			my_off_t tot_elements,my_off_t filelength);
155
static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
156
static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
157
static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
158
				       HUFF_ELEMENT *element,
159
				       uint *offset);
160
static uint max_bit(uint value);
161
static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
162
static char *make_new_name(char *new_name,char *old_name);
163
static char *make_old_name(char *new_name,char *old_name);
164
static void init_file_buffer(File file,pbool read_buffer);
165
static int flush_buffer(ulong neaded_length);
166
static void end_file_buffer(void);
167
static void write_bits(ulonglong value, uint bits);
168
static void flush_bits(void);
169
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
170
		      ha_checksum crc);
171
static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
172
			  ha_checksum crc);
173
static int mrg_close(PACK_MRG_INFO *mrg);
174
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
175
static void mrg_reset(PACK_MRG_INFO *mrg);
176
#if !defined(DBUG_OFF)
177
static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
178
static int fakecmp(my_off_t **count1, my_off_t **count2);
179
#endif
180
181
182
static int error_on_write=0,test_only=0,verbose=0,silent=0,
183
	   write_loop=0,force_pack=0, isamchk_neaded=0;
184
static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
185
static my_bool backup, opt_wait;
186
/*
187
  tree_buff_length is somewhat arbitrary. The bigger it is the better
188
  the chance to win in terms of compression factor. On the other hand,
189
  this table becomes part of the compressed file header. And its length
190
  is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
191
*/
192
static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
193
static char tmp_dir[FN_REFLEN]={0},*join_table;
194
static my_off_t intervall_length;
195
static ha_checksum glob_crc;
196
static struct st_file_buffer file_buffer;
197
static QUEUE queue;
198
static HUFF_COUNTS *global_count;
199
static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
200
static const char *load_default_groups[]= { "myisampack",0 };
201
202
	/* The main program */
203
204
int main(int argc, char **argv)
205
{
206
  int error,ok;
207
  PACK_MRG_INFO merge;
208
  char **default_argv;
209
  MY_INIT(argv[0]);
210
211
  load_defaults("my",load_default_groups,&argc,&argv);
212
  default_argv= argv;
213
  get_options(&argc,&argv);
214
215
  error=ok=isamchk_neaded=0;
216
  if (join_table)
217
  {						/* Join files into one */
218
    if (open_isam_files(&merge,argv,(uint) argc) ||
219
	compress(&merge,join_table))
220
      error=1;
221
  }
222
  else while (argc--)
223
  {
224
    MI_INFO *isam_file;
225
    if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
226
      error=1;
227
    else
228
    {
229
      merge.file= &isam_file;
230
      merge.current=0;
231
      merge.free_file=0;
232
      merge.count=1;
233
      if (compress(&merge,0))
234
	error=1;
235
      else
236
	ok=1;
237
    }
238
  }
239
  if (ok && isamchk_neaded && !silent)
240
    puts("Remember to run myisamchk -rq on compressed tables");
241
  VOID(fflush(stdout));
242
  VOID(fflush(stderr));
243
  free_defaults(default_argv);
244
  my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
245
  exit(error ? 2 : 0);
246
#ifndef _lint
247
  return 0;					/* No compiler warning */
248
#endif
249
}
250
251
enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
252
253
static struct my_option my_long_options[] =
254
{
255
#ifdef __NETWARE__
256
  {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
257
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
258
#endif
259
  {"backup", 'b', "Make a backup of the table as table_name.OLD.",
260
   (uchar**) &backup, (uchar**) &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
261
  {"character-sets-dir", OPT_CHARSETS_DIR_MP,
262
   "Directory where character sets are.", (uchar**) &charsets_dir,
263
   (uchar**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
264
  {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
265
   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
266
  {"force", 'f',
267
   "Force packing of table even if it gets bigger or if tempfile exists.",
268
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
269
  {"join", 'j',
270
   "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
271
   (uchar**) &join_table, (uchar**) &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
272
   0, 0, 0},
273
  {"help", '?', "Display this help and exit.",
274
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
275
  {"silent", 's', "Be more silent.",
276
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
277
  {"tmpdir", 'T', "Use temporary directory to store temporary table.",
278
   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
279
  {"test", 't', "Don't pack table, only test packing it.",
280
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
281
  {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
282
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
283
  {"version", 'V', "Output version information and exit.",
284
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
285
  {"wait", 'w', "Wait and retry if table is in use.", (uchar**) &opt_wait,
286
   (uchar**) &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
287
  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
288
};
289
290
#include <help_start.h>
291
292
static void print_version(void)
293
{
294
  VOID(printf("%s Ver 1.23 for %s on %s\n",
295
              my_progname, SYSTEM_TYPE, MACHINE_TYPE));
296
  NETWARE_SET_SCREEN_MODE(1);
297
}
298
299
300
static void usage(void)
301
{
302
  print_version();
303
  puts("Copyright (C) 2002 MySQL AB");
304
  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
305
  puts("and you are welcome to modify and redistribute it under the GPL license\n");
306
307
  puts("Pack a MyISAM-table to take much less space.");
308
  puts("Keys are not updated, you must run myisamchk -rq on the datafile");
309
  puts("afterwards to update the keys.");
310
  puts("You should give the .MYI file as the filename argument.");
311
312
  VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
313
  my_print_help(my_long_options);
314
  print_defaults("my", load_default_groups);
315
  my_print_variables(my_long_options);
316
}
317
318
#include <help_end.h>
319
320
static my_bool
321
get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
322
	       char *argument)
323
{
324
  uint length;
325
326
  switch(optid) {
327
#ifdef __NETWARE__
328
  case OPT_AUTO_CLOSE:
329
    setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
330
    break;
331
#endif
332
  case 'f':
333
    force_pack= 1;
334
    tmpfile_createflag= O_RDWR | O_TRUNC;
335
    break;
336
  case 's':
337
    write_loop= verbose= 0;
338
    silent= 1;
339
    break;
340
  case 't':
341
    test_only= 1;
342
    /* Avoid to reset 'verbose' if it was already set > 1. */
343
    if (! verbose)
344
      verbose= 1;
345
    break;
346
  case 'T':
347
    length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
348
    if (length != dirname_length(tmp_dir))
349
    {
350
      tmp_dir[length]=FN_LIBCHAR;
351
      tmp_dir[length+1]=0;
352
    }
353
    break;
354
  case 'v':
355
    verbose++; /* Allow for selecting the level of verbosity. */
356
    silent= 0;
357
    break;
358
  case '#':
359
    DBUG_PUSH(argument ? argument : "d:t:o");
360
    break;
361
  case 'V':
362
    print_version();
363
    exit(0);
364
  case 'I':
365
  case '?':
366
    usage();
367
    exit(0);
368
  }
369
  return 0;
370
}
371
372
	/* reads options */
373
	/* Initiates DEBUG - but no debugging here ! */
374
375
static void get_options(int *argc,char ***argv)
376
{
377
  int ho_error;
378
379
  my_progname= argv[0][0];
380
  if (isatty(fileno(stdout)))
381
    write_loop=1;
382
383
  if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
384
    exit(ho_error);
385
386
  if (!*argc)
387
  {
388
    usage();
389
    exit(1);
390
  }
391
  if (join_table)
392
  {
393
    backup=0;					/* Not needed */
394
    tmp_dir[0]=0;
395
  }
396
  return;
397
}
398
399
400
static MI_INFO *open_isam_file(char *name,int mode)
401
{
402
  MI_INFO *isam_file;
403
  MYISAM_SHARE *share;
404
  DBUG_ENTER("open_isam_file");
405
406
  if (!(isam_file=mi_open(name,mode,
407
			  (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
408
			   HA_OPEN_ABORT_IF_LOCKED))))
409
  {
410
    VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
411
    DBUG_RETURN(0);
412
  }
413
  share=isam_file->s;
414
  if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
415
  {
416
    if (!force_pack)
417
    {
418
      VOID(fprintf(stderr, "%s is already compressed\n", name));
419
      VOID(mi_close(isam_file));
420
      DBUG_RETURN(0);
421
    }
422
    if (verbose)
423
      puts("Recompressing already compressed table");
424
    share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
425
  }
426
  if (! force_pack && share->state.state.records != 0 &&
427
      (share->state.state.records <= 1 ||
428
       share->state.state.data_file_length < 1024))
429
  {
430
    VOID(fprintf(stderr, "%s is too small to compress\n", name));
431
    VOID(mi_close(isam_file));
432
    DBUG_RETURN(0);
433
  }
434
  VOID(mi_lock_database(isam_file,F_WRLCK));
435
  DBUG_RETURN(isam_file);
436
}
437
438
439
static my_bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
440
{
441
  uint i,j;
442
  mrg->count=0;
443
  mrg->current=0;
444
  mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
445
  mrg->free_file=1;
446
  mrg->src_file_has_indexes_disabled= 0;
447
  for (i=0; i < count ; i++)
448
  {
449
    if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
450
      goto error;
451
452
    mrg->src_file_has_indexes_disabled|=
453
      ! mi_is_all_keys_active(mrg->file[i]->s->state.key_map,
454
                              mrg->file[i]->s->base.keys);
455
  }
456
  /* Check that files are identical */
457
  for (j=0 ; j < count-1 ; j++)
458
  {
459
    MI_COLUMNDEF *m1,*m2,*end;
460
    if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
461
	mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
462
      goto diff_file;
463
    m1=mrg->file[j]->s->rec;
464
    end=m1+mrg->file[j]->s->base.fields;
465
    m2=mrg->file[j+1]->s->rec;
466
    for ( ; m1 != end ; m1++,m2++)
467
    {
468
      if (m1->type != m2->type || m1->length != m2->length)
469
	goto diff_file;
470
    }
471
  }
472
  mrg->count=count;
473
  return 0;
474
475
 diff_file:
476
  VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
477
               my_progname, names[j], names[j+1]));
478
 error:
479
  while (i--)
480
    mi_close(mrg->file[i]);
481
  my_free((uchar*) mrg->file,MYF(0));
482
  return 1;
483
}
484
485
486
static int compress(PACK_MRG_INFO *mrg,char *result_table)
487
{
488
  int error;
489
  File new_file,join_isam_file;
490
  MI_INFO *isam_file;
491
  MYISAM_SHARE *share;
492
  char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
493
  uint i,header_length,fields,trees,used_trees;
494
  my_off_t old_length,new_length,tot_elements;
495
  HUFF_COUNTS *huff_counts;
496
  HUFF_TREE *huff_trees;
497
  DBUG_ENTER("compress");
498
499
  isam_file=mrg->file[0];			/* Take this as an example */
500
  share=isam_file->s;
501
  new_file=join_isam_file= -1;
502
  trees=fields=0;
503
  huff_trees=0;
504
  huff_counts=0;
505
506
  /* Create temporary or join file */
507
508
  if (backup)
509
    VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2));
510
  else
511
    VOID(fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2+4+16));
512
  if (!test_only && result_table)
513
  {
514
    /* Make a new indexfile based on first file in list */
515
    uint length;
516
    uchar *buff;
517
    strmov(org_name,result_table);		/* Fix error messages */
518
    VOID(fn_format(new_name,result_table,"",MI_NAME_IEXT,2));
519
    if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
520
	< 0)
521
      goto err;
522
    length=(uint) share->base.keystart;
523
    if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
524
      goto err;
525
    if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
526
	my_write(join_isam_file,buff,length,
527
		 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
528
    {
529
      my_free(buff,MYF(0));
530
      goto err;
531
    }
532
    my_free(buff,MYF(0));
533
    VOID(fn_format(new_name,result_table,"",MI_NAME_DEXT,2));
534
  }
535
  else if (!tmp_dir[0])
536
    VOID(make_new_name(new_name,org_name));
537
  else
538
    VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
539
  if (!test_only &&
540
      (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
541
    goto err;
542
543
  /* Start calculating statistics */
544
545
  mrg->records=0;
546
  for (i=0 ; i < mrg->count ; i++)
547
    mrg->records+=mrg->file[i]->s->state.state.records;
548
549
  DBUG_PRINT("info", ("Compressing %s: (%lu records)",
550
                      result_table ? new_name : org_name,
551
                      (ulong) mrg->records));
552
  if (write_loop || verbose)
553
  {
554
    VOID(printf("Compressing %s: (%lu records)\n",
555
                result_table ? new_name : org_name, (ulong) mrg->records));
556
  }
557
  trees=fields=share->base.fields;
558
  huff_counts=init_huff_count(isam_file,mrg->records);
559
  QUICK_SAFEMALLOC;
560
561
  /*
562
    Read the whole data file(s) for statistics.
563
  */
564
  DBUG_PRINT("info", ("- Calculating statistics"));
565
  if (write_loop || verbose)
566
    VOID(printf("- Calculating statistics\n"));
567
  if (get_statistic(mrg,huff_counts))
568
    goto err;
569
  NORMAL_SAFEMALLOC;
570
  old_length=0;
571
  for (i=0; i < mrg->count ; i++)
572
    old_length+= (mrg->file[i]->s->state.state.data_file_length -
573
		  mrg->file[i]->s->state.state.empty);
574
575
  /*
576
    Create a global priority queue in preparation for making 
577
    temporary Huffman trees.
578
  */
579
  if (init_queue(&queue,256,0,0,compare_huff_elements,0))
580
    goto err;
581
582
  /*
583
    Check each column if we should use pre-space-compress, end-space-
584
    compress, empty-field-compress or zero-field-compress.
585
  */
586
  check_counts(huff_counts,fields,mrg->records);
587
588
  /*
589
    Build a Huffman tree for each column.
590
  */
591
  huff_trees=make_huff_trees(huff_counts,trees);
592
593
  /*
594
    If the packed lengths of combined columns is less then the sum of
595
    the non-combined columns, then create common Huffman trees for them.
596
    We do this only for byte compressed columns, not for distinct values
597
    compressed columns.
598
  */
599
  if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
600
    goto err;
601
602
  /*
603
    Assign codes to all byte or column values.
604
  */
605
  if (make_huff_decode_table(huff_trees,fields))
606
    goto err;
607
608
  /* Prepare a file buffer. */
609
  init_file_buffer(new_file,0);
610
611
  /*
612
    Reserve space in the target file for the fixed compressed file header.
613
  */
614
  file_buffer.pos_in_file=HEAD_LENGTH;
615
  if (! test_only)
616
    VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
617
618
  /*
619
    Write field infos: field type, pack type, length bits, tree number.
620
  */
621
  write_field_info(huff_counts,fields,used_trees);
622
623
  /*
624
    Write decode trees.
625
  */
626
  if (!(tot_elements=write_huff_tree(huff_trees,trees)))
627
    goto err;
628
629
  /*
630
    Calculate the total length of the compression info header.
631
    This includes the fixed compressed file header, the column compression
632
    type descriptions, and the decode trees.
633
  */
634
  header_length=(uint) file_buffer.pos_in_file+
635
    (uint) (file_buffer.pos-file_buffer.buffer);
636
637
  /*
638
    Compress the source file into the target file.
639
  */
640
  DBUG_PRINT("info", ("- Compressing file"));
641
  if (write_loop || verbose)
642
    VOID(printf("- Compressing file\n"));
643
  error=compress_isam_file(mrg,huff_counts);
644
  new_length=file_buffer.pos_in_file;
645
  if (!error && !test_only)
646
  {
647
    uchar buff[MEMMAP_EXTRA_MARGIN];		/* End marginal for memmap */
648
    bzero(buff,sizeof(buff));
649
    error=my_write(file_buffer.file,buff,sizeof(buff),
650
		   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
651
  }
652
653
  /*
654
    Write the fixed compressed file header.
655
  */
656
  if (!error)
657
    error=write_header(mrg,header_length,used_trees,tot_elements,
658
		       new_length);
659
660
  /* Flush the file buffer. */
661
  end_file_buffer();
662
663
  /* Display statistics. */
664
  DBUG_PRINT("info", ("Min record length: %6d  Max length: %6d  "
665
                      "Mean total length: %6ld\n",
666
                      mrg->min_pack_length, mrg->max_pack_length,
667
                      (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
668
  if (verbose && mrg->records)
669
    VOID(printf("Min record length: %6d   Max length: %6d   "
670
                "Mean total length: %6ld\n", mrg->min_pack_length,
671
                mrg->max_pack_length, (ulong) (new_length/mrg->records)));
672
673
  /* Close source and target file. */
674
  if (!test_only)
675
  {
676
    error|=my_close(new_file,MYF(MY_WME));
677
    if (!result_table)
678
    {
679
      error|=my_close(isam_file->dfile,MYF(MY_WME));
680
      isam_file->dfile= -1;		/* Tell mi_close file is closed */
681
    }
682
  }
683
684
  /* Cleanup. */
685
  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
686
  if (! test_only && ! error)
687
  {
688
    if (result_table)
689
    {
690
      error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
691
    }
692
    else
693
    {
694
      if (backup)
695
      {
696
	if (my_rename(org_name,make_old_name(temp_name,isam_file->filename),
697
		      MYF(MY_WME)))
698
	  error=1;
699
	else
700
	{
701
	  if (tmp_dir[0])
702
	    error=my_copy(new_name,org_name,MYF(MY_WME));
703
	  else
704
	    error=my_rename(new_name,org_name,MYF(MY_WME));
705
	  if (!error)
706
          {
707
	    VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
708
            if (tmp_dir[0])
709
              VOID(my_delete(new_name,MYF(MY_WME)));
710
          }
711
	}
712
      }
713
      else
714
      {
715
	if (tmp_dir[0])
716
        {
717
	  error=my_copy(new_name,org_name,
718
			MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
719
          if (!error)
720
            VOID(my_delete(new_name,MYF(MY_WME)));
721
        }
722
	else
723
	  error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
724
      }
725
      if (! error)
726
	error=save_state(isam_file,mrg,new_length,glob_crc);
727
    }
728
  }
729
  error|=mrg_close(mrg);
730
  if (join_isam_file >= 0)
731
    error|=my_close(join_isam_file,MYF(MY_WME));
732
  if (error)
733
  {
734
    VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
735
    VOID(my_delete(new_name,MYF(MY_WME)));
736
    DBUG_RETURN(-1);
737
  }
738
  if (write_loop || verbose)
739
  {
740
    if (old_length)
741
      VOID(printf("%.4g%%     \n",
742
                  (((longlong) (old_length - new_length)) * 100.0 /
743
                   (longlong) old_length)));
744
    else
745
      puts("Empty file saved in compressed format");
746
  }
747
  DBUG_RETURN(0);
748
749
 err:
750
  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
751
  if (new_file >= 0)
752
    VOID(my_close(new_file,MYF(0)));
753
  if (join_isam_file >= 0)
754
    VOID(my_close(join_isam_file,MYF(0)));
755
  mrg_close(mrg);
756
  VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
757
  DBUG_RETURN(-1);
758
}
759
760
	/* Init a huff_count-struct for each field and init it */
761
762
static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
763
{
764
  register uint i;
765
  register HUFF_COUNTS *count;
766
  if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
767
					sizeof(HUFF_COUNTS),
768
					MYF(MY_ZEROFILL | MY_WME))))
769
  {
770
    for (i=0 ; i < info->s->base.fields ; i++)
771
    {
772
      enum en_fieldtype type;
773
      count[i].field_length=info->s->rec[i].length;
774
      type= count[i].field_type= (enum en_fieldtype) info->s->rec[i].type;
775
      if (type == FIELD_INTERVALL ||
776
	  type == FIELD_CONSTANT ||
777
	  type == FIELD_ZERO)
778
	type = FIELD_NORMAL;
779
      if (count[i].field_length <= 8 &&
780
	  (type == FIELD_NORMAL ||
781
	   type == FIELD_SKIP_ZERO))
782
	count[i].max_zero_fill= count[i].field_length;
783
      /*
784
        For every column initialize a tree, which is used to detect distinct
785
        column values. 'int_tree' works together with 'tree_buff' and
786
        'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
787
        This is accomplished by '-1' as the element size.
788
      */
789
      init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
790
		NULL);
791
      if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
792
	count[i].tree_pos=count[i].tree_buff =
793
	  my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
794
		    MYF(MY_WME));
795
    }
796
  }
797
  return count;
798
}
799
800
801
	/* Free memory used by counts and trees */
802
803
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
804
					   HUFF_COUNTS *huff_counts,
805
					   uint fields)
806
{
807
  register uint i;
808
809
  if (huff_trees)
810
  {
811
    for (i=0 ; i < trees ; i++)
812
    {
813
      if (huff_trees[i].element_buffer)
814
	my_free((uchar*) huff_trees[i].element_buffer,MYF(0));
815
      if (huff_trees[i].code)
816
	my_free((uchar*) huff_trees[i].code,MYF(0));
817
    }
818
    my_free((uchar*) huff_trees,MYF(0));
819
  }
820
  if (huff_counts)
821
  {
822
    for (i=0 ; i < fields ; i++)
823
    {
824
      if (huff_counts[i].tree_buff)
825
      {
826
	my_free((uchar*) huff_counts[i].tree_buff,MYF(0));
827
	delete_tree(&huff_counts[i].int_tree);
828
      }
829
    }
830
    my_free((uchar*) huff_counts,MYF(0));
831
  }
832
  delete_queue(&queue);		/* This is safe to free */
833
  return;
834
}
835
836
	/* Read through old file and gather some statistics */
837
838
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
839
{
840
  int error;
841
  uint length;
842
  ulong reclength,max_blob_length;
843
  uchar *record,*pos,*next_pos,*end_pos,*start_pos;
844
  ha_rows record_count;
845
  my_bool static_row_size;
846
  HUFF_COUNTS *count,*end_count;
847
  TREE_ELEMENT *element;
848
  DBUG_ENTER("get_statistic");
849
850
  reclength=mrg->file[0]->s->base.reclength;
851
  record=(uchar*) my_alloca(reclength);
852
  end_count=huff_counts+mrg->file[0]->s->base.fields;
853
  record_count=0; glob_crc=0;
854
  max_blob_length=0;
855
856
  /* Check how to calculate checksum */
857
  static_row_size=1;
858
  for (count=huff_counts ; count < end_count ; count++)
859
  {
860
    if (count->field_type == FIELD_BLOB ||
861
        count->field_type == FIELD_VARCHAR)
862
    {
863
      static_row_size=0;
864
      break;
865
    }
866
  }
867
868
  mrg_reset(mrg);
869
  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
870
  {
871
    ulong tot_blob_length=0;
872
    if (! error)
873
    {
874
      /* glob_crc is a checksum over all bytes of all records. */
875
      if (static_row_size)
876
	glob_crc+=mi_static_checksum(mrg->file[0],record);
877
      else
878
	glob_crc+=mi_checksum(mrg->file[0],record);
879
880
      /* Count the incidence of values separately for every column. */
881
      for (pos=record,count=huff_counts ;
882
	   count < end_count ;
883
	   count++,
884
	   pos=next_pos)
885
      {
886
	next_pos=end_pos=(start_pos=pos)+count->field_length;
887
888
	/*
889
          Put the whole column value in a tree if there is room for it.
890
          'int_tree' is used to quickly check for duplicate values.
891
          'tree_buff' collects as many distinct column values as
892
          possible. If the field length is > 1, it is tree_buff_length,
893
          else 2 bytes. Each value is 'field_length' bytes big. If there
894
          are more distinct column values than fit into the buffer, we
895
          give up with this tree. BLOBs and VARCHARs do not have a
896
          tree_buff as it can only be used with fixed length columns.
897
          For the special case of field length == 1, we handle only the
898
          case that there is only one distinct value in the table(s).
899
          Otherwise, we can have a maximum of 256 distinct values. This
900
          is then handled by the normal Huffman tree build.
901
902
          Another limit for collecting distinct column values is the
903
          number of values itself. Since we would need to build a
904
          Huffman tree for the values, we are limited by the 'IS_OFFSET'
905
          constant. This constant expresses a bit which is used to
906
          determine if a tree element holds a final value or an offset
907
          to a child element. Hence, all values and offsets need to be
908
          smaller than 'IS_OFFSET'. A tree element is implemented with
909
          two integer values, one for the left branch and one for the
910
          right branch. For the extreme case that the first element
911
          points to the last element, the number of integers in the tree
912
          must be less or equal to IS_OFFSET. So the number of elements
913
          must be less or equal to IS_OFFSET / 2.
914
915
          WARNING: At first, we insert a pointer into the record buffer
916
          as the key for the tree. If we got a new distinct value, which
917
          is really inserted into the tree, instead of being counted
918
          only, we will copy the column value from the record buffer to
919
          'tree_buff' and adjust the key pointer of the tree accordingly.
920
        */
921
	if (count->tree_buff)
922
	{
923
	  global_count=count;
924
	  if (!(element=tree_insert(&count->int_tree,pos, 0, 
925
				    count->int_tree.custom_arg)) ||
926
	      (element->count == 1 &&
927
	       (count->tree_buff + tree_buff_length <
928
                count->tree_pos + count->field_length)) ||
929
              (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
930
	      (count->field_length == 1 &&
931
	       count->int_tree.elements_in_tree > 1))
932
	  {
933
	    delete_tree(&count->int_tree);
934
	    my_free(count->tree_buff,MYF(0));
935
	    count->tree_buff=0;
936
	  }
937
	  else
938
	  {
939
            /*
940
              If tree_insert() succeeds, it either creates a new element
941
              or increments the counter of an existing element.
942
            */
943
	    if (element->count == 1)
944
	    {
945
              /* Copy the new column value into 'tree_buff'. */
946
	      memcpy(count->tree_pos,pos,(size_t) count->field_length);
947
              /* Adjust the key pointer in the tree. */
948
	      tree_set_pointer(element,count->tree_pos);
949
              /* Point behind the last column value so far. */
950
	      count->tree_pos+=count->field_length;
951
	    }
952
	  }
953
	}
954
955
	/* Save character counters and space-counts and zero-field-counts */
956
	if (count->field_type == FIELD_NORMAL ||
957
	    count->field_type == FIELD_SKIP_ENDSPACE)
958
	{
959
          /* Ignore trailing space. */
960
	  for ( ; end_pos > pos ; end_pos--)
961
	    if (end_pos[-1] != ' ')
962
	      break;
963
          /* Empty fields are just counted. Go to the next record. */
964
	  if (end_pos == pos)
965
	  {
966
	    count->empty_fields++;
967
	    count->max_zero_fill=0;
968
	    continue;
969
	  }
970
          /*
971
            Count the total of all trailing spaces and the number of
972
            short trailing spaces. Remember the longest trailing space.
973
          */
974
	  length= (uint) (next_pos-end_pos);
975
	  count->tot_end_space+=length;
976
	  if (length < 8)
977
	    count->end_space[length]++;
978
	  if (count->max_end_space < length)
979
	    count->max_end_space = length;
980
	}
981
982
	if (count->field_type == FIELD_NORMAL ||
983
	    count->field_type == FIELD_SKIP_PRESPACE)
984
	{
985
          /* Ignore leading space. */
986
	  for (pos=start_pos; pos < end_pos ; pos++)
987
	    if (pos[0] != ' ')
988
	      break;
989
          /* Empty fields are just counted. Go to the next record. */
990
	  if (end_pos == pos)
991
	  {
992
	    count->empty_fields++;
993
	    count->max_zero_fill=0;
994
	    continue;
995
	  }
996
          /*
997
            Count the total of all leading spaces and the number of
998
            short leading spaces. Remember the longest leading space.
999
          */
1000
	  length= (uint) (pos-start_pos);
1001
	  count->tot_pre_space+=length;
1002
	  if (length < 8)
1003
	    count->pre_space[length]++;
1004
	  if (count->max_pre_space < length)
1005
	    count->max_pre_space = length;
1006
	}
1007
1008
        /* Calculate pos, end_pos, and max_length for variable length fields. */
1009
	if (count->field_type == FIELD_BLOB)
1010
	{
1011
	  uint field_length=count->field_length -portable_sizeof_char_ptr;
1012
	  ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
1013
	  memcpy_fixed((char*) &pos,  start_pos+field_length,sizeof(char*));
1014
	  end_pos=pos+blob_length;
1015
	  tot_blob_length+=blob_length;
1016
	  set_if_bigger(count->max_length,blob_length);
1017
	}
1018
	else if (count->field_type == FIELD_VARCHAR)
1019
	{
1020
          uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1021
	  length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1022
                   uint2korr(start_pos));
1023
	  pos= start_pos+pack_length;
1024
	  end_pos= pos+length;
1025
	  set_if_bigger(count->max_length,length);
1026
	}
1027
1028
        /* Evaluate 'max_zero_fill' for short fields. */
1029
	if (count->field_length <= 8 &&
1030
	    (count->field_type == FIELD_NORMAL ||
1031
	     count->field_type == FIELD_SKIP_ZERO))
1032
	{
1033
	  uint i;
1034
          /* Zero fields are just counted. Go to the next record. */
1035
	  if (!memcmp((uchar*) start_pos,zero_string,count->field_length))
1036
	  {
1037
	    count->zero_fields++;
1038
	    continue;
1039
	  }
1040
          /*
1041
            max_zero_fill starts with field_length. It is decreased every
1042
            time a shorter "zero trailer" is found. It is set to zero when
1043
            an empty field is found (see above). This suggests that the
1044
            variable should be called 'min_zero_fill'.
1045
          */
1046
	  for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1047
	       i++) ;
1048
	  if (i < count->max_zero_fill)
1049
	    count->max_zero_fill=i;
1050
	}
1051
1052
        /* Ignore zero fields and check fields. */
1053
	if (count->field_type == FIELD_ZERO ||
1054
	    count->field_type == FIELD_CHECK)
1055
	  continue;
1056
1057
        /*
1058
          Count the incidence of every byte value in the
1059
          significant field value.
1060
        */
1061
	for ( ; pos < end_pos ; pos++)
1062
	  count->counts[(uchar) *pos]++;
1063
1064
        /* Step to next field. */
1065
      }
1066
1067
      if (tot_blob_length > max_blob_length)
1068
	max_blob_length=tot_blob_length;
1069
      record_count++;
1070
      if (write_loop && record_count % WRITE_COUNT == 0)
1071
      {
1072
	VOID(printf("%lu\r", (ulong) record_count));
1073
        VOID(fflush(stdout));
1074
      }
1075
    }
1076
    else if (error != HA_ERR_RECORD_DELETED)
1077
    {
1078
      VOID(fprintf(stderr, "Got error %d while reading rows", error));
1079
      break;
1080
    }
1081
1082
    /* Step to next record. */
1083
  }
1084
  if (write_loop)
1085
  {
1086
    VOID(printf("            \r"));
1087
    VOID(fflush(stdout));
1088
  }
1089
1090
  /*
1091
    If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1092
    codes.
1093
  */
1094
  DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1095
1096
  DBUG_PRINT("info", ("Found the following number of incidents "
1097
                      "of the byte codes:"));
1098
  if (verbose >= 2)
1099
    VOID(printf("Found the following number of incidents "
1100
                "of the byte codes:\n"));
1101
  for (count= huff_counts ; count < end_count; count++)
1102
  {
1103
    uint      idx;
1104
    my_off_t  total_count;
1105
    char      llbuf[32];
1106
1107
    DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1108
    if (verbose >= 2)
1109
      VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
1110
    if (count->tree_buff)
1111
    {
1112
      DBUG_PRINT("info", ("number of distinct values: %u",
1113
                          (uint) ((count->tree_pos - count->tree_buff) /
1114
                                  count->field_length)));
1115
      if (verbose >= 2)
1116
        VOID(printf("number of distinct values: %u\n",
1117
                    (uint) ((count->tree_pos - count->tree_buff) /
1118
                            count->field_length)));
1119
    }
1120
    total_count= 0;
1121
    for (idx= 0; idx < 256; idx++)
1122
    {
1123
      if (count->counts[idx])
1124
      {
1125
        total_count+= count->counts[idx];
1126
        DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1127
                            llstr((longlong) count->counts[idx], llbuf)));
1128
        if (verbose >= 2)
1129
          VOID(printf("counts[0x%02x]: %12s\n", idx,
1130
                      llstr((longlong) count->counts[idx], llbuf)));
1131
      }
1132
    }
1133
    DBUG_PRINT("info", ("total:        %12s", llstr((longlong) total_count,
1134
                                                    llbuf)));
1135
    if ((verbose >= 2) && total_count)
1136
    {
1137
      VOID(printf("total:        %12s\n",
1138
                  llstr((longlong) total_count, llbuf)));
1139
    }
1140
  }
1141
1142
  mrg->records=record_count;
1143
  mrg->max_blob_length=max_blob_length;
1144
  my_afree((uchar*) record);
1145
  DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1146
}
1147
1148
static int compare_huff_elements(void *not_used __attribute__((unused)),
1149
				 uchar *a, uchar *b)
1150
{
1151
  return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1152
    (*((my_off_t*) a) == *((my_off_t*) b)  ? 0 : 1);
1153
}
1154
1155
	/* Check each tree if we should use pre-space-compress, end-space-
1156
	   compress, empty-field-compress or zero-field-compress */
1157
1158
static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1159
			 my_off_t records)
1160
{
1161
  uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1162
  my_off_t old_length,new_length,length;
1163
  DBUG_ENTER("check_counts");
1164
1165
  bzero((uchar*) field_count,sizeof(field_count));
1166
  space_fields=fill_zero_fields=0;
1167
1168
  for (; trees-- ; huff_counts++)
1169
  {
1170
    if (huff_counts->field_type == FIELD_BLOB)
1171
    {
1172
      huff_counts->length_bits=max_bit(huff_counts->max_length);
1173
      goto found_pack;
1174
    }
1175
    else if (huff_counts->field_type == FIELD_VARCHAR)
1176
    {
1177
      huff_counts->length_bits=max_bit(huff_counts->max_length);
1178
      goto found_pack;
1179
    }
1180
    else if (huff_counts->field_type == FIELD_CHECK)
1181
    {
1182
      huff_counts->bytes_packed=0;
1183
      huff_counts->counts[0]=0;
1184
      goto found_pack;
1185
    }
1186
1187
    huff_counts->field_type=FIELD_NORMAL;
1188
    huff_counts->pack_type=0;
1189
1190
    /* Check for zero-filled records (in this column), or zero records. */
1191
    if (huff_counts->zero_fields || ! records)
1192
    {
1193
      my_off_t old_space_count;
1194
      /*
1195
        If there are only zero filled records (in this column),
1196
        or no records at all, we are done.
1197
      */
1198
      if (huff_counts->zero_fields == records)
1199
      {
1200
	huff_counts->field_type= FIELD_ZERO;
1201
	huff_counts->bytes_packed=0;
1202
	huff_counts->counts[0]=0;
1203
	goto found_pack;
1204
      }
1205
      /* Remeber the number of significant spaces. */
1206
      old_space_count=huff_counts->counts[' '];
1207
      /* Add all leading and trailing spaces. */
1208
      huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1209
                                  huff_counts->tot_pre_space +
1210
                                  huff_counts->empty_fields *
1211
                                  huff_counts->field_length);
1212
      /* Check, what the compressed length of this would be. */
1213
      old_length=calc_packed_length(huff_counts,0)+records/8;
1214
      /* Get the number of zero bytes. */
1215
      length=huff_counts->zero_fields*huff_counts->field_length;
1216
      /* Add it to the counts. */
1217
      huff_counts->counts[0]+=length;
1218
      /* Check, what the compressed length of this would be. */
1219
      new_length=calc_packed_length(huff_counts,0);
1220
      /* If the compression without the zeroes would be shorter, we are done. */
1221
      if (old_length < new_length && huff_counts->field_length > 1)
1222
      {
1223
	huff_counts->field_type=FIELD_SKIP_ZERO;
1224
	huff_counts->counts[0]-=length;
1225
	huff_counts->bytes_packed=old_length- records/8;
1226
	goto found_pack;
1227
      }
1228
      /* Remove the insignificant spaces, but keep the zeroes. */
1229
      huff_counts->counts[' ']=old_space_count;
1230
    }
1231
    /* Check, what the compressed length of this column would be. */
1232
    huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1233
1234
    /*
1235
      If there are enough empty records (in this column),
1236
      treating them specially may pay off.
1237
    */
1238
    if (huff_counts->empty_fields)
1239
    {
1240
      if (huff_counts->field_length > 2 &&
1241
	  huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1242
	  (1+max_bit(max(huff_counts->max_pre_space,
1243
			 huff_counts->max_end_space))) <
1244
	  records * max_bit(huff_counts->field_length))
1245
      {
1246
	huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1247
      }
1248
      else
1249
      {
1250
	length=huff_counts->empty_fields*huff_counts->field_length;
1251
	if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1252
	{
1253
	  huff_counts->tot_end_space+=length;
1254
	  huff_counts->max_end_space=huff_counts->field_length;
1255
	  if (huff_counts->field_length < 8)
1256
	    huff_counts->end_space[huff_counts->field_length]+=
1257
	      huff_counts->empty_fields;
1258
	}
1259
	if (huff_counts->tot_pre_space)
1260
	{
1261
	  huff_counts->tot_pre_space+=length;
1262
	  huff_counts->max_pre_space=huff_counts->field_length;
1263
	  if (huff_counts->field_length < 8)
1264
	    huff_counts->pre_space[huff_counts->field_length]+=
1265
	      huff_counts->empty_fields;
1266
	}
1267
      }
1268
    }
1269
1270
    /*
1271
      If there are enough trailing spaces (in this column),
1272
      treating them specially may pay off.
1273
    */
1274
    if (huff_counts->tot_end_space)
1275
    {
1276
      huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1277
      if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1278
			      huff_counts->end_space,
1279
			      huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1280
	goto found_pack;
1281
      huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1282
    }
1283
1284
    /*
1285
      If there are enough leading spaces (in this column),
1286
      treating them specially may pay off.
1287
    */
1288
    if (huff_counts->tot_pre_space)
1289
    {
1290
      if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1291
			      huff_counts->pre_space,
1292
			      huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1293
	goto found_pack;
1294
    }
1295
1296
  found_pack:			/* Found field-packing */
1297
1298
    /* Test if we can use zero-fill */
1299
1300
    if (huff_counts->max_zero_fill &&
1301
	(huff_counts->field_type == FIELD_NORMAL ||
1302
	 huff_counts->field_type == FIELD_SKIP_ZERO))
1303
    {
1304
      huff_counts->counts[0]-=huff_counts->max_zero_fill*
1305
	(huff_counts->field_type == FIELD_SKIP_ZERO ?
1306
	 records - huff_counts->zero_fields : records);
1307
      huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1308
      huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1309
    }
1310
1311
    /* Test if intervall-field is better */
1312
1313
    if (huff_counts->tree_buff)
1314
    {
1315
      HUFF_TREE tree;
1316
1317
      DBUG_EXECUTE_IF("forceintervall",
1318
                      huff_counts->bytes_packed= ~ (my_off_t) 0;);
1319
      tree.element_buffer=0;
1320
      if (!make_huff_tree(&tree,huff_counts) &&
1321
	  tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1322
      {
1323
	if (tree.elements == 1)
1324
	  huff_counts->field_type=FIELD_CONSTANT;
1325
	else
1326
	  huff_counts->field_type=FIELD_INTERVALL;
1327
	huff_counts->pack_type=0;
1328
      }
1329
      else
1330
      {
1331
	my_free((uchar*) huff_counts->tree_buff,MYF(0));
1332
	delete_tree(&huff_counts->int_tree);
1333
	huff_counts->tree_buff=0;
1334
      }
1335
      if (tree.element_buffer)
1336
	my_free((uchar*) tree.element_buffer,MYF(0));
1337
    }
1338
    if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1339
      space_fields++;
1340
    if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1341
      fill_zero_fields++;
1342
    field_count[huff_counts->field_type]++;
1343
  }
1344
  DBUG_PRINT("info", ("normal:    %3d  empty-space:     %3d  "
1345
                      "empty-zero:       %3d  empty-fill: %3d",
1346
                      field_count[FIELD_NORMAL],space_fields,
1347
                      field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1348
  DBUG_PRINT("info", ("pre-space: %3d  end-space:       %3d  "
1349
                      "intervall-fields: %3d  zero:       %3d",
1350
                      field_count[FIELD_SKIP_PRESPACE],
1351
                      field_count[FIELD_SKIP_ENDSPACE],
1352
                      field_count[FIELD_INTERVALL],
1353
                      field_count[FIELD_ZERO]));
1354
  if (verbose)
1355
    VOID(printf("\nnormal:    %3d  empty-space:     %3d  "
1356
                "empty-zero:       %3d  empty-fill: %3d\n"
1357
                "pre-space: %3d  end-space:       %3d  "
1358
                "intervall-fields: %3d  zero:       %3d\n",
1359
                field_count[FIELD_NORMAL],space_fields,
1360
                field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1361
                field_count[FIELD_SKIP_PRESPACE],
1362
                field_count[FIELD_SKIP_ENDSPACE],
1363
                field_count[FIELD_INTERVALL],
1364
                field_count[FIELD_ZERO]));
1365
  DBUG_VOID_RETURN;
1366
}
1367
1368
	/* Test if we can use space-compression and empty-field-compression */
1369
1370
static int
1371
test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1372
		    uint max_space_length, my_off_t *space_counts,
1373
		    my_off_t tot_space_count, enum en_fieldtype field_type)
1374
{
1375
  int min_pos;
1376
  uint length_bits,i;
1377
  my_off_t space_count,min_space_count,min_pack,new_length,skip;
1378
1379
  length_bits=max_bit(max_space_length);
1380
1381
		/* Default no end_space-packing */
1382
  space_count=huff_counts->counts[(uint) ' '];
1383
  min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1384
  min_pack=calc_packed_length(huff_counts,0);
1385
  min_pos= -2;
1386
  huff_counts->counts[(uint) ' ']=space_count;
1387
1388
	/* Test with allways space-count */
1389
  new_length=huff_counts->bytes_packed+length_bits*records/8;
1390
  if (new_length+1 < min_pack)
1391
  {
1392
    min_pos= -1;
1393
    min_pack=new_length;
1394
    min_space_count=space_count;
1395
  }
1396
	/* Test with length-flag */
1397
  for (skip=0L, i=0 ; i < 8 ; i++)
1398
  {
1399
    if (space_counts[i])
1400
    {
1401
      if (i)
1402
	huff_counts->counts[(uint) ' ']+=space_counts[i];
1403
      skip+=huff_counts->pre_space[i];
1404
      new_length=calc_packed_length(huff_counts,0)+
1405
	(records+(records-skip)*(1+length_bits))/8;
1406
      if (new_length < min_pack)
1407
      {
1408
	min_pos=(int) i;
1409
	min_pack=new_length;
1410
	min_space_count=huff_counts->counts[(uint) ' '];
1411
      }
1412
    }
1413
  }
1414
1415
  huff_counts->counts[(uint) ' ']=min_space_count;
1416
  huff_counts->bytes_packed=min_pack;
1417
  switch (min_pos) {
1418
  case -2:
1419
    return(0);				/* No space-compress */
1420
  case -1:				/* Always space-count */
1421
    huff_counts->field_type=field_type;
1422
    huff_counts->min_space=0;
1423
    huff_counts->length_bits=max_bit(max_space_length);
1424
    break;
1425
  default:
1426
    huff_counts->field_type=field_type;
1427
    huff_counts->min_space=(uint) min_pos;
1428
    huff_counts->pack_type|=PACK_TYPE_SELECTED;
1429
    huff_counts->length_bits=max_bit(max_space_length);
1430
    break;
1431
  }
1432
  return(1);				/* Using space-compress */
1433
}
1434
1435
1436
	/* Make a huff_tree of each huff_count */
1437
1438
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1439
{
1440
  uint tree;
1441
  HUFF_TREE *huff_tree;
1442
  DBUG_ENTER("make_huff_trees");
1443
1444
  if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1445
					 MYF(MY_WME | MY_ZEROFILL))))
1446
    DBUG_RETURN(0);
1447
1448
  for (tree=0 ; tree < trees ; tree++)
1449
  {
1450
    if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1451
    {
1452
      while (tree--)
1453
	my_free((uchar*) huff_tree[tree].element_buffer,MYF(0));
1454
      my_free((uchar*) huff_tree,MYF(0));
1455
      DBUG_RETURN(0);
1456
    }
1457
  }
1458
  DBUG_RETURN(huff_tree);
1459
}
1460
1461
/*
1462
  Build a Huffman tree.
1463
1464
  SYNOPSIS
1465
    make_huff_tree()
1466
    huff_tree                   The Huffman tree.
1467
    huff_counts                 The counts.
1468
1469
  DESCRIPTION
1470
    Build a Huffman tree according to huff_counts->counts or
1471
    huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1472
    tree_buff_length of distinct column values. In that case, whole
1473
    values can be Huffman encoded instead of single bytes.
1474
1475
  RETURN
1476
    0           OK
1477
    != 0        Error
1478
*/
1479
1480
static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1481
{
1482
  uint i,found,bits_packed,first,last;
1483
  my_off_t bytes_packed;
1484
  HUFF_ELEMENT *a,*b,*new_huff_el;
1485
1486
  first=last=0;
1487
  if (huff_counts->tree_buff)
1488
  {
1489
    /* Calculate the number of distinct values in tree_buff. */
1490
    found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1491
      huff_counts->field_length;
1492
    first=0; last=found-1;
1493
  }
1494
  else
1495
  {
1496
    /* Count the number of byte codes found in the column. */
1497
    for (i=found=0 ; i < 256 ; i++)
1498
    {
1499
      if (huff_counts->counts[i])
1500
      {
1501
	if (! found++)
1502
	  first=i;
1503
	last=i;
1504
      }
1505
    }
1506
    if (found < 2)
1507
      found=2;
1508
  }
1509
1510
  /* When using 'tree_buff' we can have more that 256 values. */
1511
  if (queue.max_elements < found)
1512
  {
1513
    delete_queue(&queue);
1514
    if (init_queue(&queue,found,0,0,compare_huff_elements,0))
1515
      return -1;
1516
  }
1517
1518
  /* Allocate or reallocate an element buffer for the Huffman tree. */
1519
  if (!huff_tree->element_buffer)
1520
  {
1521
    if (!(huff_tree->element_buffer=
1522
	 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1523
      return 1;
1524
  }
1525
  else
1526
  {
1527
    HUFF_ELEMENT *temp;
1528
    if (!(temp=
1529
	  (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1530
				     found*2*sizeof(HUFF_ELEMENT),
1531
				     MYF(MY_WME))))
1532
      return 1;
1533
    huff_tree->element_buffer=temp;
1534
  }
1535
1536
  huff_counts->tree=huff_tree;
1537
  huff_tree->counts=huff_counts;
1538
  huff_tree->min_chr=first;
1539
  huff_tree->max_chr=last;
1540
  huff_tree->char_bits=max_bit(last-first);
1541
  huff_tree->offset_bits=max_bit(found-1)+1;
1542
1543
  if (huff_counts->tree_buff)
1544
  {
1545
    huff_tree->elements=0;
1546
    huff_tree->tree_pack_length=(1+15+16+5+5+
1547
				 (huff_tree->char_bits+1)*found+
1548
				 (huff_tree->offset_bits+1)*
1549
				 (found-2)+7)/8 +
1550
				   (uint) (huff_tree->counts->tree_pos-
1551
					   huff_tree->counts->tree_buff);
1552
    /*
1553
      Put a HUFF_ELEMENT into the queue for every distinct column value.
1554
1555
      tree_walk() calls save_counts_in_queue() for every element in
1556
      'int_tree'. This takes elements from the target trees element
1557
      buffer and places references to them into the buffer of the
1558
      priority queue. We insert in column value order, but the order is
1559
      in fact irrelevant here. We will establish the correct order
1560
      later.
1561
    */
1562
    tree_walk(&huff_counts->int_tree,
1563
	      (int (*)(void*, element_count,void*)) save_counts_in_queue,
1564
	      (uchar*) huff_tree, left_root_right);
1565
  }
1566
  else
1567
  {
1568
    huff_tree->elements=found;
1569
    huff_tree->tree_pack_length=(9+9+5+5+
1570
				 (huff_tree->char_bits+1)*found+
1571
				 (huff_tree->offset_bits+1)*
1572
				 (found-2)+7)/8;
1573
    /*
1574
      Put a HUFF_ELEMENT into the queue for every byte code found in the column.
1575
1576
      The elements are taken from the target trees element buffer.
1577
      Instead of using queue_insert(), we just place references to the
1578
      elements into the buffer of the priority queue. We insert in byte
1579
      value order, but the order is in fact irrelevant here. We will
1580
      establish the correct order later.
1581
    */
1582
    for (i=first, found=0 ; i <= last ; i++)
1583
    {
1584
      if (huff_counts->counts[i])
1585
      {
1586
	new_huff_el=huff_tree->element_buffer+(found++);
1587
	new_huff_el->count=huff_counts->counts[i];
1588
	new_huff_el->a.leaf.null=0;
1589
	new_huff_el->a.leaf.element_nr=i;
1590
	queue.root[found]=(uchar*) new_huff_el;
1591
      }
1592
    }
1593
    /*
1594
      If there is only a single byte value in this field in all records,
1595
      add a second element with zero incidence. This is required to enter
1596
      the loop, which builds the Huffman tree.
1597
    */
1598
    while (found < 2)
1599
    {
1600
      new_huff_el=huff_tree->element_buffer+(found++);
1601
      new_huff_el->count=0;
1602
      new_huff_el->a.leaf.null=0;
1603
      if (last)
1604
	new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1605
      else
1606
	new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1607
      queue.root[found]=(uchar*) new_huff_el;
1608
    }
1609
  }
1610
1611
  /* Make a queue from the queue buffer. */
1612
  queue.elements=found;
1613
1614
  /*
1615
    Make a priority queue from the queue. Construct its index so that we
1616
    have a partially ordered tree.
1617
  */
1618
  for (i=found/2 ; i > 0 ; i--)
1619
    _downheap(&queue,i);
1620
1621
  /* The Huffman algorithm. */
1622
  bytes_packed=0; bits_packed=0;
1623
  for (i=1 ; i < found ; i++)
1624
  {
1625
    /*
1626
      Pop the top element from the queue (the one with the least incidence).
1627
      Popping from a priority queue includes a re-ordering of the queue,
1628
      to get the next least incidence element to the top.
1629
    */
1630
    a=(HUFF_ELEMENT*) queue_remove(&queue,0);
1631
    /*
1632
      Copy the next least incidence element. The queue implementation
1633
      reserves root[0] for temporary purposes. root[1] is the top.
1634
    */
1635
    b=(HUFF_ELEMENT*) queue.root[1];
1636
    /* Get a new element from the element buffer. */
1637
    new_huff_el=huff_tree->element_buffer+found+i;
1638
    /* The new element gets the sum of the two least incidence elements. */
1639
    new_huff_el->count=a->count+b->count;
1640
    /*
1641
      The Huffman algorithm assigns another bit to the code for a byte
1642
      every time that bytes incidence is combined (directly or indirectly)
1643
      to a new element as one of the two least incidence elements.
1644
      This means that one more bit per incidence of that byte is required
1645
      in the resulting file. So we add the new combined incidence as the
1646
      number of bits by which the result grows.
1647
    */
1648
    bits_packed+=(uint) (new_huff_el->count & 7);
1649
    bytes_packed+=new_huff_el->count/8;
1650
    /* The new element points to its children, lesser in left.  */
1651
    new_huff_el->a.nod.left=a;
1652
    new_huff_el->a.nod.right=b;
1653
    /*
1654
      Replace the copied top element by the new element and re-order the
1655
      queue.
1656
    */
1657
    queue.root[1]=(uchar*) new_huff_el;
1658
    queue_replaced(&queue);
1659
  }
1660
  huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1661
  huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1662
  return 0;
1663
}
1664
1665
static int compare_tree(void* cmp_arg __attribute__((unused)),
1666
			register const uchar *s, register const uchar *t)
1667
{
1668
  uint length;
1669
  for (length=global_count->field_length; length-- ;)
1670
    if (*s++ != *t++)
1671
      return (int) s[-1] - (int) t[-1];
1672
  return 0;
1673
}
1674
1675
/*
1676
  Organize distinct column values and their incidences into a priority queue.
1677
1678
  SYNOPSIS
1679
    save_counts_in_queue()
1680
    key                         The column value.
1681
    count                       The incidence of this value.
1682
    tree                        The Huffman tree to be built later.
1683
1684
  DESCRIPTION
1685
    We use the element buffer of the targeted tree. The distinct column
1686
    values are organized in a priority queue first. The Huffman
1687
    algorithm will later organize the elements into a Huffman tree. For
1688
    the time being, we just place references to the elements into the
1689
    queue buffer. The buffer will later be organized into a priority
1690
    queue.
1691
1692
  RETURN
1693
    0
1694
 */
1695
1696
static int save_counts_in_queue(uchar *key, element_count count,
1697
				HUFF_TREE *tree)
1698
{
1699
  HUFF_ELEMENT *new_huff_el;
1700
1701
  new_huff_el=tree->element_buffer+(tree->elements++);
1702
  new_huff_el->count=count;
1703
  new_huff_el->a.leaf.null=0;
1704
  new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1705
    tree->counts->field_length;
1706
  queue.root[tree->elements]=(uchar*) new_huff_el;
1707
  return 0;
1708
}
1709
1710
1711
/*
1712
  Calculate length of file if given counts should be used.
1713
1714
  SYNOPSIS
1715
    calc_packed_length()
1716
    huff_counts                 The counts for a column of the table(s).
1717
    add_tree_lenght             If the decode tree length should be added.
1718
1719
  DESCRIPTION
1720
    We need to follow the Huffman algorithm until we know, how many bits
1721
    are required for each byte code. But we do not need the resulting
1722
    Huffman tree. Hence, we can leave out some steps which are essential
1723
    in make_huff_tree().
1724
1725
  RETURN
1726
    Number of bytes required to compress this table column.
1727
*/
1728
1729
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1730
				   uint add_tree_lenght)
1731
{
1732
  uint i,found,bits_packed,first,last;
1733
  my_off_t bytes_packed;
1734
  HUFF_ELEMENT element_buffer[256];
1735
  DBUG_ENTER("calc_packed_length");
1736
1737
  /* 
1738
    WARNING: We use a small hack for efficiency: Instead of placing
1739
    references to HUFF_ELEMENTs into the queue, we just insert
1740
    references to the counts of the byte codes which appeared in this
1741
    table column. During the Huffman algorithm they are successively
1742
    replaced by references to HUFF_ELEMENTs. This works, because
1743
    HUFF_ELEMENTs have the incidence count at their beginning.
1744
    Regardless, wether the queue array contains references to counts of
1745
    type my_off_t or references to HUFF_ELEMENTs which have the count of
1746
    type my_off_t at their beginning, it always points to a count of the
1747
    same type.
1748
1749
    Instead of using queue_insert(), we just copy the references into
1750
    the buffer of the priority queue. We insert in byte value order, but
1751
    the order is in fact irrelevant here. We will establish the correct
1752
    order later.
1753
  */
1754
  first=last=0;
1755
  for (i=found=0 ; i < 256 ; i++)
1756
  {
1757
    if (huff_counts->counts[i])
1758
    {
1759
      if (! found++)
1760
	first=i;
1761
      last=i;
1762
      /* We start with root[1], which is the queues top element. */
1763
      queue.root[found]=(uchar*) &huff_counts->counts[i];
1764
    }
1765
  }
1766
  if (!found)
1767
    DBUG_RETURN(0);			/* Empty tree */
1768
  /*
1769
    If there is only a single byte value in this field in all records,
1770
    add a second element with zero incidence. This is required to enter
1771
    the loop, which follows the Huffman algorithm.
1772
  */
1773
  if (found < 2)
1774
    queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1775
1776
  /* Make a queue from the queue buffer. */
1777
  queue.elements=found;
1778
1779
  bytes_packed=0; bits_packed=0;
1780
  /* Add the length of the coding table, which would become part of the file. */
1781
  if (add_tree_lenght)
1782
    bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1783
		  (max_bit(found-1)+1+1)*(found-2) +7)/8;
1784
1785
  /*
1786
    Make a priority queue from the queue. Construct its index so that we
1787
    have a partially ordered tree.
1788
  */
1789
  for (i=(found+1)/2 ; i > 0 ; i--)
1790
    _downheap(&queue,i);
1791
1792
  /* The Huffman algorithm. */
1793
  for (i=0 ; i < found-1 ; i++)
1794
  {
1795
    my_off_t        *a;
1796
    my_off_t        *b;
1797
    HUFF_ELEMENT    *new_huff_el;
1798
1799
    /*
1800
      Pop the top element from the queue (the one with the least
1801
      incidence). Popping from a priority queue includes a re-ordering
1802
      of the queue, to get the next least incidence element to the top.
1803
    */
1804
    a= (my_off_t*) queue_remove(&queue, 0);
1805
    /*
1806
      Copy the next least incidence element. The queue implementation
1807
      reserves root[0] for temporary purposes. root[1] is the top.
1808
    */
1809
    b= (my_off_t*) queue.root[1];
1810
    /* Create a new element in a local (automatic) buffer. */
1811
    new_huff_el= element_buffer + i;
1812
    /* The new element gets the sum of the two least incidence elements. */
1813
    new_huff_el->count= *a + *b;
1814
    /*
1815
      The Huffman algorithm assigns another bit to the code for a byte
1816
      every time that bytes incidence is combined (directly or indirectly)
1817
      to a new element as one of the two least incidence elements.
1818
      This means that one more bit per incidence of that byte is required
1819
      in the resulting file. So we add the new combined incidence as the
1820
      number of bits by which the result grows.
1821
    */
1822
    bits_packed+=(uint) (new_huff_el->count & 7);
1823
    bytes_packed+=new_huff_el->count/8;
1824
    /*
1825
      Replace the copied top element by the new element and re-order the
1826
      queue. This successively replaces the references to counts by
1827
      references to HUFF_ELEMENTs.
1828
    */
1829
    queue.root[1]=(uchar*) new_huff_el;
1830
    queue_replaced(&queue);
1831
  }
1832
  DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1833
}
1834
1835
1836
	/* Remove trees that don't give any compression */
1837
1838
static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1839
{
1840
  uint k,tree_number;
1841
  HUFF_COUNTS count,*i,*j,*last_count;
1842
1843
  last_count=huff_counts+trees;
1844
  for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1845
  {
1846
    if (!i->tree->tree_number)
1847
    {
1848
      i->tree->tree_number= ++tree_number;
1849
      if (i->tree_buff)
1850
	continue;			/* Don't join intervall */
1851
      for (j=i+1 ; j < last_count ; j++)
1852
      {
1853
	if (! j->tree->tree_number && ! j->tree_buff)
1854
	{
1855
	  for (k=0 ; k < 256 ; k++)
1856
	    count.counts[k]=i->counts[k]+j->counts[k];
1857
	  if (calc_packed_length(&count,1) <=
1858
	      i->tree->bytes_packed + j->tree->bytes_packed+
1859
	      i->tree->tree_pack_length+j->tree->tree_pack_length+
1860
	      ALLOWED_JOIN_DIFF)
1861
	  {
1862
	    memcpy_fixed((uchar*) i->counts,(uchar*) count.counts,
1863
			 sizeof(count.counts[0])*256);
1864
	    my_free((uchar*) j->tree->element_buffer,MYF(0));
1865
	    j->tree->element_buffer=0;
1866
	    j->tree=i->tree;
1867
	    bmove((uchar*) i->counts,(uchar*) count.counts,
1868
		  sizeof(count.counts[0])*256);
1869
	    if (make_huff_tree(i->tree,i))
1870
	      return (uint) -1;
1871
	  }
1872
	}
1873
      }
1874
    }
1875
  }
1876
  DBUG_PRINT("info", ("Original trees:  %d  After join: %d",
1877
                      trees, tree_number));
1878
  if (verbose)
1879
    VOID(printf("Original trees:  %d  After join: %d\n", trees, tree_number));
1880
  return tree_number;			/* Return trees left */
1881
}
1882
1883
1884
/*
1885
  Fill in huff_tree encode tables.
1886
1887
  SYNOPSIS
1888
    make_huff_decode_table()
1889
    huff_tree               An array of HUFF_TREE which are to be encoded.
1890
    trees                   The number of HUFF_TREE in the array.
1891
1892
  RETURN
1893
    0           success
1894
    != 0        error
1895
*/
1896
1897
static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1898
{
1899
  uint elements;
1900
  for ( ; trees-- ; huff_tree++)
1901
  {
1902
    if (huff_tree->tree_number > 0)
1903
    {
1904
      elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1905
      if (!(huff_tree->code =
1906
            (ulonglong*) my_malloc(elements*
1907
                                   (sizeof(ulonglong) + sizeof(uchar)),
1908
                                   MYF(MY_WME | MY_ZEROFILL))))
1909
	return 1;
1910
      huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1911
      make_traverse_code_tree(huff_tree, huff_tree->root,
1912
                              8 * sizeof(ulonglong), LL(0));
1913
    }
1914
  }
1915
  return 0;
1916
}
1917
1918
1919
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1920
				    HUFF_ELEMENT *element,
1921
				    uint size, ulonglong code)
1922
{
1923
  uint chr;
1924
  if (!element->a.leaf.null)
1925
  {
1926
    chr=element->a.leaf.element_nr;
1927
    huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
1928
    huff_tree->code[chr]= (code >> size);
1929
    if (huff_tree->height < 8 * sizeof(ulonglong) - size)
1930
        huff_tree->height= 8 * sizeof(ulonglong) - size;
1931
  }
1932
  else
1933
  {
1934
    size--;
1935
    make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1936
    make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1937
			    code + (((ulonglong) 1) << size));
1938
  }
1939
  return;
1940
}
1941
1942
1943
/*
1944
  Convert a value into binary digits.
1945
1946
  SYNOPSIS
1947
    bindigits()
1948
    value                       The value.
1949
    length                      The number of low order bits to convert.
1950
1951
  NOTE
1952
    The result string is in static storage. It is reused on every call.
1953
    So you cannot use it twice in one expression.
1954
1955
  RETURN
1956
    A pointer to a static NUL-terminated string.
1957
 */
1958
1959
static char *bindigits(ulonglong value, uint bits)
1960
{
1961
  static char digits[72];
1962
  char *ptr= digits;
1963
  uint idx= bits;
1964
1965
  DBUG_ASSERT(idx < sizeof(digits));
1966
  while (idx)
1967
    *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
1968
  *ptr= '\0';
1969
  return digits;
1970
}
1971
1972
1973
/*
1974
  Convert a value into hexadecimal digits.
1975
1976
  SYNOPSIS
1977
    hexdigits()
1978
    value                       The value.
1979
1980
  NOTE
1981
    The result string is in static storage. It is reused on every call.
1982
    So you cannot use it twice in one expression.
1983
1984
  RETURN
1985
    A pointer to a static NUL-terminated string.
1986
 */
1987
1988
static char *hexdigits(ulonglong value)
1989
{
1990
  static char digits[20];
1991
  char *ptr= digits;
1992
  uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
1993
1994
  DBUG_ASSERT(idx < sizeof(digits));
1995
  while (idx)
1996
  {
1997
    if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
1998
      *(ptr - 1)+= 'a' - '9' - 1;
1999
  }
2000
  *ptr= '\0';
2001
  return digits;
2002
}
2003
2004
2005
	/* Write header to new packed data file */
2006
2007
static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2008
			my_off_t tot_elements,my_off_t filelength)
2009
{
2010
  uchar *buff= (uchar*) file_buffer.pos;
2011
2012
  bzero(buff,HEAD_LENGTH);
2013
  memcpy_fixed(buff,myisam_pack_file_magic,4);
2014
  int4store(buff+4,head_length);
2015
  int4store(buff+8, mrg->min_pack_length);
2016
  int4store(buff+12,mrg->max_pack_length);
2017
  int4store(buff+16,tot_elements);
2018
  int4store(buff+20,intervall_length);
2019
  int2store(buff+24,trees);
2020
  buff[26]=(char) mrg->ref_length;
2021
	/* Save record pointer length */
2022
  buff[27]= (uchar) mi_get_pointer_length((ulonglong) filelength,2);
2023
  if (test_only)
2024
    return 0;
2025
  VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
2026
  return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2027
		  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2028
}
2029
2030
	/* Write fieldinfo to new packed file */
2031
2032
static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2033
{
2034
  register uint i;
2035
  uint huff_tree_bits;
2036
  huff_tree_bits=max_bit(trees ? trees-1 : 0);
2037
2038
  DBUG_PRINT("info", (" "));
2039
  DBUG_PRINT("info", ("column types:"));
2040
  DBUG_PRINT("info", ("FIELD_NORMAL          0"));
2041
  DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE   1"));
2042
  DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE   2"));
2043
  DBUG_PRINT("info", ("FIELD_SKIP_ZERO       3"));
2044
  DBUG_PRINT("info", ("FIELD_BLOB            4"));
2045
  DBUG_PRINT("info", ("FIELD_CONSTANT        5"));
2046
  DBUG_PRINT("info", ("FIELD_INTERVALL       6"));
2047
  DBUG_PRINT("info", ("FIELD_ZERO            7"));
2048
  DBUG_PRINT("info", ("FIELD_VARCHAR         8"));
2049
  DBUG_PRINT("info", ("FIELD_CHECK           9"));
2050
  DBUG_PRINT("info", (" "));
2051
  DBUG_PRINT("info", ("pack type as a set of flags:"));
2052
  DBUG_PRINT("info", ("PACK_TYPE_SELECTED      1"));
2053
  DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS  2"));
2054
  DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL     4"));
2055
  DBUG_PRINT("info", (" "));
2056
  if (verbose >= 2)
2057
  {
2058
    VOID(printf("\n"));
2059
    VOID(printf("column types:\n"));
2060
    VOID(printf("FIELD_NORMAL          0\n"));
2061
    VOID(printf("FIELD_SKIP_ENDSPACE   1\n"));
2062
    VOID(printf("FIELD_SKIP_PRESPACE   2\n"));
2063
    VOID(printf("FIELD_SKIP_ZERO       3\n"));
2064
    VOID(printf("FIELD_BLOB            4\n"));
2065
    VOID(printf("FIELD_CONSTANT        5\n"));
2066
    VOID(printf("FIELD_INTERVALL       6\n"));
2067
    VOID(printf("FIELD_ZERO            7\n"));
2068
    VOID(printf("FIELD_VARCHAR         8\n"));
2069
    VOID(printf("FIELD_CHECK           9\n"));
2070
    VOID(printf("\n"));
2071
    VOID(printf("pack type as a set of flags:\n"));
2072
    VOID(printf("PACK_TYPE_SELECTED      1\n"));
2073
    VOID(printf("PACK_TYPE_SPACE_FIELDS  2\n"));
2074
    VOID(printf("PACK_TYPE_ZERO_FILL     4\n"));
2075
    VOID(printf("\n"));
2076
  }
2077
  for (i=0 ; i++ < fields ; counts++)
2078
  {
2079
    write_bits((ulonglong) (int) counts->field_type, 5);
2080
    write_bits(counts->pack_type,6);
2081
    if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2082
      write_bits(counts->max_zero_fill,5);
2083
    else
2084
      write_bits(counts->length_bits,5);
2085
    write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2086
    DBUG_PRINT("info", ("column: %3u  type: %2u  pack: %2u  zero: %4u  "
2087
                        "lbits: %2u  tree: %2u  length: %4u",
2088
                        i , counts->field_type, counts->pack_type,
2089
                        counts->max_zero_fill, counts->length_bits,
2090
                        counts->tree->tree_number, counts->field_length));
2091
    if (verbose >= 2)
2092
      VOID(printf("column: %3u  type: %2u  pack: %2u  zero: %4u  lbits: %2u  "
2093
                  "tree: %2u  length: %4u\n", i , counts->field_type,
2094
                  counts->pack_type, counts->max_zero_fill, counts->length_bits,
2095
                  counts->tree->tree_number, counts->field_length));
2096
  }
2097
  flush_bits();
2098
  return;
2099
}
2100
2101
	/* Write all huff_trees to new datafile. Return tot count of
2102
	   elements in all trees
2103
	   Returns 0 on error */
2104
2105
static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2106
{
2107
  uint i,int_length;
2108
  uint tree_no;
2109
  uint codes;
2110
  uint errors= 0;
2111
  uint *packed_tree,*offset,length;
2112
  my_off_t elements;
2113
2114
  /* Find the highest number of elements in the trees. */
2115
  for (i=length=0 ; i < trees ; i++)
2116
    if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2117
      length=huff_tree[i].elements;
2118
  /*
2119
    Allocate a buffer for packing a decode tree. Two numbers per element
2120
    (left child and right child).
2121
  */
2122
  if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2123
  {
2124
    my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2125
    return 0;
2126
  }
2127
2128
  DBUG_PRINT("info", (" "));
2129
  if (verbose >= 2)
2130
    VOID(printf("\n"));
2131
  tree_no= 0;
2132
  intervall_length=0;
2133
  for (elements=0; trees-- ; huff_tree++)
2134
  {
2135
    /* Skip columns that have been joined with other columns. */
2136
    if (huff_tree->tree_number == 0)
2137
      continue;				/* Deleted tree */
2138
    tree_no++;
2139
    DBUG_PRINT("info", (" "));
2140
    if (verbose >= 3)
2141
      VOID(printf("\n"));
2142
    /* Count the total number of elements (byte codes or column values). */
2143
    elements+=huff_tree->elements;
2144
    huff_tree->max_offset=2;
2145
    /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2146
    if (huff_tree->elements <= 1)
2147
      offset=packed_tree;
2148
    else
2149
      offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2150
2151
    /* This should be the same as 'length' above. */
2152
    huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2153
2154
    /*
2155
      Since we check this during collecting the distinct column values,
2156
      this should never happen.
2157
    */
2158
    if (huff_tree->max_offset >= IS_OFFSET)
2159
    {				/* This should be impossible */
2160
      VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2161
                   huff_tree->max_offset));
2162
      my_afree((uchar*) packed_tree);
2163
      return 0;
2164
    }
2165
2166
    DBUG_PRINT("info", ("pos: %lu  elements: %u  tree-elements: %lu  "
2167
                        "char_bits: %u\n",
2168
                        (ulong) (file_buffer.pos - file_buffer.buffer),
2169
                        huff_tree->elements, (ulong) (offset - packed_tree),
2170
                        huff_tree->char_bits));
2171
    if (!huff_tree->counts->tree_buff)
2172
    {
2173
      /* We do a byte compression on this column. Mark with bit 0. */
2174
      write_bits(0,1);
2175
      write_bits(huff_tree->min_chr,8);
2176
      write_bits(huff_tree->elements,9);
2177
      write_bits(huff_tree->char_bits,5);
2178
      write_bits(huff_tree->offset_bits,5);
2179
      int_length=0;
2180
    }
2181
    else
2182
    {
2183
      int_length=(uint) (huff_tree->counts->tree_pos -
2184
			 huff_tree->counts->tree_buff);
2185
      /* We have distinct column values for this column. Mark with bit 1. */
2186
      write_bits(1,1);
2187
      write_bits(huff_tree->elements,15);
2188
      write_bits(int_length,16);
2189
      write_bits(huff_tree->char_bits,5);
2190
      write_bits(huff_tree->offset_bits,5);
2191
      intervall_length+=int_length;
2192
    }
2193
    DBUG_PRINT("info", ("tree: %2u  elements: %4u  char_bits: %2u  "
2194
                        "offset_bits: %2u  %s: %5u  codelen: %2u",
2195
                        tree_no, huff_tree->elements, huff_tree->char_bits,
2196
                        huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2197
                        "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2198
                        int_length : huff_tree->min_chr, huff_tree->height));
2199
    if (verbose >= 2)
2200
      VOID(printf("tree: %2u  elements: %4u  char_bits: %2u  offset_bits: %2u  "
2201
                  "%s: %5u  codelen: %2u\n", tree_no, huff_tree->elements,
2202
                  huff_tree->char_bits, huff_tree->offset_bits,
2203
                  huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2204
                  huff_tree->counts->tree_buff ? int_length :
2205
                  huff_tree->min_chr, huff_tree->height));
2206
2207
    /* Check that the code tree length matches the element count. */
2208
    length=(uint) (offset-packed_tree);
2209
    if (length != huff_tree->elements*2-2)
2210
    {
2211
      VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2212
                   length, huff_tree->elements * 2 - 2));
2213
      errors++;
2214
      break;
2215
    }
2216
2217
    for (i=0 ; i < length ; i++)
2218
    {
2219
      if (packed_tree[i] & IS_OFFSET)
2220
	write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2221
		   huff_tree->offset_bits+1);
2222
      else
2223
	write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2224
      DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2225
                          i, (packed_tree[i] & IS_OFFSET) ?
2226
                          " -> " : "", (packed_tree[i] & IS_OFFSET) ?
2227
                          packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2228
      if (verbose >= 3)
2229
        VOID(printf("tree[0x%04x]: %s0x%04x\n",
2230
                    i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2231
                    (packed_tree[i] & IS_OFFSET) ?
2232
                    packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2233
    }
2234
    flush_bits();
2235
2236
    /*
2237
      Display coding tables and check their correctness.
2238
    */
2239
    codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2240
    for (i= 0; i < codes; i++)
2241
    {
2242
      ulonglong code;
2243
      uint bits;
2244
      uint len;
2245
      uint idx;
2246
2247
      if (! (len= huff_tree->code_len[i]))
2248
        continue;
2249
      DBUG_PRINT("info", ("code[0x%04x]:      0x%s  bits: %2u  bin: %s", i,
2250
                          hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2251
                          bindigits(huff_tree->code[i],
2252
                                    huff_tree->code_len[i])));
2253
      if (verbose >= 3)
2254
        VOID(printf("code[0x%04x]:      0x%s  bits: %2u  bin: %s\n", i,
2255
                    hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2256
                    bindigits(huff_tree->code[i], huff_tree->code_len[i])));
2257
2258
      /* Check that the encode table decodes correctly. */
2259
      code= 0;
2260
      bits= 0;
2261
      idx= 0;
2262
      DBUG_EXECUTE_IF("forcechkerr1", len--;);
2263
      DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2264
      DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2265
      for (;;)
2266
      {
2267
        if (! len)
2268
        {
2269
          VOID(fflush(stdout));
2270
          VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2271
                       hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
2272
          errors++;
2273
          break;
2274
        }
2275
        code<<= 1;
2276
        code|= (huff_tree->code[i] >> (--len)) & 1;
2277
        bits++;
2278
        if (bits > 8 * sizeof(code))
2279
        {
2280
          VOID(fflush(stdout));
2281
          VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2282
                       bits, (uint) (8 * sizeof(code))));
2283
          errors++;
2284
          break;
2285
        }
2286
        idx+= (uint) code & 1;
2287
        if (idx >= length)
2288
        {
2289
          VOID(fflush(stdout));
2290
          VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2291
                       idx, length));
2292
          errors++;
2293
          break;
2294
        }
2295
        if (packed_tree[idx] & IS_OFFSET)
2296
          idx+= packed_tree[idx] & ~IS_OFFSET;
2297
        else
2298
          break; /* Hit a leaf. This contains the result value. */
2299
      }
2300
      if (errors)
2301
        break;
2302
2303
      DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2304
      if (packed_tree[idx] != i)
2305
      {
2306
        VOID(fflush(stdout));
2307
        VOID(fprintf(stderr, "error: decoded value 0x%04x  should be: 0x%04x\n",
2308
                     packed_tree[idx], i));
2309
        errors++;
2310
        break;
2311
      }
2312
    } /*end for (codes)*/
2313
    if (errors)
2314
      break;
2315
2316
    /* Write column values in case of distinct column value compression. */
2317
    if (huff_tree->counts->tree_buff)
2318
    {
2319
      for (i=0 ; i < int_length ; i++)
2320
      {
2321
 	write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2322
        DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2323
                            i, (uchar) huff_tree->counts->tree_buff[i]));
2324
        if (verbose >= 3)
2325
          VOID(printf("column_values[0x%04x]: 0x%02x\n",
2326
                      i, (uchar) huff_tree->counts->tree_buff[i]));
2327
      }
2328
    }
2329
    flush_bits();
2330
  }
2331
  DBUG_PRINT("info", (" "));
2332
  if (verbose >= 2)
2333
    VOID(printf("\n"));
2334
  my_afree((uchar*) packed_tree);
2335
  if (errors)
2336
  {
2337
    VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
2338
    return 0;
2339
  }
2340
  return elements;
2341
}
2342
2343
2344
static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2345
				   uint *offset)
2346
{
2347
  uint *prev_offset;
2348
2349
  prev_offset= offset;
2350
  /*
2351
    'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2352
    then there is no left child and, hence no right child either. This
2353
    is a property of a binary tree. An element is either a node with two
2354
    childs, or a leaf without childs.
2355
2356
    The current element is always a node with two childs. Go left first.
2357
  */
2358
  if (!element->a.nod.left->a.leaf.null)
2359
  {
2360
    /* Store the byte code or the index of the column value. */
2361
    prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2362
    offset+=2;
2363
  }
2364
  else
2365
  {
2366
    /*
2367
      Recursively traverse the tree to the left. Mark it as an offset to
2368
      another tree node (in contrast to a byte code or column value index).
2369
    */
2370
    prev_offset[0]= IS_OFFSET+2;
2371
    offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2372
  }
2373
2374
  /* Now, check the right child. */
2375
  if (!element->a.nod.right->a.leaf.null)
2376
  {
2377
    /* Store the byte code or the index of the column value. */
2378
    prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2379
    return offset;
2380
  }
2381
  else
2382
  {
2383
    /*
2384
      Recursively traverse the tree to the right. Mark it as an offset to
2385
      another tree node (in contrast to a byte code or column value index).
2386
    */
2387
    uint temp=(uint) (offset-prev_offset-1);
2388
    prev_offset[1]= IS_OFFSET+ temp;
2389
    if (huff_tree->max_offset < temp)
2390
      huff_tree->max_offset = temp;
2391
    return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2392
  }
2393
}
2394
2395
	/* Get number of bits neaded to represent value */
2396
2397
static uint max_bit(register uint value)
2398
{
2399
  register uint power=1;
2400
2401
  while ((value>>=1))
2402
    power++;
2403
  return (power);
2404
}
2405
2406
2407
static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2408
{
2409
  int error;
2410
  uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length,
2411
    intervall,field_length,max_pack_length,pack_blob_length;
2412
  my_off_t record_count;
2413
  char llbuf[32];
2414
  ulong length,pack_length;
2415
  uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2416
  HUFF_COUNTS *count,*end_count;
2417
  HUFF_TREE *tree;
2418
  MI_INFO *isam_file=mrg->file[0];
2419
  uint pack_version= (uint) isam_file->s->pack.version;
2420
  DBUG_ENTER("compress_isam_file");
2421
2422
  /* Allocate a buffer for the records (excluding blobs). */
2423
  if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2424
    return -1;
2425
2426
  end_count=huff_counts+isam_file->s->base.fields;
2427
  min_record_length= (uint) ~0;
2428
  max_record_length=0;
2429
2430
  /*
2431
    Calculate the maximum number of bits required to pack the records.
2432
    Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2433
    The tree height determines the maximum number of bits per value.
2434
    Some fields skip leading or trailing spaces or zeroes. The skipped
2435
    number of bytes is encoded by 'length_bits' bits.
2436
    Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2437
    and varchar get a leading 0 bit.
2438
  */
2439
  for (i=max_calc_length=0 ; i < isam_file->s->base.fields ; i++)
2440
  {
2441
    if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2442
      huff_counts[i].max_zero_fill=0;
2443
    if (huff_counts[i].field_type == FIELD_CONSTANT ||
2444
	huff_counts[i].field_type == FIELD_ZERO ||
2445
	huff_counts[i].field_type == FIELD_CHECK)
2446
      continue;
2447
    if (huff_counts[i].field_type == FIELD_INTERVALL)
2448
      max_calc_length+=huff_counts[i].tree->height;
2449
    else if (huff_counts[i].field_type == FIELD_BLOB ||
2450
	     huff_counts[i].field_type == FIELD_VARCHAR)
2451
      max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2452
    else
2453
      max_calc_length+=
2454
	(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2455
	  huff_counts[i].tree->height+huff_counts[i].length_bits;
2456
  }
2457
  max_calc_length= (max_calc_length + 7) / 8;
2458
  pack_ref_length= calc_pack_length(pack_version, max_calc_length);
2459
  record_count=0;
2460
  /* 'max_blob_length' is the max length of all blobs of a record. */
2461
  pack_blob_length= isam_file->s->base.blobs ?
2462
                    calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2463
  max_pack_length=pack_ref_length+pack_blob_length;
2464
2465
  DBUG_PRINT("fields", ("==="));
2466
  mrg_reset(mrg);
2467
  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2468
  {
2469
    ulong tot_blob_length=0;
2470
    if (! error)
2471
    {
2472
      if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
2473
	break;
2474
      record_pos= (uchar*) file_buffer.pos;
2475
      file_buffer.pos+=max_pack_length;
2476
      for (start_pos=record, count= huff_counts; count < end_count ; count++)
2477
      {
2478
	end_pos=start_pos+(field_length=count->field_length);
2479
	tree=count->tree;
2480
2481
        DBUG_PRINT("fields", ("column: %3lu  type: %2u  pack: %2u  zero: %4u  "
2482
                              "lbits: %2u  tree: %2u  length: %4u",
2483
                              (ulong) (count - huff_counts + 1),
2484
                              count->field_type,
2485
                              count->pack_type, count->max_zero_fill,
2486
                              count->length_bits, count->tree->tree_number,
2487
                              count->field_length));
2488
2489
        /* Check if the column contains spaces only. */
2490
	if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2491
	{
2492
	  for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2493
	  if (pos == end_pos)
2494
	  {
2495
            DBUG_PRINT("fields",
2496
                       ("PACK_TYPE_SPACE_FIELDS spaces only, bits:  1"));
2497
            DBUG_PRINT("fields", ("---"));
2498
	    write_bits(1,1);
2499
	    start_pos=end_pos;
2500
	    continue;
2501
	  }
2502
          DBUG_PRINT("fields",
2503
                     ("PACK_TYPE_SPACE_FIELDS not only spaces, bits:  1"));
2504
	  write_bits(0,1);
2505
	}
2506
	end_pos-=count->max_zero_fill;
2507
	field_length-=count->max_zero_fill;
2508
2509
	switch (count->field_type) {
2510
	case FIELD_SKIP_ZERO:
2511
	  if (!memcmp((uchar*) start_pos,zero_string,field_length))
2512
	  {
2513
            DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits:  1"));
2514
	    write_bits(1,1);
2515
	    start_pos=end_pos;
2516
	    break;
2517
	  }
2518
          DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits:  1"));
2519
	  write_bits(0,1);
2520
	  /* Fall through */
2521
	case FIELD_NORMAL:
2522
          DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2523
                                (ulong) (end_pos - start_pos)));
2524
	  for ( ; start_pos < end_pos ; start_pos++)
2525
          {
2526
            DBUG_PRINT("fields",
2527
                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2528
                        (uchar) *start_pos,
2529
                        hexdigits(tree->code[(uchar) *start_pos]),
2530
                        (uint) tree->code_len[(uchar) *start_pos],
2531
                        bindigits(tree->code[(uchar) *start_pos],
2532
                                  (uint) tree->code_len[(uchar) *start_pos])));
2533
	    write_bits(tree->code[(uchar) *start_pos],
2534
		       (uint) tree->code_len[(uchar) *start_pos]);
2535
          }
2536
	  break;
2537
	case FIELD_SKIP_ENDSPACE:
2538
	  for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2539
	  length= (ulong) (end_pos - pos);
2540
	  if (count->pack_type & PACK_TYPE_SELECTED)
2541
	  {
2542
	    if (length > count->min_space)
2543
	    {
2544
              DBUG_PRINT("fields",
2545
                         ("FIELD_SKIP_ENDSPACE more than min_space, bits:  1"));
2546
              DBUG_PRINT("fields",
2547
                         ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2548
                          length, field_length, count->length_bits));
2549
	      write_bits(1,1);
2550
	      write_bits(length,count->length_bits);
2551
	    }
2552
	    else
2553
	    {
2554
              DBUG_PRINT("fields",
2555
                         ("FIELD_SKIP_ENDSPACE not more than min_space, "
2556
                          "bits:  1"));
2557
	      write_bits(0,1);
2558
	      pos=end_pos;
2559
	    }
2560
	  }
2561
	  else
2562
          {
2563
            DBUG_PRINT("fields",
2564
                       ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2565
                        length, field_length, count->length_bits));
2566
	    write_bits(length,count->length_bits);
2567
          }
2568
          /* Encode all significant bytes. */
2569
          DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2570
                                (ulong) (pos - start_pos)));
2571
	  for ( ; start_pos < pos ; start_pos++)
2572
          {
2573
            DBUG_PRINT("fields",
2574
                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2575
                        (uchar) *start_pos,
2576
                        hexdigits(tree->code[(uchar) *start_pos]),
2577
                        (uint) tree->code_len[(uchar) *start_pos],
2578
                        bindigits(tree->code[(uchar) *start_pos],
2579
                                  (uint) tree->code_len[(uchar) *start_pos])));
2580
	    write_bits(tree->code[(uchar) *start_pos],
2581
		       (uint) tree->code_len[(uchar) *start_pos]);
2582
          }
2583
	  start_pos=end_pos;
2584
	  break;
2585
	case FIELD_SKIP_PRESPACE:
2586
	  for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2587
          length= (ulong) (pos - start_pos);
2588
	  if (count->pack_type & PACK_TYPE_SELECTED)
2589
	  {
2590
	    if (length > count->min_space)
2591
	    {
2592
              DBUG_PRINT("fields",
2593
                         ("FIELD_SKIP_PRESPACE more than min_space, bits:  1"));
2594
              DBUG_PRINT("fields",
2595
                         ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2596
                          length, field_length, count->length_bits));
2597
	      write_bits(1,1);
2598
	      write_bits(length,count->length_bits);
2599
	    }
2600
	    else
2601
	    {
2602
              DBUG_PRINT("fields",
2603
                         ("FIELD_SKIP_PRESPACE not more than min_space, "
2604
                          "bits:  1"));
2605
	      pos=start_pos;
2606
	      write_bits(0,1);
2607
	    }
2608
	  }
2609
	  else
2610
          {
2611
            DBUG_PRINT("fields",
2612
                       ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2613
                        length, field_length, count->length_bits));
2614
	    write_bits(length,count->length_bits);
2615
          }
2616
          /* Encode all significant bytes. */
2617
          DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2618
                                (ulong) (end_pos - start_pos)));
2619
	  for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2620
          {
2621
            DBUG_PRINT("fields",
2622
                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2623
                        (uchar) *start_pos,
2624
                        hexdigits(tree->code[(uchar) *start_pos]),
2625
                        (uint) tree->code_len[(uchar) *start_pos],
2626
                        bindigits(tree->code[(uchar) *start_pos],
2627
                                  (uint) tree->code_len[(uchar) *start_pos])));
2628
	    write_bits(tree->code[(uchar) *start_pos],
2629
		       (uint) tree->code_len[(uchar) *start_pos]);
2630
          }
2631
	  break;
2632
	case FIELD_CONSTANT:
2633
	case FIELD_ZERO:
2634
	case FIELD_CHECK:
2635
          DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2636
	  start_pos=end_pos;
2637
	  break;
2638
	case FIELD_INTERVALL:
2639
	  global_count=count;
2640
	  pos=(uchar*) tree_search(&count->int_tree, start_pos,
2641
				  count->int_tree.custom_arg);
2642
	  intervall=(uint) (pos - count->tree_buff)/field_length;
2643
          DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2644
          DBUG_PRINT("fields", ("index: %4u code: 0x%s  bits: %2u",
2645
                                intervall, hexdigits(tree->code[intervall]),
2646
                                (uint) tree->code_len[intervall]));
2647
	  write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2648
	  start_pos=end_pos;
2649
	  break;
2650
	case FIELD_BLOB:
2651
	{
2652
	  ulong blob_length=_mi_calc_blob_length(field_length-
2653
						 portable_sizeof_char_ptr,
2654
						 start_pos);
2655
          /* Empty blobs are encoded with a single 1 bit. */
2656
	  if (!blob_length)
2657
	  {
2658
            DBUG_PRINT("fields", ("FIELD_BLOB empty, bits:  1"));
2659
            write_bits(1,1);
2660
	  }
2661
	  else
2662
	  {
2663
	    uchar *blob,*blob_end;
2664
            DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits:  1"));
2665
	    write_bits(0,1);
2666
            /* Write the blob length. */
2667
            DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2668
                                  blob_length, count->length_bits));
2669
	    write_bits(blob_length,count->length_bits);
2670
	    memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
2671
			 sizeof(char*));
2672
	    blob_end=blob+blob_length;
2673
            /* Encode the blob bytes. */
2674
	    for ( ; blob < blob_end ; blob++)
2675
            {
2676
              DBUG_PRINT("fields",
2677
                         ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2678
                          (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2679
                          (uint) tree->code_len[(uchar) *blob],
2680
                          bindigits(tree->code[(uchar) *start_pos],
2681
                                    (uint)tree->code_len[(uchar) *start_pos])));
2682
	      write_bits(tree->code[(uchar) *blob],
2683
			 (uint) tree->code_len[(uchar) *blob]);
2684
            }
2685
	    tot_blob_length+=blob_length;
2686
	  }
2687
	  start_pos= end_pos;
2688
	  break;
2689
	}
2690
	case FIELD_VARCHAR:
2691
	{
2692
          uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2693
          ulong col_length= (var_pack_length == 1 ?
2694
                             (uint) *(uchar*) start_pos :
2695
                             uint2korr(start_pos));
2696
          /* Empty varchar are encoded with a single 1 bit. */
2697
	  if (!col_length)
2698
	  {
2699
            DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits:  1"));
2700
	    write_bits(1,1);			/* Empty varchar */
2701
	  }
2702
	  else
2703
	  {
2704
	    uchar *end= start_pos + var_pack_length + col_length;
2705
            DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits:  1"));
2706
	    write_bits(0,1);
2707
            /* Write the varchar length. */
2708
            DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2709
                                  col_length, count->length_bits));
2710
	    write_bits(col_length,count->length_bits);
2711
            /* Encode the varchar bytes. */
2712
	    for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2713
            {
2714
              DBUG_PRINT("fields",
2715
                         ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2716
                          (uchar) *start_pos,
2717
                          hexdigits(tree->code[(uchar) *start_pos]),
2718
                          (uint) tree->code_len[(uchar) *start_pos],
2719
                          bindigits(tree->code[(uchar) *start_pos],
2720
                                    (uint)tree->code_len[(uchar) *start_pos])));
2721
	      write_bits(tree->code[(uchar) *start_pos],
2722
			 (uint) tree->code_len[(uchar) *start_pos]);
2723
            }
2724
	  }
2725
	  start_pos= end_pos;
2726
	  break;
2727
	}
2728
	case FIELD_LAST:
2729
        case FIELD_enum_val_count:
2730
	  abort();				/* Impossible */
2731
	}
2732
	start_pos+=count->max_zero_fill;
2733
        DBUG_PRINT("fields", ("---"));
2734
      }
2735
      flush_bits();
2736
      length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
2737
      pack_length= save_pack_length(pack_version, record_pos, length);
2738
      if (pack_blob_length)
2739
	pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2740
	                               tot_blob_length);
2741
      DBUG_PRINT("fields", ("record: %lu  length: %lu  blob-length: %lu  "
2742
                            "length-bytes: %lu", (ulong) record_count, length,
2743
                            tot_blob_length, pack_length));
2744
      DBUG_PRINT("fields", ("==="));
2745
2746
      /* Correct file buffer if the header was smaller */
2747
      if (pack_length != max_pack_length)
2748
      {
2749
	bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2750
	file_buffer.pos-= (max_pack_length-pack_length);
2751
      }
2752
      if (length < (ulong) min_record_length)
2753
	min_record_length=(uint) length;
2754
      if (length > (ulong) max_record_length)
2755
	max_record_length=(uint) length;
2756
      record_count++;
2757
      if (write_loop && record_count % WRITE_COUNT == 0)
2758
      {
2759
	VOID(printf("%lu\r", (ulong) record_count));
2760
        VOID(fflush(stdout));
2761
      }
2762
    }
2763
    else if (error != HA_ERR_RECORD_DELETED)
2764
      break;
2765
  }
2766
  if (error == HA_ERR_END_OF_FILE)
2767
    error=0;
2768
  else
2769
  {
2770
    VOID(fprintf(stderr, "%s: Got error %d reading records\n",
2771
                 my_progname, error));
2772
  }
2773
  if (verbose >= 2)
2774
    VOID(printf("wrote %s records.\n", llstr((longlong) record_count, llbuf)));
2775
2776
  my_afree((uchar*) record);
2777
  mrg->ref_length=max_pack_length;
2778
  mrg->min_pack_length=max_record_length ? min_record_length : 0;
2779
  mrg->max_pack_length=max_record_length;
2780
  DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2781
}
2782
2783
2784
static char *make_new_name(char *new_name, char *old_name)
2785
{
2786
  return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2787
}
2788
2789
static char *make_old_name(char *new_name, char *old_name)
2790
{
2791
  return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2792
}
2793
2794
	/* rutines for bit writing buffer */
2795
2796
static void init_file_buffer(File file, pbool read_buffer)
2797
{
2798
  file_buffer.file=file;
2799
  file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2800
					 MYF(MY_WME));
2801
  file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2802
  file_buffer.pos_in_file=0;
2803
  error_on_write=0;
2804
  if (read_buffer)
2805
  {
2806
2807
    file_buffer.pos=file_buffer.end;
2808
    file_buffer.bits=0;
2809
  }
2810
  else
2811
  {
2812
    file_buffer.pos=file_buffer.buffer;
2813
    file_buffer.bits=BITS_SAVED;
2814
  }
2815
  file_buffer.bitbucket= 0;
2816
}
2817
2818
2819
static int flush_buffer(ulong neaded_length)
2820
{
2821
  ulong length;
2822
2823
  /*
2824
    file_buffer.end is 8 bytes lower than the real end of the buffer.
2825
    This is done so that the end-of-buffer condition does not need to be
2826
    checked for every byte (see write_bits()). Consequently,
2827
    file_buffer.pos can become greater than file_buffer.end. The
2828
    algorithms in the other functions ensure that there will never be
2829
    more than 8 bytes written to the buffer without an end-of-buffer
2830
    check. So the buffer cannot be overrun. But we need to check for the
2831
    near-to-buffer-end condition to avoid a negative result, which is
2832
    casted to unsigned and thus becomes giant.
2833
  */
2834
  if ((file_buffer.pos < file_buffer.end) &&
2835
      ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2836
    return 0;
2837
  length=(ulong) (file_buffer.pos-file_buffer.buffer);
2838
  file_buffer.pos=file_buffer.buffer;
2839
  file_buffer.pos_in_file+=length;
2840
  if (test_only)
2841
    return 0;
2842
  if (error_on_write|| my_write(file_buffer.file,
2843
				(const uchar*) file_buffer.buffer,
2844
				length,
2845
				MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2846
  {
2847
    error_on_write=1;
2848
    return 1;
2849
  }
2850
2851
  if (neaded_length != ~(ulong) 0 &&
2852
      (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2853
  {
2854
    char *tmp;
2855
    neaded_length+=256;				/* some margin */
2856
    tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
2857
    if (!tmp)
2858
      return 1;
2859
    file_buffer.pos= ((uchar*) tmp +
2860
                      (ulong) (file_buffer.pos - file_buffer.buffer));
2861
    file_buffer.buffer= (uchar*) tmp;
2862
    file_buffer.end= (uchar*) (tmp+neaded_length-8);
2863
  }
2864
  return 0;
2865
}
2866
2867
2868
static void end_file_buffer(void)
2869
{
2870
  my_free((uchar*) file_buffer.buffer,MYF(0));
2871
}
2872
2873
	/* output `bits` low bits of `value' */
2874
2875
static void write_bits(register ulonglong value, register uint bits)
2876
{
2877
  DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2878
              (bits == 8 * sizeof(value)));
2879
2880
  if ((file_buffer.bits-= (int) bits) >= 0)
2881
  {
2882
    file_buffer.bitbucket|= value << file_buffer.bits;
2883
  }
2884
  else
2885
  {
2886
    register ulonglong bit_buffer;
2887
    bits= (uint) -file_buffer.bits;
2888
    bit_buffer= (file_buffer.bitbucket |
2889
                 ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2890
#if BITS_SAVED == 64
2891
    *file_buffer.pos++= (uchar) (bit_buffer >> 56);
2892
    *file_buffer.pos++= (uchar) (bit_buffer >> 48);
2893
    *file_buffer.pos++= (uchar) (bit_buffer >> 40);
2894
    *file_buffer.pos++= (uchar) (bit_buffer >> 32);
2895
#endif
2896
    *file_buffer.pos++= (uchar) (bit_buffer >> 24);
2897
    *file_buffer.pos++= (uchar) (bit_buffer >> 16);
2898
    *file_buffer.pos++= (uchar) (bit_buffer >> 8);
2899
    *file_buffer.pos++= (uchar) (bit_buffer);
2900
2901
    if (bits != 8 * sizeof(value))
2902
      value&= (((ulonglong) 1) << bits) - 1;
2903
    if (file_buffer.pos >= file_buffer.end)
2904
      VOID(flush_buffer(~ (ulong) 0));
2905
    file_buffer.bits=(int) (BITS_SAVED - bits);
2906
    file_buffer.bitbucket= value << (BITS_SAVED - bits);
2907
  }
2908
  return;
2909
}
2910
2911
	/* Flush bits in bit_buffer to buffer */
2912
2913
static void flush_bits(void)
2914
{
2915
  int bits;
2916
  ulonglong bit_buffer;
2917
2918
  bits= file_buffer.bits & ~7;
2919
  bit_buffer= file_buffer.bitbucket >> bits;
2920
  bits= BITS_SAVED - bits;
2921
  while (bits > 0)
2922
  {
2923
    bits-= 8;
2924
    *file_buffer.pos++= (uchar) (bit_buffer >> bits);
2925
  }
2926
  if (file_buffer.pos >= file_buffer.end)
2927
    VOID(flush_buffer(~ (ulong) 0));
2928
  file_buffer.bits= BITS_SAVED;
2929
  file_buffer.bitbucket= 0;
2930
}
2931
2932
2933
/****************************************************************************
2934
** functions to handle the joined files
2935
****************************************************************************/
2936
2937
static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
2938
		      ha_checksum crc)
2939
{
2940
  MYISAM_SHARE *share=isam_file->s;
2941
  uint options=mi_uint2korr(share->state.header.options);
2942
  uint key;
2943
  DBUG_ENTER("save_state");
2944
2945
  options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2946
  mi_int2store(share->state.header.options,options);
2947
2948
  share->state.state.data_file_length=new_length;
2949
  share->state.state.del=0;
2950
  share->state.state.empty=0;
2951
  share->state.dellink= HA_OFFSET_ERROR;
2952
  share->state.split=(ha_rows) mrg->records;
2953
  share->state.version=(ulong) time((time_t*) 0);
2954
  if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
2955
  {
2956
    /*
2957
      Some indexes are disabled, cannot use current key_file_length value
2958
      as an estimate of upper bound of index file size. Use packed data file 
2959
      size instead.
2960
    */
2961
    share->state.state.key_file_length= new_length;
2962
  }
2963
  /*
2964
    If there are no disabled indexes, keep key_file_length value from 
2965
    original file so "myisamchk -rq" can use this value (this is necessary 
2966
    because index size cannot be easily calculated for fulltext keys)
2967
  */
2968
  mi_clear_all_keys_active(share->state.key_map);
2969
  for (key=0 ; key < share->base.keys ; key++)
2970
    share->state.key_root[key]= HA_OFFSET_ERROR;
2971
  for (key=0 ; key < share->state.header.max_block_size_index ; key++)
2972
    share->state.key_del[key]= HA_OFFSET_ERROR;
2973
  isam_file->state->checksum=crc;       /* Save crc here */
2974
  share->changed=1;			/* Force write of header */
2975
  share->state.open_count=0;
2976
  share->global_changed=0;
30 by Brian Aker
Large file and ftruncate() support
2977
  (void)ftruncate(share->kfile, share->base.keystart);
1 by brian
clean slate
2978
  if (share->base.keys)
2979
    isamchk_neaded=1;
2980
  DBUG_RETURN(mi_state_info_write(share->kfile,&share->state,1+2));
2981
}
2982
2983
2984
static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
2985
			  ha_checksum crc)
2986
{
2987
  MI_STATE_INFO state;
2988
  MI_INFO *isam_file=mrg->file[0];
2989
  uint options;
2990
  DBUG_ENTER("save_state_mrg");
2991
2992
  state= isam_file->s->state;
2993
  options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
2994
	    HA_OPTION_READ_ONLY_DATA);
2995
  mi_int2store(state.header.options,options);
2996
  state.state.data_file_length=new_length;
2997
  state.state.del=0;
2998
  state.state.empty=0;
2999
  state.state.records=state.split=(ha_rows) mrg->records;
3000
  /* See comment above in save_state about key_file_length handling. */
3001
  if (mrg->src_file_has_indexes_disabled)
3002
  {
3003
    isam_file->s->state.state.key_file_length=
3004
      max(isam_file->s->state.state.key_file_length, new_length);
3005
  }
3006
  state.dellink= HA_OFFSET_ERROR;
3007
  state.version=(ulong) time((time_t*) 0);
3008
  mi_clear_all_keys_active(state.key_map);
3009
  state.state.checksum=crc;
3010
  if (isam_file->s->base.keys)
3011
    isamchk_neaded=1;
3012
  state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3013
  DBUG_RETURN (mi_state_info_write(file,&state,1+2));
3014
}
3015
3016
3017
/* reset for mrg_rrnd */
3018
3019
static void mrg_reset(PACK_MRG_INFO *mrg)
3020
{
3021
  if (mrg->current)
3022
  {
3023
    mi_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3024
    mrg->current=0;
3025
  }
3026
}
3027
3028
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3029
{
3030
  int error;
3031
  MI_INFO *isam_info;
3032
  my_off_t filepos;
3033
3034
  if (!info->current)
3035
  {
3036
    isam_info= *(info->current=info->file);
3037
    info->end=info->current+info->count;
3038
    mi_reset(isam_info);
3039
    mi_extra(isam_info, HA_EXTRA_CACHE, 0);
3040
    filepos=isam_info->s->pack.header_length;
3041
  }
3042
  else
3043
  {
3044
    isam_info= *info->current;
3045
    filepos= isam_info->nextpos;
3046
  }
3047
3048
  for (;;)
3049
  {
3050
    isam_info->update&= HA_STATE_CHANGED;
3051
    if (!(error=(*isam_info->s->read_rnd)(isam_info,(uchar*) buf,
3052
					  filepos, 1)) ||
3053
	error != HA_ERR_END_OF_FILE)
3054
      return (error);
3055
    mi_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3056
    if (info->current+1 == info->end)
3057
      return(HA_ERR_END_OF_FILE);
3058
    info->current++;
3059
    isam_info= *info->current;
3060
    filepos=isam_info->s->pack.header_length;
3061
    mi_reset(isam_info);
3062
    mi_extra(isam_info,HA_EXTRA_CACHE, 0);
3063
  }
3064
}
3065
3066
3067
static int mrg_close(PACK_MRG_INFO *mrg)
3068
{
3069
  uint i;
3070
  int error=0;
3071
  for (i=0 ; i < mrg->count ; i++)
3072
    error|=mi_close(mrg->file[i]);
3073
  if (mrg->free_file)
3074
    my_free((uchar*) mrg->file,MYF(0));
3075
  return error;
3076
}
3077
3078
3079
#if !defined(DBUG_OFF)
3080
/*
3081
  Fake the counts to get big Huffman codes.
3082
3083
  SYNOPSIS
3084
    fakebigcodes()
3085
    huff_counts                 A pointer to the counts array.
3086
    end_count                   A pointer past the counts array.
3087
3088
  DESCRIPTION
3089
3090
    Huffman coding works by removing the two least frequent values from
3091
    the list of values and add a new value with the sum of their
3092
    incidences in a loop until only one value is left. Every time a
3093
    value is reused for a new value, it gets one more bit for its
3094
    encoding. Hence, the least frequent values get the longest codes.
3095
3096
    To get a maximum code length for a value, two of the values must
3097
    have an incidence of 1. As their sum is 2, the next infrequent value
3098
    must have at least an incidence of 2, then 4, 8, 16 and so on. This
3099
    means that one needs 2**n bytes (values) for a code length of n
3100
    bits. However, using more distinct values forces the use of longer
3101
    codes, or reaching the code length with less total bytes (values).
3102
3103
    To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3104
    I assign counts of 1 to the two most frequent values, a count of 2
3105
    for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3106
    the remaining values get 1. That way every possible byte has an
3107
    assigned code, though not all codes are used if not all byte values
3108
    are present in the column.
3109
3110
    This strategy would work with distinct column values too, but
3111
    requires that at least 64(32) values are present. To make things
3112
    easier here, I cancel all distinct column values and force byte
3113
    compression for all columns.
3114
3115
  RETURN
3116
    void
3117
*/
3118
3119
static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3120
{
3121
  HUFF_COUNTS   *count;
3122
  my_off_t      *cur_count_p;
3123
  my_off_t      *end_count_p;
3124
  my_off_t      **cur_sort_p;
3125
  my_off_t      **end_sort_p;
3126
  my_off_t      *sort_counts[256];
3127
  my_off_t      total;
3128
  DBUG_ENTER("fakebigcodes");
3129
3130
  for (count= huff_counts; count < end_count; count++)
3131
  {
3132
    /*
3133
      Remove distinct column values.
3134
    */
3135
    if (huff_counts->tree_buff)
3136
    {
3137
      my_free((uchar*) huff_counts->tree_buff, MYF(0));
3138
      delete_tree(&huff_counts->int_tree);
3139
      huff_counts->tree_buff= NULL;
3140
      DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3141
    }
3142
3143
    /*
3144
      Sort counts by decreasing incidence.
3145
    */
3146
    cur_count_p= count->counts;
3147
    end_count_p= cur_count_p + 256;
3148
    cur_sort_p= sort_counts;
3149
    while (cur_count_p < end_count_p)
3150
      *(cur_sort_p++)= cur_count_p++;
3151
    (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3152
3153
    /*
3154
      Assign faked counts.
3155
    */
3156
    cur_sort_p= sort_counts;
3157
#if SIZEOF_LONG_LONG > 4
3158
    end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3159
#else
3160
    end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3161
#endif
3162
    /* Most frequent value gets a faked count of 1. */
3163
    **(cur_sort_p++)= 1;
3164
    total= 1;
3165
    while (cur_sort_p < end_sort_p)
3166
    {
3167
      **(cur_sort_p++)= total;
3168
      total<<= 1;
3169
    }
3170
    /* Set the last value. */
3171
    **(cur_sort_p++)= --total;
3172
    /*
3173
      Set the remaining counts.
3174
    */
3175
    end_sort_p= sort_counts + 256;
3176
    while (cur_sort_p < end_sort_p)
3177
      **(cur_sort_p++)= 1;
3178
  }
3179
  DBUG_VOID_RETURN;
3180
}
3181
3182
3183
/*
3184
  Compare two counts for reverse sorting.
3185
3186
  SYNOPSIS
3187
    fakecmp()
3188
    count1              One count.
3189
    count2              Another count.
3190
3191
  RETURN
3192
    1                   count1  < count2
3193
    0                   count1 == count2
3194
    -1                  count1 >  count2
3195
*/
3196
3197
static int fakecmp(my_off_t **count1, my_off_t **count2)
3198
{
3199
  return ((**count1 < **count2) ? 1 :
3200
          (**count1 > **count2) ? -1 : 0);
3201
}
3202
#endif
3203
3204
#include "mi_extrafunc.h"