170
170
static int mrg_close(PACK_MRG_INFO *mrg);
171
171
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
172
172
static void mrg_reset(PACK_MRG_INFO *mrg);
173
#if !defined(DBUG_OFF)
174
static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
175
static int fakecmp(my_off_t **count1, my_off_t **count2);
179
174
static int error_on_write=0,test_only=0,verbose=0,silent=0,
180
175
write_loop=0,force_pack=0, isamchk_neaded=0;
657
642
end_file_buffer();
659
644
/* Display statistics. */
660
DBUG_PRINT("info", ("Min record length: %6d Max length: %6d "
661
"Mean total length: %6ld\n",
662
mrg->min_pack_length, mrg->max_pack_length,
663
(ulong) (mrg->records ? (new_length/mrg->records) : 0)));
664
645
if (verbose && mrg->records)
665
646
VOID(printf("Min record length: %6d Max length: %6d "
666
647
"Mean total length: %6ld\n", mrg->min_pack_length,
1119
1087
if (count->counts[idx])
1121
1089
total_count+= count->counts[idx];
1122
DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1123
llstr((int64_t) count->counts[idx], llbuf)));
1124
1090
if (verbose >= 2)
1125
1091
VOID(printf("counts[0x%02x]: %12s\n", idx,
1126
1092
llstr((int64_t) count->counts[idx], llbuf)));
1129
DBUG_PRINT("info", ("total: %12s", llstr((int64_t) total_count,
1131
1095
if ((verbose >= 2) && total_count)
1133
1097
VOID(printf("total: %12s\n",
1337
1298
fill_zero_fields++;
1338
1299
field_count[huff_counts->field_type]++;
1340
DBUG_PRINT("info", ("normal: %3d empty-space: %3d "
1341
"empty-zero: %3d empty-fill: %3d",
1342
field_count[FIELD_NORMAL],space_fields,
1343
field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1344
DBUG_PRINT("info", ("pre-space: %3d end-space: %3d "
1345
"intervall-fields: %3d zero: %3d",
1346
field_count[FIELD_SKIP_PRESPACE],
1347
field_count[FIELD_SKIP_ENDSPACE],
1348
field_count[FIELD_INTERVALL],
1349
field_count[FIELD_ZERO]));
1351
1302
VOID(printf("\nnormal: %3d empty-space: %3d "
1352
1303
"empty-zero: %3d empty-fill: %3d\n"
2031
1978
uint huff_tree_bits;
2032
1979
huff_tree_bits=max_bit(trees ? trees-1 : 0);
2034
DBUG_PRINT("info", (" "));
2035
DBUG_PRINT("info", ("column types:"));
2036
DBUG_PRINT("info", ("FIELD_NORMAL 0"));
2037
DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1"));
2038
DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2"));
2039
DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3"));
2040
DBUG_PRINT("info", ("FIELD_BLOB 4"));
2041
DBUG_PRINT("info", ("FIELD_CONSTANT 5"));
2042
DBUG_PRINT("info", ("FIELD_INTERVALL 6"));
2043
DBUG_PRINT("info", ("FIELD_ZERO 7"));
2044
DBUG_PRINT("info", ("FIELD_VARCHAR 8"));
2045
DBUG_PRINT("info", ("FIELD_CHECK 9"));
2046
DBUG_PRINT("info", (" "));
2047
DBUG_PRINT("info", ("pack type as a set of flags:"));
2048
DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1"));
2049
DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2"));
2050
DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4"));
2051
DBUG_PRINT("info", (" "));
2052
1981
if (verbose >= 2)
2054
1983
VOID(printf("\n"));
2080
2009
write_bits(counts->length_bits,5);
2081
2010
write_bits((uint64_t) counts->tree->tree_number - 1, huff_tree_bits);
2082
DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u "
2083
"lbits: %2u tree: %2u length: %4u",
2084
i , counts->field_type, counts->pack_type,
2085
counts->max_zero_fill, counts->length_bits,
2086
counts->tree->tree_number, counts->field_length));
2087
2011
if (verbose >= 2)
2088
2012
VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
2089
2013
"tree: %2u length: %4u\n", i , counts->field_type,
2186
2103
write_bits(huff_tree->offset_bits,5);
2187
2104
intervall_length+=int_length;
2189
DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u "
2190
"offset_bits: %2u %s: %5u codelen: %2u",
2191
tree_no, huff_tree->elements, huff_tree->char_bits,
2192
huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2193
"bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2194
int_length : huff_tree->min_chr, huff_tree->height));
2195
2106
if (verbose >= 2)
2196
2107
VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
2197
2108
"%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
2217
2128
huff_tree->offset_bits+1);
2219
2130
write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2220
DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2221
i, (packed_tree[i] & IS_OFFSET) ?
2222
" -> " : "", (packed_tree[i] & IS_OFFSET) ?
2223
packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2224
2131
if (verbose >= 3)
2225
2132
VOID(printf("tree[0x%04x]: %s0x%04x\n",
2226
2133
i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2243
2150
if (! (len= huff_tree->code_len[i]))
2245
DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i,
2246
hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2247
bindigits(huff_tree->code[i],
2248
huff_tree->code_len[i])));
2249
2152
if (verbose >= 3)
2250
2153
VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
2251
2154
hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2474
2368
end_pos=start_pos+(field_length=count->field_length);
2475
2369
tree=count->tree;
2477
DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u "
2478
"lbits: %2u tree: %2u length: %4u",
2479
(ulong) (count - huff_counts + 1),
2481
count->pack_type, count->max_zero_fill,
2482
count->length_bits, count->tree->tree_number,
2483
count->field_length));
2485
2371
/* Check if the column contains spaces only. */
2486
2372
if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2488
2374
for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2489
2375
if (pos == end_pos)
2491
DBUG_PRINT("fields",
2492
("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1"));
2493
DBUG_PRINT("fields", ("---"));
2494
2377
write_bits(1,1);
2495
2378
start_pos=end_pos;
2498
DBUG_PRINT("fields",
2499
("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1"));
2500
2381
write_bits(0,1);
2502
2383
end_pos-=count->max_zero_fill;
2506
2387
case FIELD_SKIP_ZERO:
2507
2388
if (!memcmp((uchar*) start_pos,zero_string,field_length))
2509
DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1"));
2510
2390
write_bits(1,1);
2511
2391
start_pos=end_pos;
2514
DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1"));
2515
2394
write_bits(0,1);
2516
2395
/* Fall through */
2517
2396
case FIELD_NORMAL:
2518
DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2519
(ulong) (end_pos - start_pos)));
2520
2397
for ( ; start_pos < end_pos ; start_pos++)
2522
DBUG_PRINT("fields",
2523
("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2525
hexdigits(tree->code[(uchar) *start_pos]),
2526
(uint) tree->code_len[(uchar) *start_pos],
2527
bindigits(tree->code[(uchar) *start_pos],
2528
(uint) tree->code_len[(uchar) *start_pos])));
2529
2399
write_bits(tree->code[(uchar) *start_pos],
2530
2400
(uint) tree->code_len[(uchar) *start_pos]);
2538
2408
if (length > count->min_space)
2540
DBUG_PRINT("fields",
2541
("FIELD_SKIP_ENDSPACE more than min_space, bits: 1"));
2542
DBUG_PRINT("fields",
2543
("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2544
length, field_length, count->length_bits));
2545
2410
write_bits(1,1);
2546
2411
write_bits(length,count->length_bits);
2550
DBUG_PRINT("fields",
2551
("FIELD_SKIP_ENDSPACE not more than min_space, "
2553
2415
write_bits(0,1);
2559
DBUG_PRINT("fields",
2560
("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2561
length, field_length, count->length_bits));
2562
2421
write_bits(length,count->length_bits);
2564
2423
/* Encode all significant bytes. */
2565
DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2566
(ulong) (pos - start_pos)));
2567
2424
for ( ; start_pos < pos ; start_pos++)
2569
DBUG_PRINT("fields",
2570
("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2572
hexdigits(tree->code[(uchar) *start_pos]),
2573
(uint) tree->code_len[(uchar) *start_pos],
2574
bindigits(tree->code[(uchar) *start_pos],
2575
(uint) tree->code_len[(uchar) *start_pos])));
2576
2426
write_bits(tree->code[(uchar) *start_pos],
2577
2427
(uint) tree->code_len[(uchar) *start_pos]);
2586
2436
if (length > count->min_space)
2588
DBUG_PRINT("fields",
2589
("FIELD_SKIP_PRESPACE more than min_space, bits: 1"));
2590
DBUG_PRINT("fields",
2591
("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2592
length, field_length, count->length_bits));
2593
2438
write_bits(1,1);
2594
2439
write_bits(length,count->length_bits);
2598
DBUG_PRINT("fields",
2599
("FIELD_SKIP_PRESPACE not more than min_space, "
2602
2444
write_bits(0,1);
2607
DBUG_PRINT("fields",
2608
("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2609
length, field_length, count->length_bits));
2610
2449
write_bits(length,count->length_bits);
2612
2451
/* Encode all significant bytes. */
2613
DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2614
(ulong) (end_pos - start_pos)));
2615
2452
for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2617
DBUG_PRINT("fields",
2618
("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2620
hexdigits(tree->code[(uchar) *start_pos]),
2621
(uint) tree->code_len[(uchar) *start_pos],
2622
bindigits(tree->code[(uchar) *start_pos],
2623
(uint) tree->code_len[(uchar) *start_pos])));
2624
2454
write_bits(tree->code[(uchar) *start_pos],
2625
2455
(uint) tree->code_len[(uchar) *start_pos]);
2636
2465
pos=(uchar*) tree_search(&count->int_tree, start_pos,
2637
2466
count->int_tree.custom_arg);
2638
2467
intervall=(uint) (pos - count->tree_buff)/field_length;
2639
DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2640
DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u",
2641
intervall, hexdigits(tree->code[intervall]),
2642
(uint) tree->code_len[intervall]));
2643
2468
write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2644
2469
start_pos=end_pos;
2651
2476
/* Empty blobs are encoded with a single 1 bit. */
2652
2477
if (!blob_length)
2654
DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1"));
2655
2479
write_bits(1,1);
2659
2483
uchar *blob,*blob_end;
2660
DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1"));
2661
2484
write_bits(0,1);
2662
2485
/* Write the blob length. */
2663
DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2664
blob_length, count->length_bits));
2665
2486
write_bits(blob_length,count->length_bits);
2666
2487
memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
2667
2488
sizeof(char*));
2669
2490
/* Encode the blob bytes. */
2670
2491
for ( ; blob < blob_end ; blob++)
2672
DBUG_PRINT("fields",
2673
("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2674
(uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2675
(uint) tree->code_len[(uchar) *blob],
2676
bindigits(tree->code[(uchar) *start_pos],
2677
(uint)tree->code_len[(uchar) *start_pos])));
2678
2493
write_bits(tree->code[(uchar) *blob],
2679
2494
(uint) tree->code_len[(uchar) *blob]);
2692
2507
/* Empty varchar are encoded with a single 1 bit. */
2693
2508
if (!col_length)
2695
DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1"));
2696
2510
write_bits(1,1); /* Empty varchar */
2700
2514
uchar *end= start_pos + var_pack_length + col_length;
2701
DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1"));
2702
2515
write_bits(0,1);
2703
2516
/* Write the varchar length. */
2704
DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2705
col_length, count->length_bits));
2706
2517
write_bits(col_length,count->length_bits);
2707
2518
/* Encode the varchar bytes. */
2708
2519
for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2710
DBUG_PRINT("fields",
2711
("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2713
hexdigits(tree->code[(uchar) *start_pos]),
2714
(uint) tree->code_len[(uchar) *start_pos],
2715
bindigits(tree->code[(uchar) *start_pos],
2716
(uint)tree->code_len[(uchar) *start_pos])));
2717
2521
write_bits(tree->code[(uchar) *start_pos],
2718
2522
(uint) tree->code_len[(uchar) *start_pos]);
2734
2537
if (pack_blob_length)
2735
2538
pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2736
2539
tot_blob_length);
2737
DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu "
2738
"length-bytes: %lu", (ulong) record_count, length,
2739
tot_blob_length, pack_length));
2740
DBUG_PRINT("fields", ("==="));
2742
2540
/* Correct file buffer if the header was smaller */
2743
2541
if (pack_length != max_pack_length)
3070
2866
my_free((uchar*) mrg->file,MYF(0));
3075
#if !defined(DBUG_OFF)
3077
Fake the counts to get big Huffman codes.
3081
huff_counts A pointer to the counts array.
3082
end_count A pointer past the counts array.
3086
Huffman coding works by removing the two least frequent values from
3087
the list of values and add a new value with the sum of their
3088
incidences in a loop until only one value is left. Every time a
3089
value is reused for a new value, it gets one more bit for its
3090
encoding. Hence, the least frequent values get the longest codes.
3092
To get a maximum code length for a value, two of the values must
3093
have an incidence of 1. As their sum is 2, the next infrequent value
3094
must have at least an incidence of 2, then 4, 8, 16 and so on. This
3095
means that one needs 2**n bytes (values) for a code length of n
3096
bits. However, using more distinct values forces the use of longer
3097
codes, or reaching the code length with less total bytes (values).
3099
To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3100
I assign counts of 1 to the two most frequent values, a count of 2
3101
for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3102
the remaining values get 1. That way every possible byte has an
3103
assigned code, though not all codes are used if not all byte values
3104
are present in the column.
3106
This strategy would work with distinct column values too, but
3107
requires that at least 64(32) values are present. To make things
3108
easier here, I cancel all distinct column values and force byte
3109
compression for all columns.
3115
static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3118
my_off_t *cur_count_p;
3119
my_off_t *end_count_p;
3120
my_off_t **cur_sort_p;
3121
my_off_t **end_sort_p;
3122
my_off_t *sort_counts[256];
3124
DBUG_ENTER("fakebigcodes");
3126
for (count= huff_counts; count < end_count; count++)
3129
Remove distinct column values.
3131
if (huff_counts->tree_buff)
3133
my_free((uchar*) huff_counts->tree_buff, MYF(0));
3134
delete_tree(&huff_counts->int_tree);
3135
huff_counts->tree_buff= NULL;
3136
DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3140
Sort counts by decreasing incidence.
3142
cur_count_p= count->counts;
3143
end_count_p= cur_count_p + 256;
3144
cur_sort_p= sort_counts;
3145
while (cur_count_p < end_count_p)
3146
*(cur_sort_p++)= cur_count_p++;
3147
(void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3150
Assign faked counts.
3152
cur_sort_p= sort_counts;
3153
#if SIZEOF_LONG_LONG > 4
3154
end_sort_p= sort_counts + 8 * sizeof(uint64_t) - 1;
3156
end_sort_p= sort_counts + 8 * sizeof(uint64_t) - 2;
3158
/* Most frequent value gets a faked count of 1. */
3159
**(cur_sort_p++)= 1;
3161
while (cur_sort_p < end_sort_p)
3163
**(cur_sort_p++)= total;
3166
/* Set the last value. */
3167
**(cur_sort_p++)= --total;
3169
Set the remaining counts.
3171
end_sort_p= sort_counts + 256;
3172
while (cur_sort_p < end_sort_p)
3173
**(cur_sort_p++)= 1;
3180
Compare two counts for reverse sorting.
3185
count2 Another count.
3193
static int fakecmp(my_off_t **count1, my_off_t **count2)
3195
return ((**count1 < **count2) ? 1 :
3196
(**count1 > **count2) ? -1 : 0);