1
by brian
clean slate |
1 |
/* Copyright (C) 2001 MySQL AB
|
2 |
||
3 |
This program is free software; you can redistribute it and/or modify
|
|
4 |
it under the terms of the GNU General Public License as published by
|
|
5 |
the Free Software Foundation; version 2 of the License.
|
|
6 |
||
7 |
This program is distributed in the hope that it will be useful,
|
|
8 |
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
9 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
10 |
GNU General Public License for more details.
|
|
11 |
||
12 |
You should have received a copy of the GNU General Public License
|
|
13 |
along with this program; if not, write to the Free Software
|
|
14 |
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
|
|
15 |
||
16 |
/*
|
|
17 |
Function to handle quick removal of duplicates
|
|
18 |
This code is used when doing multi-table deletes to find the rows in
|
|
19 |
reference tables that needs to be deleted.
|
|
20 |
||
21 |
The basic idea is as follows:
|
|
22 |
||
23 |
Store first all strings in a binary tree, ignoring duplicates.
|
|
24 |
When the tree uses more memory than 'max_heap_table_size',
|
|
25 |
write the tree (in sorted order) out to disk and start with a new tree.
|
|
26 |
When all data has been generated, merge the trees (removing any found
|
|
27 |
duplicates).
|
|
28 |
||
29 |
The unique entries will be returned in sort order, to ensure that we do the
|
|
30 |
deletes in disk order.
|
|
31 |
*/
|
|
32 |
||
1241.9.36
by Monty Taylor
ZOMG. I deleted drizzled/server_includes.h. |
33 |
#include "config.h" |
1241.9.1
by Monty Taylor
Removed global.h. Fixed all the headers. |
34 |
|
35 |
#include <math.h> |
|
36 |
||
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
37 |
#include <queue> |
572.1.4
by Monty Taylor
Removed a bunch of unusued tests and defines from autoconf. |
38 |
|
1241.9.23
by Monty Taylor
Removed sql_table.h from server_includes.h. |
39 |
#include "drizzled/sql_sort.h" |
40 |
#include "drizzled/session.h" |
|
41 |
#include "drizzled/sql_list.h" |
|
1241.9.64
by Monty Taylor
Moved remaining non-public portions of mysys and mystrings to drizzled/internal. |
42 |
#include "drizzled/internal/iocache.h" |
1241.9.23
by Monty Taylor
Removed sql_table.h from server_includes.h. |
43 |
|
572.1.4
by Monty Taylor
Removed a bunch of unusued tests and defines from autoconf. |
44 |
#if defined(CMATH_NAMESPACE)
|
45 |
using namespace CMATH_NAMESPACE; |
|
46 |
#endif
|
|
1
by brian
clean slate |
47 |
|
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
48 |
using namespace std; |
49 |
||
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
50 |
namespace drizzled |
51 |
{
|
|
1
by brian
clean slate |
52 |
|
1241.9.46
by Monty Taylor
Removed some more evil. |
53 |
int unique_write_to_file(unsigned char* key, uint32_t, |
77.1.45
by Monty Taylor
Warning fixes. |
54 |
Unique *unique) |
1
by brian
clean slate |
55 |
{
|
56 |
/*
|
|
57 |
Use unique->size (size of element stored in the tree) and not
|
|
58 |
unique->tree.size_of_element. The latter is different from unique->size
|
|
59 |
when tree implementation chooses to store pointer to key in TREE_ELEMENT
|
|
60 |
(instead of storing the element itself there)
|
|
61 |
*/
|
|
1241.9.50
by Monty Taylor
Removed last non-pointer public IO_CACHE from drizzled/ |
62 |
return my_b_write(unique->file, key, unique->size) ? 1 : 0; |
1
by brian
clean slate |
63 |
}
|
64 |
||
481
by Brian Aker
Remove all of uchar. |
65 |
int unique_write_to_ptrs(unsigned char* key, |
1241.9.46
by Monty Taylor
Removed some more evil. |
66 |
uint32_t, Unique *unique) |
1
by brian
clean slate |
67 |
{
|
68 |
memcpy(unique->record_pointers, key, unique->size); |
|
69 |
unique->record_pointers+=unique->size; |
|
70 |
return 0; |
|
71 |
}
|
|
72 |
||
73 |
Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg, |
|
641.3.8
by Monty Taylor
Removed my_malloc from drizzled. |
74 |
uint32_t size_arg, size_t max_in_memory_size_arg) |
1241.9.50
by Monty Taylor
Removed last non-pointer public IO_CACHE from drizzled/ |
75 |
: max_in_memory_size(max_in_memory_size_arg), |
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
76 |
file(static_cast<internal::IO_CACHE *>(memory::sql_calloc(sizeof(internal::IO_CACHE)))), |
1241.9.50
by Monty Taylor
Removed last non-pointer public IO_CACHE from drizzled/ |
77 |
size(size_arg), |
78 |
elements(0) |
|
1
by brian
clean slate |
79 |
{
|
1241.9.50
by Monty Taylor
Removed last non-pointer public IO_CACHE from drizzled/ |
80 |
my_b_clear(file); |
1164
by Brian Aker
Small cleanup. |
81 |
init_tree(&tree, (ulong) (max_in_memory_size / 16), 0, size, comp_func, false, |
1
by brian
clean slate |
82 |
NULL, comp_func_fixed_arg); |
83 |
/* If the following fail's the next add will also fail */
|
|
84 |
my_init_dynamic_array(&file_ptrs, sizeof(BUFFPEK), 16, 16); |
|
85 |
/*
|
|
86 |
If you change the following, change it in get_max_elements function, too.
|
|
87 |
*/
|
|
88 |
max_elements= (ulong) (max_in_memory_size / |
|
89 |
ALIGN_SIZE(sizeof(TREE_ELEMENT)+size)); |
|
1556.1.1
by Brian Aker
Updates for moving temporary directory. |
90 |
open_cached_file(file, drizzle_tmpdir.c_str(), TEMP_PREFIX, DISK_BUFFER_SIZE, |
398.1.10
by Monty Taylor
Actually removed VOID() this time. |
91 |
MYF(MY_WME)); |
1
by brian
clean slate |
92 |
}
|
93 |
||
94 |
||
95 |
/*
|
|
96 |
Calculate log2(n!)
|
|
97 |
||
98 |
NOTES
|
|
99 |
Stirling's approximate formula is used:
|
|
100 |
||
101 |
n! ~= sqrt(2*M_PI*n) * (n/M_E)^n
|
|
102 |
||
103 |
Derivation of formula used for calculations is as follows:
|
|
104 |
||
105 |
log2(n!) = log(n!)/log(2) = log(sqrt(2*M_PI*n)*(n/M_E)^n) / log(2) =
|
|
106 |
||
107 |
= (log(2*M_PI*n)/2 + n*log(n/M_E)) / log(2).
|
|
108 |
*/
|
|
109 |
||
110 |
inline double log2_n_fact(double x) |
|
111 |
{
|
|
112 |
return (log(2*M_PI*x)/2 + x*log(x/M_E)) / M_LN2; |
|
113 |
}
|
|
114 |
||
115 |
||
116 |
/*
|
|
117 |
Calculate cost of merge_buffers function call for given sequence of
|
|
118 |
input stream lengths and store the number of rows in result stream in *last.
|
|
119 |
||
120 |
SYNOPSIS
|
|
121 |
get_merge_buffers_cost()
|
|
122 |
buff_elems Array of #s of elements in buffers
|
|
123 |
elem_size Size of element stored in buffer
|
|
124 |
first Pointer to first merged element size
|
|
125 |
last Pointer to last merged element size
|
|
126 |
||
127 |
RETURN
|
|
128 |
Cost of merge_buffers operation in disk seeks.
|
|
129 |
||
130 |
NOTES
|
|
131 |
It is assumed that no rows are eliminated during merge.
|
|
132 |
The cost is calculated as
|
|
133 |
||
134 |
cost(read_and_write) + cost(merge_comparisons).
|
|
135 |
||
136 |
All bytes in the sequences is read and written back during merge so cost
|
|
137 |
of disk io is 2*elem_size*total_buf_elems/IO_SIZE (2 is for read + write)
|
|
138 |
||
139 |
For comparisons cost calculations we assume that all merged sequences have
|
|
140 |
the same length, so each of total_buf_size elements will be added to a sort
|
|
141 |
heap with (n_buffers-1) elements. This gives the comparison cost:
|
|
142 |
||
143 |
total_buf_elems* log2(n_buffers) / TIME_FOR_COMPARE_ROWID;
|
|
144 |
*/
|
|
145 |
||
654
by Brian Aker
Remove unused (yet more) |
146 |
static double get_merge_buffers_cost(uint32_t *, uint32_t elem_size, |
482
by Brian Aker
Remove uint. |
147 |
uint32_t *first, uint32_t *last) |
1
by brian
clean slate |
148 |
{
|
482
by Brian Aker
Remove uint. |
149 |
uint32_t total_buf_elems= 0; |
150 |
for (uint32_t *pbuf= first; pbuf <= last; pbuf++) |
|
1
by brian
clean slate |
151 |
total_buf_elems+= *pbuf; |
152 |
*last= total_buf_elems; |
|
153 |
||
154 |
int n_buffers= last - first + 1; |
|
155 |
||
156 |
/* Using log2(n)=log(n)/log(2) formula */
|
|
157 |
return 2*((double)total_buf_elems*elem_size) / IO_SIZE + |
|
158 |
total_buf_elems*log((double) n_buffers) / (TIME_FOR_COMPARE_ROWID * M_LN2); |
|
159 |
}
|
|
160 |
||
161 |
||
162 |
/*
|
|
163 |
Calculate cost of merging buffers into one in Unique::get, i.e. calculate
|
|
164 |
how long (in terms of disk seeks) the two calls
|
|
165 |
merge_many_buffs(...);
|
|
166 |
merge_buffers(...);
|
|
167 |
will take.
|
|
168 |
||
169 |
SYNOPSIS
|
|
170 |
get_merge_many_buffs_cost()
|
|
171 |
buffer buffer space for temporary data, at least
|
|
172 |
Unique::get_cost_calc_buff_size bytes
|
|
173 |
maxbuffer # of full buffers
|
|
174 |
max_n_elems # of elements in first maxbuffer buffers
|
|
175 |
last_n_elems # of elements in last buffer
|
|
176 |
elem_size size of buffer element
|
|
177 |
||
178 |
NOTES
|
|
179 |
maxbuffer+1 buffers are merged, where first maxbuffer buffers contain
|
|
180 |
max_n_elems elements each and last buffer contains last_n_elems elements.
|
|
181 |
||
182 |
The current implementation does a dumb simulation of merge_many_buffs
|
|
183 |
function actions.
|
|
184 |
||
185 |
RETURN
|
|
186 |
Cost of merge in disk seeks.
|
|
187 |
*/
|
|
188 |
||
482
by Brian Aker
Remove uint. |
189 |
static double get_merge_many_buffs_cost(uint32_t *buffer, |
190 |
uint32_t maxbuffer, uint32_t max_n_elems, |
|
191 |
uint32_t last_n_elems, int elem_size) |
|
1
by brian
clean slate |
192 |
{
|
193 |
register int i; |
|
194 |
double total_cost= 0.0; |
|
482
by Brian Aker
Remove uint. |
195 |
uint32_t *buff_elems= buffer; /* #s of elements in each of merged sequences */ |
1
by brian
clean slate |
196 |
|
197 |
/*
|
|
198 |
Set initial state: first maxbuffer sequences contain max_n_elems elements
|
|
199 |
each, last sequence contains last_n_elems elements.
|
|
200 |
*/
|
|
201 |
for (i = 0; i < (int)maxbuffer; i++) |
|
202 |
buff_elems[i]= max_n_elems; |
|
203 |
buff_elems[maxbuffer]= last_n_elems; |
|
204 |
||
205 |
/*
|
|
206 |
Do it exactly as merge_many_buff function does, calling
|
|
207 |
get_merge_buffers_cost to get cost of merge_buffers.
|
|
208 |
*/
|
|
209 |
if (maxbuffer >= MERGEBUFF2) |
|
210 |
{
|
|
211 |
while (maxbuffer >= MERGEBUFF2) |
|
212 |
{
|
|
482
by Brian Aker
Remove uint. |
213 |
uint32_t lastbuff= 0; |
1
by brian
clean slate |
214 |
for (i = 0; i <= (int) maxbuffer - MERGEBUFF*3/2; i += MERGEBUFF) |
215 |
{
|
|
216 |
total_cost+=get_merge_buffers_cost(buff_elems, elem_size, |
|
217 |
buff_elems + i, |
|
218 |
buff_elems + i + MERGEBUFF-1); |
|
219 |
lastbuff++; |
|
220 |
}
|
|
221 |
total_cost+=get_merge_buffers_cost(buff_elems, elem_size, |
|
222 |
buff_elems + i, |
|
223 |
buff_elems + maxbuffer); |
|
224 |
maxbuffer= lastbuff; |
|
225 |
}
|
|
226 |
}
|
|
227 |
||
228 |
/* Simulate final merge_buff call. */
|
|
229 |
total_cost += get_merge_buffers_cost(buff_elems, elem_size, |
|
230 |
buff_elems, buff_elems + maxbuffer); |
|
231 |
return total_cost; |
|
232 |
}
|
|
233 |
||
234 |
||
235 |
/*
|
|
236 |
Calculate cost of using Unique for processing nkeys elements of size
|
|
237 |
key_size using max_in_memory_size memory.
|
|
238 |
||
239 |
SYNOPSIS
|
|
240 |
Unique::get_use_cost()
|
|
241 |
buffer space for temporary data, use Unique::get_cost_calc_buff_size
|
|
242 |
to get # bytes needed.
|
|
243 |
nkeys #of elements in Unique
|
|
244 |
key_size size of each elements in bytes
|
|
245 |
max_in_memory_size amount of memory Unique will be allowed to use
|
|
246 |
||
247 |
RETURN
|
|
248 |
Cost in disk seeks.
|
|
249 |
||
250 |
NOTES
|
|
251 |
cost(using_unqiue) =
|
|
252 |
cost(create_trees) + (see #1)
|
|
253 |
cost(merge) + (see #2)
|
|
254 |
cost(read_result) (see #3)
|
|
255 |
||
256 |
1. Cost of trees creation
|
|
257 |
For each Unique::put operation there will be 2*log2(n+1) elements
|
|
258 |
comparisons, where n runs from 1 tree_size (we assume that all added
|
|
259 |
elements are different). Together this gives:
|
|
260 |
||
261 |
n_compares = 2*(log2(2) + log2(3) + ... + log2(N+1)) = 2*log2((N+1)!)
|
|
262 |
||
263 |
then cost(tree_creation) = n_compares*ROWID_COMPARE_COST;
|
|
264 |
||
265 |
Total cost of creating trees:
|
|
266 |
(n_trees - 1)*max_size_tree_cost + non_max_size_tree_cost.
|
|
267 |
||
268 |
Approximate value of log2(N!) is calculated by log2_n_fact function.
|
|
269 |
||
270 |
2. Cost of merging.
|
|
271 |
If only one tree is created by Unique no merging will be necessary.
|
|
272 |
Otherwise, we model execution of merge_many_buff function and count
|
|
273 |
#of merges. (The reason behind this is that number of buffers is small,
|
|
274 |
while size of buffers is big and we don't want to loose precision with
|
|
275 |
O(x)-style formula)
|
|
276 |
||
277 |
3. If only one tree is created by Unique no disk io will happen.
|
|
278 |
Otherwise, ceil(key_len*n_keys) disk seeks are necessary. We assume
|
|
279 |
these will be random seeks.
|
|
280 |
*/
|
|
281 |
||
482
by Brian Aker
Remove uint. |
282 |
double Unique::get_use_cost(uint32_t *buffer, uint32_t nkeys, uint32_t key_size, |
641.3.8
by Monty Taylor
Removed my_malloc from drizzled. |
283 |
size_t max_in_memory_size) |
1
by brian
clean slate |
284 |
{
|
285 |
ulong max_elements_in_tree; |
|
286 |
ulong last_tree_elems; |
|
287 |
int n_full_trees; /* number of trees in unique - 1 */ |
|
288 |
double result; |
|
289 |
||
290 |
max_elements_in_tree= ((ulong) max_in_memory_size / |
|
291 |
ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size)); |
|
292 |
||
293 |
n_full_trees= nkeys / max_elements_in_tree; |
|
294 |
last_tree_elems= nkeys % max_elements_in_tree; |
|
295 |
||
296 |
/* Calculate cost of creating trees */
|
|
297 |
result= 2*log2_n_fact(last_tree_elems + 1.0); |
|
298 |
if (n_full_trees) |
|
299 |
result+= n_full_trees * log2_n_fact(max_elements_in_tree + 1.0); |
|
300 |
result /= TIME_FOR_COMPARE_ROWID; |
|
301 |
||
302 |
||
303 |
if (!n_full_trees) |
|
304 |
return result; |
|
305 |
||
306 |
/*
|
|
307 |
There is more then one tree and merging is necessary.
|
|
308 |
First, add cost of writing all trees to disk, assuming that all disk
|
|
309 |
writes are sequential.
|
|
310 |
*/
|
|
311 |
result += DISK_SEEK_BASE_COST * n_full_trees * |
|
312 |
ceil(((double) key_size)*max_elements_in_tree / IO_SIZE); |
|
313 |
result += DISK_SEEK_BASE_COST * ceil(((double) key_size)*last_tree_elems / IO_SIZE); |
|
314 |
||
315 |
/* Cost of merge */
|
|
316 |
double merge_cost= get_merge_many_buffs_cost(buffer, n_full_trees, |
|
317 |
max_elements_in_tree, |
|
318 |
last_tree_elems, key_size); |
|
319 |
if (merge_cost < 0.0) |
|
320 |
return merge_cost; |
|
321 |
||
322 |
result += merge_cost; |
|
323 |
/*
|
|
324 |
Add cost of reading the resulting sequence, assuming there were no
|
|
325 |
duplicate elements.
|
|
326 |
*/
|
|
327 |
result += ceil((double)key_size*nkeys/IO_SIZE); |
|
328 |
||
329 |
return result; |
|
330 |
}
|
|
331 |
||
332 |
Unique::~Unique() |
|
333 |
{
|
|
1241.9.50
by Monty Taylor
Removed last non-pointer public IO_CACHE from drizzled/ |
334 |
close_cached_file(file); |
1
by brian
clean slate |
335 |
delete_tree(&tree); |
336 |
delete_dynamic(&file_ptrs); |
|
337 |
}
|
|
338 |
||
339 |
||
340 |
/* Write tree to disk; clear tree */
|
|
341 |
bool Unique::flush() |
|
342 |
{
|
|
343 |
BUFFPEK file_ptr; |
|
344 |
elements+= tree.elements_in_tree; |
|
345 |
file_ptr.count=tree.elements_in_tree; |
|
1241.9.50
by Monty Taylor
Removed last non-pointer public IO_CACHE from drizzled/ |
346 |
file_ptr.file_pos=my_b_tell(file); |
1
by brian
clean slate |
347 |
|
348 |
if (tree_walk(&tree, (tree_walk_action) unique_write_to_file, |
|
349 |
(void*) this, left_root_right) || |
|
481
by Brian Aker
Remove all of uchar. |
350 |
insert_dynamic(&file_ptrs, (unsigned char*) &file_ptr)) |
1
by brian
clean slate |
351 |
return 1; |
352 |
delete_tree(&tree); |
|
353 |
return 0; |
|
354 |
}
|
|
355 |
||
356 |
||
357 |
/*
|
|
358 |
Clear the tree and the file.
|
|
359 |
You must call reset() if you want to reuse Unique after walk().
|
|
360 |
*/
|
|
361 |
||
362 |
void
|
|
363 |
Unique::reset() |
|
364 |
{
|
|
365 |
reset_tree(&tree); |
|
366 |
/*
|
|
367 |
If elements != 0, some trees were stored in the file (see how
|
|
368 |
flush() works). Note, that we can not count on my_b_tell(&file) == 0
|
|
369 |
here, because it can return 0 right after walk(), and walk() does not
|
|
370 |
reset any Unique member.
|
|
371 |
*/
|
|
372 |
if (elements) |
|
373 |
{
|
|
374 |
reset_dynamic(&file_ptrs); |
|
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
375 |
reinit_io_cache(file, internal::WRITE_CACHE, 0L, 0, 1); |
1
by brian
clean slate |
376 |
}
|
377 |
elements= 0; |
|
378 |
}
|
|
379 |
||
380 |
/*
|
|
381 |
The comparison function, passed to queue_init() in merge_walk() and in
|
|
382 |
merge_buffers() when the latter is called from Uniques::get() must
|
|
383 |
use comparison function of Uniques::tree, but compare members of struct
|
|
384 |
BUFFPEK.
|
|
385 |
*/
|
|
386 |
||
481
by Brian Aker
Remove all of uchar. |
387 |
static int buffpek_compare(void *arg, unsigned char *key_ptr1, unsigned char *key_ptr2) |
1
by brian
clean slate |
388 |
{
|
389 |
BUFFPEK_COMPARE_CONTEXT *ctx= (BUFFPEK_COMPARE_CONTEXT *) arg; |
|
390 |
return ctx->key_compare(ctx->key_compare_arg, |
|
481
by Brian Aker
Remove all of uchar. |
391 |
*((unsigned char **) key_ptr1), *((unsigned char **)key_ptr2)); |
1
by brian
clean slate |
392 |
}
|
393 |
||
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
394 |
/*
|
395 |
The comparison function object, passed to a priority_queue in merge_walk()
|
|
396 |
as its sort function parameter.
|
|
397 |
*/
|
|
1
by brian
clean slate |
398 |
|
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
399 |
class buffpek_compare_functor |
400 |
{
|
|
401 |
qsort_cmp2 key_compare; |
|
402 |
void *key_compare_arg; |
|
403 |
public: |
|
404 |
buffpek_compare_functor(qsort_cmp2 in_key_compare, void *in_compare_arg) |
|
405 |
: key_compare(in_key_compare), key_compare_arg(in_compare_arg) { } |
|
897.1.12
by Padraig
Making my function objects const correct. |
406 |
inline bool operator()(const BUFFPEK *i, const BUFFPEK *j) |
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
407 |
{
|
408 |
return key_compare(key_compare_arg, |
|
409 |
i->key, j->key); |
|
410 |
}
|
|
411 |
};
|
|
1
by brian
clean slate |
412 |
|
413 |
/*
|
|
414 |
DESCRIPTION
|
|
415 |
||
416 |
Function is very similar to merge_buffers, but instead of writing sorted
|
|
417 |
unique keys to the output file, it invokes walk_action for each key.
|
|
418 |
This saves I/O if you need to pass through all unique keys only once.
|
|
419 |
||
420 |
SYNOPSIS
|
|
421 |
merge_walk()
|
|
422 |
All params are 'IN' (but see comment for begin, end):
|
|
423 |
merge_buffer buffer to perform cached piece-by-piece loading
|
|
424 |
of trees; initially the buffer is empty
|
|
425 |
merge_buffer_size size of merge_buffer. Must be aligned with
|
|
426 |
key_length
|
|
427 |
key_length size of tree element; key_length * (end - begin)
|
|
428 |
must be less or equal than merge_buffer_size.
|
|
429 |
begin pointer to BUFFPEK struct for the first tree.
|
|
430 |
end pointer to BUFFPEK struct for the last tree;
|
|
431 |
end > begin and [begin, end) form a consecutive
|
|
432 |
range. BUFFPEKs structs in that range are used and
|
|
433 |
overwritten in merge_walk().
|
|
434 |
walk_action element visitor. Action is called for each unique
|
|
435 |
key.
|
|
436 |
walk_action_arg argument to walk action. Passed to it on each call.
|
|
437 |
compare elements comparison function
|
|
438 |
compare_arg comparison function argument
|
|
439 |
file file with all trees dumped. Trees in the file
|
|
440 |
must contain sorted unique values. Cache must be
|
|
441 |
initialized in read mode.
|
|
442 |
RETURN VALUE
|
|
443 |
0 ok
|
|
444 |
<> 0 error
|
|
445 |
*/
|
|
446 |
||
481
by Brian Aker
Remove all of uchar. |
447 |
static bool merge_walk(unsigned char *merge_buffer, ulong merge_buffer_size, |
482
by Brian Aker
Remove uint. |
448 |
uint32_t key_length, BUFFPEK *begin, BUFFPEK *end, |
1
by brian
clean slate |
449 |
tree_walk_action walk_action, void *walk_action_arg, |
450 |
qsort_cmp2 compare, void *compare_arg, |
|
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
451 |
internal::IO_CACHE *file) |
1
by brian
clean slate |
452 |
{
|
453 |
if (end <= begin || |
|
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
454 |
merge_buffer_size < (ulong) (key_length * (end - begin + 1))) |
1
by brian
clean slate |
455 |
return 1; |
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
456 |
priority_queue<BUFFPEK *, vector<BUFFPEK *>, buffpek_compare_functor > |
457 |
queue(buffpek_compare_functor(compare, compare_arg)); |
|
1
by brian
clean slate |
458 |
/* we need space for one key when a piece of merge buffer is re-read */
|
459 |
merge_buffer_size-= key_length; |
|
481
by Brian Aker
Remove all of uchar. |
460 |
unsigned char *save_key_buff= merge_buffer + merge_buffer_size; |
895
by Brian Aker
Completion (?) of uint conversion. |
461 |
uint32_t max_key_count_per_piece= (uint32_t) (merge_buffer_size/(end-begin) / |
1
by brian
clean slate |
462 |
key_length); |
463 |
/* if piece_size is aligned reuse_freed_buffer will always hit */
|
|
482
by Brian Aker
Remove uint. |
464 |
uint32_t piece_size= max_key_count_per_piece * key_length; |
465 |
uint32_t bytes_read; /* to hold return value of read_to_buffer */ |
|
1
by brian
clean slate |
466 |
BUFFPEK *top; |
467 |
int res= 1; |
|
468 |
/*
|
|
469 |
Invariant: queue must contain top element from each tree, until a tree
|
|
470 |
is not completely walked through.
|
|
471 |
Here we're forcing the invariant, inserting one element from each tree
|
|
472 |
to the queue.
|
|
473 |
*/
|
|
474 |
for (top= begin; top != end; ++top) |
|
475 |
{
|
|
476 |
top->base= merge_buffer + (top - begin) * piece_size; |
|
477 |
top->max_keys= max_key_count_per_piece; |
|
478 |
bytes_read= read_to_buffer(file, top, key_length); |
|
895
by Brian Aker
Completion (?) of uint conversion. |
479 |
if (bytes_read == (uint32_t) (-1)) |
1
by brian
clean slate |
480 |
goto end; |
51.1.73
by Jay Pipes
Standardized TRUE/FALSE, removed/replaced DBUG symbols |
481 |
assert(bytes_read); |
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
482 |
queue.push(top); |
1
by brian
clean slate |
483 |
}
|
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
484 |
top= queue.top(); |
485 |
while (queue.size() > 1) |
|
1
by brian
clean slate |
486 |
{
|
487 |
/*
|
|
488 |
Every iteration one element is removed from the queue, and one is
|
|
489 |
inserted by the rules of the invariant. If two adjacent elements on
|
|
490 |
the top of the queue are not equal, biggest one is unique, because all
|
|
491 |
elements in each tree are unique. Action is applied only to unique
|
|
492 |
elements.
|
|
493 |
*/
|
|
494 |
void *old_key= top->key; |
|
495 |
/*
|
|
496 |
read next key from the cache or from the file and push it to the
|
|
497 |
queue; this gives new top.
|
|
498 |
*/
|
|
499 |
top->key+= key_length; |
|
500 |
if (--top->mem_count) |
|
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
501 |
{
|
502 |
queue.pop(); |
|
503 |
queue.push(top); |
|
504 |
}
|
|
1
by brian
clean slate |
505 |
else /* next piece should be read */ |
506 |
{
|
|
507 |
/* save old_key not to overwrite it in read_to_buffer */
|
|
508 |
memcpy(save_key_buff, old_key, key_length); |
|
509 |
old_key= save_key_buff; |
|
510 |
bytes_read= read_to_buffer(file, top, key_length); |
|
895
by Brian Aker
Completion (?) of uint conversion. |
511 |
if (bytes_read == (uint32_t) (-1)) |
1
by brian
clean slate |
512 |
goto end; |
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
513 |
else if (bytes_read > 0) /* top->key, top->mem_count are reset */ |
514 |
{ /* in read_to_buffer */ |
|
515 |
queue.pop(); |
|
516 |
queue.push(top); |
|
517 |
}
|
|
1
by brian
clean slate |
518 |
else
|
519 |
{
|
|
520 |
/*
|
|
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
521 |
Tree for old 'top' element is empty: remove it from the queue.
|
1
by brian
clean slate |
522 |
*/
|
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
523 |
queue.pop(); |
1
by brian
clean slate |
524 |
}
|
525 |
}
|
|
897.1.1
by Padraig
Replaced the instance of QUEUE in the merge_walk() function with |
526 |
top= queue.top(); |
1
by brian
clean slate |
527 |
/* new top has been obtained; if old top is unique, apply the action */
|
528 |
if (compare(compare_arg, old_key, top->key)) |
|
529 |
{
|
|
530 |
if (walk_action(old_key, 1, walk_action_arg)) |
|
531 |
goto end; |
|
532 |
}
|
|
533 |
}
|
|
534 |
/*
|
|
535 |
Applying walk_action to the tail of the last tree: this is safe because
|
|
536 |
either we had only one tree in the beginning, either we work with the
|
|
537 |
last tree in the queue.
|
|
538 |
*/
|
|
539 |
do
|
|
540 |
{
|
|
541 |
do
|
|
542 |
{
|
|
543 |
if (walk_action(top->key, 1, walk_action_arg)) |
|
544 |
goto end; |
|
545 |
top->key+= key_length; |
|
546 |
}
|
|
547 |
while (--top->mem_count); |
|
548 |
bytes_read= read_to_buffer(file, top, key_length); |
|
895
by Brian Aker
Completion (?) of uint conversion. |
549 |
if (bytes_read == (uint32_t) (-1)) |
1
by brian
clean slate |
550 |
goto end; |
551 |
}
|
|
552 |
while (bytes_read); |
|
553 |
res= 0; |
|
554 |
end: |
|
555 |
return res; |
|
556 |
}
|
|
557 |
||
558 |
||
559 |
/*
|
|
560 |
DESCRIPTION
|
|
561 |
Walks consecutively through all unique elements:
|
|
562 |
if all elements are in memory, then it simply invokes 'tree_walk', else
|
|
563 |
all flushed trees are loaded to memory piece-by-piece, pieces are
|
|
564 |
sorted, and action is called for each unique value.
|
|
565 |
Note: so as merging resets file_ptrs state, this method can change
|
|
566 |
internal Unique state to undefined: if you want to reuse Unique after
|
|
567 |
walk() you must call reset() first!
|
|
568 |
SYNOPSIS
|
|
569 |
Unique:walk()
|
|
570 |
All params are 'IN':
|
|
1410.3.4
by Djellel E. Difallah
update references to old my_'s |
571 |
action function-visitor, typed in include/tree.h
|
1
by brian
clean slate |
572 |
function is called for each unique element
|
573 |
arg argument for visitor, which is passed to it on each call
|
|
574 |
RETURN VALUE
|
|
575 |
0 OK
|
|
576 |
<> 0 error
|
|
577 |
*/
|
|
578 |
||
579 |
bool Unique::walk(tree_walk_action action, void *walk_action_arg) |
|
580 |
{
|
|
581 |
int res; |
|
1578.4.8
by Brian Aker
Modify merge-buffer to use std::vector in one location (just curious to see |
582 |
std::vector<unsigned char> merge_buffer; |
1
by brian
clean slate |
583 |
|
584 |
if (elements == 0) /* the whole tree is in memory */ |
|
585 |
return tree_walk(&tree, action, walk_action_arg, left_root_right); |
|
586 |
||
587 |
/* flush current tree to the file to have some memory for merge buffer */
|
|
588 |
if (flush()) |
|
1578.4.8
by Brian Aker
Modify merge-buffer to use std::vector in one location (just curious to see |
589 |
return true; |
590 |
||
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
591 |
if (flush_io_cache(file) || reinit_io_cache(file, internal::READ_CACHE, 0L, 0, 0)) |
1578.4.8
by Brian Aker
Modify merge-buffer to use std::vector in one location (just curious to see |
592 |
return true; |
593 |
||
594 |
try
|
|
595 |
{
|
|
596 |
merge_buffer.resize(max_in_memory_size); |
|
597 |
}
|
|
598 |
catch (std::bad_alloc const&) |
|
599 |
{
|
|
600 |
return true; |
|
601 |
}
|
|
602 |
||
603 |
res= merge_walk(&merge_buffer[0], (ulong) max_in_memory_size, size, |
|
1
by brian
clean slate |
604 |
(BUFFPEK *) file_ptrs.buffer, |
605 |
(BUFFPEK *) file_ptrs.buffer + file_ptrs.elements, |
|
606 |
action, walk_action_arg, |
|
1241.9.50
by Monty Taylor
Removed last non-pointer public IO_CACHE from drizzled/ |
607 |
tree.compare, tree.custom_arg, file); |
1578.4.8
by Brian Aker
Modify merge-buffer to use std::vector in one location (just curious to see |
608 |
|
1
by brian
clean slate |
609 |
return res; |
610 |
}
|
|
611 |
||
612 |
/*
|
|
327.1.5
by Brian Aker
Refactor around classes. TABLE_LIST has been factored out of table.h |
613 |
Modify the Table element so that when one calls init_records()
|
1
by brian
clean slate |
614 |
the rows will be read in priority order.
|
615 |
*/
|
|
616 |
||
327.1.5
by Brian Aker
Refactor around classes. TABLE_LIST has been factored out of table.h |
617 |
bool Unique::get(Table *table) |
1
by brian
clean slate |
618 |
{
|
619 |
SORTPARAM sort_param; |
|
620 |
table->sort.found_records=elements+tree.elements_in_tree; |
|
621 |
||
1241.9.50
by Monty Taylor
Removed last non-pointer public IO_CACHE from drizzled/ |
622 |
if (my_b_tell(file) == 0) |
1
by brian
clean slate |
623 |
{
|
624 |
/* Whole tree is in memory; Don't use disk if you don't need to */
|
|
481
by Brian Aker
Remove all of uchar. |
625 |
if ((record_pointers=table->sort.record_pointers= (unsigned char*) |
641.3.8
by Monty Taylor
Removed my_malloc from drizzled. |
626 |
malloc(size * tree.elements_in_tree))) |
1
by brian
clean slate |
627 |
{
|
628 |
(void) tree_walk(&tree, (tree_walk_action) unique_write_to_ptrs, |
|
629 |
this, left_root_right); |
|
630 |
return 0; |
|
631 |
}
|
|
632 |
}
|
|
633 |
/* Not enough memory; Save the result to file && free memory used by tree */
|
|
634 |
if (flush()) |
|
635 |
return 1; |
|
636 |
||
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
637 |
internal::IO_CACHE *outfile=table->sort.io_cache; |
1
by brian
clean slate |
638 |
BUFFPEK *file_ptr= (BUFFPEK*) file_ptrs.buffer; |
482
by Brian Aker
Remove uint. |
639 |
uint32_t maxbuffer= file_ptrs.elements - 1; |
481
by Brian Aker
Remove all of uchar. |
640 |
unsigned char *sort_buffer; |
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
641 |
internal::my_off_t save_pos; |
1
by brian
clean slate |
642 |
bool error=1; |
643 |
||
644 |
/* Open cached file if it isn't open */
|
|
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
645 |
outfile=table->sort.io_cache= new internal::IO_CACHE; |
1
by brian
clean slate |
646 |
|
1556.1.1
by Brian Aker
Updates for moving temporary directory. |
647 |
if (!outfile || (! my_b_inited(outfile) && open_cached_file(outfile, drizzle_tmpdir.c_str(),TEMP_PREFIX,READ_RECORD_BUFFER, MYF(MY_WME)))) |
1
by brian
clean slate |
648 |
return 1; |
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
649 |
reinit_io_cache(outfile, internal::WRITE_CACHE, 0L, 0, 0); |
1
by brian
clean slate |
650 |
|
212.6.6
by Mats Kindahl
Removing redundant use of casts in drizzled/ for memcmp(), memcpy(), memset(), and memmove(). |
651 |
memset(&sort_param, 0, sizeof(sort_param)); |
1
by brian
clean slate |
652 |
sort_param.max_rows= elements; |
653 |
sort_param.sort_form=table; |
|
654 |
sort_param.rec_length= sort_param.sort_length= sort_param.ref_length= |
|
655 |
size; |
|
895
by Brian Aker
Completion (?) of uint conversion. |
656 |
sort_param.keys= (uint32_t) (max_in_memory_size / sort_param.sort_length); |
1
by brian
clean slate |
657 |
sort_param.not_killable=1; |
658 |
||
641.3.8
by Monty Taylor
Removed my_malloc from drizzled. |
659 |
if (!(sort_buffer=(unsigned char*) malloc((sort_param.keys+1) * |
660 |
sort_param.sort_length))) |
|
1
by brian
clean slate |
661 |
return 1; |
662 |
sort_param.unique_buff= sort_buffer+(sort_param.keys* |
|
663 |
sort_param.sort_length); |
|
664 |
||
665 |
sort_param.compare= (qsort2_cmp) buffpek_compare; |
|
666 |
sort_param.cmp_context.key_compare= tree.compare; |
|
667 |
sort_param.cmp_context.key_compare_arg= tree.custom_arg; |
|
668 |
||
669 |
/* Merge the buffers to one file, removing duplicates */
|
|
1241.9.50
by Monty Taylor
Removed last non-pointer public IO_CACHE from drizzled/ |
670 |
if (merge_many_buff(&sort_param,sort_buffer,file_ptr,&maxbuffer,file)) |
671 |
goto err; |
|
672 |
if (flush_io_cache(file) || |
|
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
673 |
reinit_io_cache(file,internal::READ_CACHE,0L,0,0)) |
1241.9.50
by Monty Taylor
Removed last non-pointer public IO_CACHE from drizzled/ |
674 |
goto err; |
675 |
if (merge_buffers(&sort_param, file, outfile, sort_buffer, file_ptr, |
|
1
by brian
clean slate |
676 |
file_ptr, file_ptr+maxbuffer,0)) |
677 |
goto err; |
|
678 |
error=0; |
|
679 |
err: |
|
460
by Monty Taylor
Removed x_free calls. |
680 |
if (sort_buffer) |
681 |
free(sort_buffer); |
|
1
by brian
clean slate |
682 |
if (flush_io_cache(outfile)) |
683 |
error=1; |
|
684 |
||
685 |
/* Setup io_cache for reading */
|
|
686 |
save_pos=outfile->pos_in_file; |
|
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
687 |
if (reinit_io_cache(outfile,internal::READ_CACHE,0L,0,0)) |
1
by brian
clean slate |
688 |
error=1; |
689 |
outfile->end_of_file=save_pos; |
|
690 |
return error; |
|
691 |
}
|
|
1280.1.10
by Monty Taylor
Put everything in drizzled into drizzled namespace. |
692 |
|
693 |
} /* namespace drizzled */ |