~drizzle-trunk/drizzle/development

« back to all changes in this revision

Viewing changes to drizzled/unique.cc

Committer: Brian Aker
Date: 2010-09-09 21:45:53 UTC
mto: (1756.1.2 build) (1768.2.1 trunk)
mto: This revision was merged to the branch mainline in revision 1757.
Revision ID: brian@tangent.org-20100909214553-e687rmf5zk9478on

Force unique to just use memory and let the OS handle paging.

files renamed:
drizzled/uniques.cc => drizzled/unique.cc

files modified:
drizzled/include.am

drizzled/unique.h

Show diffs side-by-side

added added

removed removed

drizzled/unique.cc

The basic idea is as follows:

Store first all strings in a binary tree, ignoring duplicates.

When the tree uses more memory than 'max_heap_table_size',

write the tree (in sorted order) out to disk and start with a new tree.

When all data has been generated, merge the trees (removing any found

duplicates).

The unique entries will be returned in sort order, to ensure that we do the

deletes in disk order.

namespace drizzled

{

int unique_write_to_file(unsigned char* key, uint32_t,

Unique *unique)

{

Use unique->size (size of element stored in the tree) and not

unique->tree.size_of_element. The latter is different from unique->size

when tree implementation chooses to store pointer to key in TREE_ELEMENT

(instead of storing the element itself there)

return my_b_write(unique->file, key, unique->size) ? 1 : 0;

}

int unique_write_to_ptrs(unsigned char* key,

uint32_t, Unique *unique)

{

Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg,

uint32_t size_arg, size_t max_in_memory_size_arg)

: max_in_memory_size(max_in_memory_size_arg),

file(static_cast<internal::IO_CACHE *>(memory::sql_calloc(sizeof(internal::IO_CACHE)))),

size(size_arg),

elements(0)

{

my_b_clear(file);

init_tree(&tree, (ulong) (max_in_memory_size / 16), 0, size, comp_func, false,

// Second element is max size for memory use in unique sort

init_tree(&tree, 0, 0, size, comp_func, false,

NULL, comp_func_fixed_arg);

/* If the following fail's the next add will also fail */

my_init_dynamic_array(&file_ptrs, sizeof(buffpek_st), 16, 16);

If you change the following, change it in get_max_elements function, too.

max_elements= (ulong) (max_in_memory_size /

ALIGN_SIZE(sizeof(TREE_ELEMENT)+size));

open_cached_file(file, drizzle_tmpdir.c_str(), TEMP_PREFIX, DISK_BUFFER_SIZE,

MYF(MY_WME));

}

114

115

116

117

Calculate cost of merge_buffers function call for given sequence of

118

input stream lengths and store the number of rows in result stream in *last.

119

120

SYNOPSIS

121

get_merge_buffers_cost()

122

buff_elems Array of #s of elements in buffers

123

elem_size Size of element stored in buffer

124

first Pointer to first merged element size

125

last Pointer to last merged element size

126

127

RETURN

128

Cost of merge_buffers operation in disk seeks.

129

130

NOTES

131

It is assumed that no rows are eliminated during merge.

132

The cost is calculated as

133

134

cost(read_and_write) + cost(merge_comparisons).

135

136

All bytes in the sequences is read and written back during merge so cost

137

of disk io is 2*elem_size*total_buf_elems/IO_SIZE (2 is for read + write)

138

139

For comparisons cost calculations we assume that all merged sequences have

140

the same length, so each of total_buf_size elements will be added to a sort

141

heap with (n_buffers-1) elements. This gives the comparison cost:

142

143

total_buf_elems* log2(n_buffers) / TIME_FOR_COMPARE_ROWID;

144

145

146

static double get_merge_buffers_cost(uint32_t *, uint32_t elem_size,

147

uint32_t *first, uint32_t *last)

148

{

149

uint32_t total_buf_elems= 0;

150

for (uint32_t *pbuf= first; pbuf <= last; pbuf++)

151

total_buf_elems+= *pbuf;

152

*last= total_buf_elems;

153

154

int n_buffers= last - first + 1;

155

156

/* Using log2(n)=log(n)/log(2) formula */

157

return 2*((double)total_buf_elems*elem_size) / IO_SIZE +

158

total_buf_elems*log((double) n_buffers) / (TIME_FOR_COMPARE_ROWID * M_LN2);

159

}

160

161

162

163

Calculate cost of merging buffers into one in Unique::get, i.e. calculate

164

how long (in terms of disk seeks) the two calls

165

merge_many_buffs(...);

166

merge_buffers(...);

167

will take.

168

169

SYNOPSIS

170

get_merge_many_buffs_cost()

171

buffer buffer space for temporary data, at least

172

Unique::get_cost_calc_buff_size bytes

173

maxbuffer # of full buffers

174

max_n_elems # of elements in first maxbuffer buffers

175

last_n_elems # of elements in last buffer

176

elem_size size of buffer element

177

178

NOTES

179

maxbuffer+1 buffers are merged, where first maxbuffer buffers contain

180

max_n_elems elements each and last buffer contains last_n_elems elements.

181

182

The current implementation does a dumb simulation of merge_many_buffs

183

function actions.

184

185

RETURN

186

Cost of merge in disk seeks.

187

188

189

static double get_merge_many_buffs_cost(uint32_t *buffer,

190

uint32_t maxbuffer, uint32_t max_n_elems,

191

uint32_t last_n_elems, int elem_size)

192

{

193

194

double total_cost= 0.0;

195

uint32_t *buff_elems= buffer; /* #s of elements in each of merged sequences */

196

197

198

Set initial state: first maxbuffer sequences contain max_n_elems elements

199

each, last sequence contains last_n_elems elements.

200

201

for (i = 0; i < (int)maxbuffer; i++)

202

buff_elems[i]= max_n_elems;

203

buff_elems[maxbuffer]= last_n_elems;

204

205

206

Do it exactly as merge_many_buff function does, calling

207

get_merge_buffers_cost to get cost of merge_buffers.

208

209

if (maxbuffer >= MERGEBUFF2)

210

{

211

while (maxbuffer >= MERGEBUFF2)

212

{

213

uint32_t lastbuff= 0;

214

for (i = 0; i <= (int) maxbuffer - MERGEBUFF*3/2; i += MERGEBUFF)

215

{

216

total_cost+=get_merge_buffers_cost(buff_elems, elem_size,

217

buff_elems + i,

218

buff_elems + i + MERGEBUFF-1);

219

lastbuff++;

220

}

221

total_cost+=get_merge_buffers_cost(buff_elems, elem_size,

222

buff_elems + i,

223

buff_elems + maxbuffer);

224

maxbuffer= lastbuff;

225

}

226

}

227

228

/* Simulate final merge_buff call. */

229

total_cost += get_merge_buffers_cost(buff_elems, elem_size,

230

buff_elems, buff_elems + maxbuffer);

231

return total_cost;

232

}

233

234

235

236

Calculate cost of using Unique for processing nkeys elements of size

237

key_size using max_in_memory_size memory.

238

267

122

268

123

Approximate value of log2(N!) is calculated by log2_n_fact function.

269

124

125

126

(The Next two are historical, we do all unique operations in memory or fail)

127

270

128

2. Cost of merging.

271

129

If only one tree is created by Unique no merging will be necessary.

272

130

Otherwise, we model execution of merge_many_buff function and count

279

137

these will be random seeks.

280

138

281

139

282

double Unique::get_use_cost(uint32_t *buffer, uint32_t nkeys, uint32_t key_size,

140

double Unique::get_use_cost(uint32_t *, uint32_t nkeys, uint32_t key_size,

283

141

size_t max_in_memory_size)

284

142

{

285

143

ulong max_elements_in_tree;

286

144

ulong last_tree_elems;

287

int n_full_trees; /* number of trees in unique - 1 */

288

145

double result;

289

146

290

147

max_elements_in_tree= ((ulong) max_in_memory_size /

291

148

ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size));

292

149

293

n_full_trees= nkeys / max_elements_in_tree;

294

150

last_tree_elems= nkeys % max_elements_in_tree;

295

151

296

152

/* Calculate cost of creating trees */

297

153

result= 2*log2_n_fact(last_tree_elems + 1.0);

298

if (n_full_trees)

299

result+= n_full_trees * log2_n_fact(max_elements_in_tree + 1.0);

300

154

result /= TIME_FOR_COMPARE_ROWID;

301

155

302

303

if (!n_full_trees)

304

return result;

305

306

307

There is more then one tree and merging is necessary.

308

First, add cost of writing all trees to disk, assuming that all disk

309

writes are sequential.

310

311

result += DISK_SEEK_BASE_COST * n_full_trees *

312

ceil(((double) key_size)*max_elements_in_tree / IO_SIZE);

313

result += DISK_SEEK_BASE_COST * ceil(((double) key_size)*last_tree_elems / IO_SIZE);

314

315

/* Cost of merge */

316

double merge_cost= get_merge_many_buffs_cost(buffer, n_full_trees,

317

max_elements_in_tree,

318

last_tree_elems, key_size);

319

if (merge_cost < 0.0)

320

return merge_cost;

321

322

result += merge_cost;

323

324

Add cost of reading the resulting sequence, assuming there were no

325

duplicate elements.

326

327

result += ceil((double)key_size*nkeys/IO_SIZE);

328

329

156

return result;

330

157

}

331

158

332

159

Unique::~Unique()

333

160

{

334

close_cached_file(file);

335

delete_tree(&tree);

336

delete_dynamic(&file_ptrs);

337

}

338

339

340

/* Write tree to disk; clear tree */

341

bool Unique::flush()

342

{

343

buffpek_st file_ptr;

344

elements+= tree.elements_in_tree;

345

file_ptr.count=tree.elements_in_tree;

346

file_ptr.file_pos=my_b_tell(file);

347

348

if (tree_walk(&tree, (tree_walk_action) unique_write_to_file,

349

(void*) this, left_root_right) ||

350

insert_dynamic(&file_ptrs, (unsigned char*) &file_ptr))

351

return 1;

352

delete_tree(&tree);

353

return 0;

161

delete_tree(&tree);

354

162

}

355

163

356

164

357

165

358

Clear the tree and the file.

166

Clear the tree.

359

167

You must call reset() if you want to reuse Unique after walk().

360

168

361

169

363

171

Unique::reset()

364

172

{

365

173

reset_tree(&tree);

366

367

If elements != 0, some trees were stored in the file (see how

368

flush() works). Note, that we can not count on my_b_tell(&file) == 0

369

here, because it can return 0 right after walk(), and walk() does not

370

reset any Unique member.

371

372

if (elements)

373

{

374

reset_dynamic(&file_ptrs);

375

reinit_io_cache(file, internal::WRITE_CACHE, 0L, 0, 1);

376

}

377

elements= 0;

378

}

379

380

381

The comparison function, passed to queue_init() in merge_walk() and in

382

merge_buffers() when the latter is called from Uniques::get() must

383

use comparison function of Uniques::tree, but compare members of struct

384

buffpek_st.

385

386

387

static int buffpek_compare(void *arg, unsigned char *key_ptr1, unsigned char *key_ptr2)

388

{

389

BUFFPEK_COMPARE_CONTEXT *ctx= (BUFFPEK_COMPARE_CONTEXT *) arg;

390

return ctx->key_compare(ctx->key_compare_arg,

391

*((unsigned char **) key_ptr1), *((unsigned char **)key_ptr2));

392

}

393

394

395

The comparison function object, passed to a priority_queue in merge_walk()

396

as its sort function parameter.

397

398

399

class buffpek_compare_functor

400

{

401

qsort_cmp2 key_compare;

402

void *key_compare_arg;

403

public:

404

buffpek_compare_functor(qsort_cmp2 in_key_compare, void *in_compare_arg)

405

: key_compare(in_key_compare), key_compare_arg(in_compare_arg) { }

406

inline bool operator()(const buffpek_st *i, const buffpek_st *j)

407

{

408

return key_compare(key_compare_arg,

409

i->key, j->key);

410

}

411

};

412

413

414

DESCRIPTION

415

416

Function is very similar to merge_buffers, but instead of writing sorted

417

unique keys to the output file, it invokes walk_action for each key.

418

This saves I/O if you need to pass through all unique keys only once.

419

420

SYNOPSIS

421

merge_walk()

422

All params are 'IN' (but see comment for begin, end):

423

merge_buffer buffer to perform cached piece-by-piece loading

424

of trees; initially the buffer is empty

425

merge_buffer_size size of merge_buffer. Must be aligned with

426

key_length

427

key_length size of tree element; key_length * (end - begin)

428

must be less or equal than merge_buffer_size.

429

begin pointer to buffpek_st struct for the first tree.

430

end pointer to buffpek_st struct for the last tree;

431

end > begin and [begin, end) form a consecutive

432

range. buffpek_sts structs in that range are used and

433

overwritten in merge_walk().

434

walk_action element visitor. Action is called for each unique

435

key.

436

walk_action_arg argument to walk action. Passed to it on each call.

437

compare elements comparison function

438

compare_arg comparison function argument

439

file file with all trees dumped. Trees in the file

440

must contain sorted unique values. Cache must be

441

initialized in read mode.

442

RETURN VALUE

443

0 ok

444

<> 0 error

445

446

447

static bool merge_walk(unsigned char *merge_buffer, ulong merge_buffer_size,

448

uint32_t key_length, buffpek_st *begin, buffpek_st *end,

449

tree_walk_action walk_action, void *walk_action_arg,

450

qsort_cmp2 compare, void *compare_arg,

451

internal::IO_CACHE *file)

452

{

453

if (end <= begin ||

454

merge_buffer_size < (ulong) (key_length * (end - begin + 1)))

455

return 1;

456

priority_queue<buffpek_st *, vector<buffpek_st *>, buffpek_compare_functor >

457

queue(buffpek_compare_functor(compare, compare_arg));

458

/* we need space for one key when a piece of merge buffer is re-read */

459

merge_buffer_size-= key_length;

460

unsigned char *save_key_buff= merge_buffer + merge_buffer_size;

461

uint32_t max_key_count_per_piece= (uint32_t) (merge_buffer_size/(end-begin) /

462

key_length);

463

/* if piece_size is aligned reuse_freed_buffer will always hit */

464

uint32_t piece_size= max_key_count_per_piece * key_length;

465

uint32_t bytes_read; /* to hold return value of read_to_buffer */

466

buffpek_st *top;

467

int res= 1;

468

469

Invariant: queue must contain top element from each tree, until a tree

470

is not completely walked through.

471

Here we're forcing the invariant, inserting one element from each tree

472

to the queue.

473

474

for (top= begin; top != end; ++top)

475

{

476

top->base= merge_buffer + (top - begin) * piece_size;

477

top->max_keys= max_key_count_per_piece;

478

bytes_read= read_to_buffer(file, top, key_length);

479

if (bytes_read == (uint32_t) (-1))

480

goto end;

481

assert(bytes_read);

482

queue.push(top);

483

}

484

top= queue.top();

485

while (queue.size() > 1)

486

{

487

488

Every iteration one element is removed from the queue, and one is

489

inserted by the rules of the invariant. If two adjacent elements on

490

the top of the queue are not equal, biggest one is unique, because all

491

elements in each tree are unique. Action is applied only to unique

492

elements.

493

494

void *old_key= top->key;

495

496

read next key from the cache or from the file and push it to the

497

queue; this gives new top.

498

499

top->key+= key_length;

500

if (--top->mem_count)

501

{

502

queue.pop();

503

queue.push(top);

504

}

505

else /* next piece should be read */

506

{

507

/* save old_key not to overwrite it in read_to_buffer */

508

memcpy(save_key_buff, old_key, key_length);

509

old_key= save_key_buff;

510

bytes_read= read_to_buffer(file, top, key_length);

511

if (bytes_read == (uint32_t) (-1))

512

goto end;

513

else if (bytes_read > 0) /* top->key, top->mem_count are reset */

514

{ /* in read_to_buffer */

515

queue.pop();

516

queue.push(top);

517

}

518

else

519

{

520

521

Tree for old 'top' element is empty: remove it from the queue.

522

523

queue.pop();

524

}

525

}

526

top= queue.top();

527

/* new top has been obtained; if old top is unique, apply the action */

528

if (compare(compare_arg, old_key, top->key))

529

{

530

if (walk_action(old_key, 1, walk_action_arg))

531

goto end;

532

}

533

}

534

535

Applying walk_action to the tail of the last tree: this is safe because

536

either we had only one tree in the beginning, either we work with the

537

last tree in the queue.

538

539

540

{

541

542

{

543

if (walk_action(top->key, 1, walk_action_arg))

544

goto end;

545

top->key+= key_length;

546

}

547

while (--top->mem_count);

548

bytes_read= read_to_buffer(file, top, key_length);

549

if (bytes_read == (uint32_t) (-1))

550

goto end;

551

}

552

while (bytes_read);

553

res= 0;

554

end:

555

return res;

174

assert(elements == 0);

556

175

}

557

176

558

177

578

197

579

198

bool Unique::walk(tree_walk_action action, void *walk_action_arg)

580

199

{

581

int res;

582

std::vector<unsigned char> merge_buffer;

583

584

if (elements == 0) /* the whole tree is in memory */

585

return tree_walk(&tree, action, walk_action_arg, left_root_right);

586

587

/* flush current tree to the file to have some memory for merge buffer */

588

if (flush())

589

return true;

590

591

if (flush_io_cache(file) || reinit_io_cache(file, internal::READ_CACHE, 0L, 0, 0))

592

return true;

593

594

try

595

{

596

merge_buffer.resize(max_in_memory_size);

597

}

598

catch (std::bad_alloc const&)

599

{

600

return true;

601

}

602

603

res= merge_walk(&merge_buffer[0], (ulong) max_in_memory_size, size,

604

(buffpek_st *) file_ptrs.buffer,

605

(buffpek_st *) file_ptrs.buffer + file_ptrs.elements,

606

action, walk_action_arg,

607

tree.compare, tree.custom_arg, file);

608

609

return res;

200

return tree_walk(&tree, action, walk_action_arg, left_root_right);

610

201

}

611

202

612

203

616

207

617

208

bool Unique::get(Table *table)

618

209

{

619

SORTPARAM sort_param;

620

table->sort.found_records=elements+tree.elements_in_tree;

210

table->sort.found_records= elements+tree.elements_in_tree;

621

211

622

if (my_b_tell(file) == 0)

212

if ((record_pointers=table->sort.record_pointers= (unsigned char*)

213

malloc(size * tree.elements_in_tree)))

623

214

{

624

/* Whole tree is in memory; Don't use disk if you don't need to */

625

if ((record_pointers=table->sort.record_pointers= (unsigned char*)

626

malloc(size * tree.elements_in_tree)))

627

{

628

(void) tree_walk(&tree, (tree_walk_action) unique_write_to_ptrs,

629

this, left_root_right);

630

return 0;

631

}

215

(void) tree_walk(&tree, (tree_walk_action) unique_write_to_ptrs,

216

this, left_root_right);

217

return 0;

632

218

}

633

/* Not enough memory; Save the result to file && free memory used by tree */

634

if (flush())

635

return 1;

636

637

internal::IO_CACHE *outfile=table->sort.io_cache;

638

buffpek_st *file_ptr= (buffpek_st*) file_ptrs.buffer;

639

uint32_t maxbuffer= file_ptrs.elements - 1;

640

unsigned char *sort_buffer;

641

internal::my_off_t save_pos;

642

bool error=1;

643

644

/* Open cached file if it isn't open */

645

outfile=table->sort.io_cache= new internal::IO_CACHE;

646

647

if (!outfile || (! my_b_inited(outfile) && open_cached_file(outfile, drizzle_tmpdir.c_str(),TEMP_PREFIX,READ_RECORD_BUFFER, MYF(MY_WME))))

648

return 1;

649

reinit_io_cache(outfile, internal::WRITE_CACHE, 0L, 0, 0);

650

651

memset(&sort_param, 0, sizeof(sort_param));

652

sort_param.max_rows= elements;

653

sort_param.sort_form=table;

654

sort_param.rec_length= sort_param.sort_length= sort_param.ref_length=

655

size;

656

sort_param.keys= (uint32_t) (max_in_memory_size / sort_param.sort_length);

657

sort_param.not_killable=1;

658

659

if (!(sort_buffer=(unsigned char*) malloc((sort_param.keys+1) *

660

sort_param.sort_length)))

661

return 1;

662

sort_param.unique_buff= sort_buffer+(sort_param.keys*

663

sort_param.sort_length);

664

665

sort_param.compare= (qsort2_cmp) buffpek_compare;

666

sort_param.cmp_context.key_compare= tree.compare;

667

sort_param.cmp_context.key_compare_arg= tree.custom_arg;

668

669

/* Merge the buffers to one file, removing duplicates */

670

if (merge_many_buff(&sort_param,sort_buffer,file_ptr,&maxbuffer,file))

671

goto err;

672

if (flush_io_cache(file) ||

673

reinit_io_cache(file,internal::READ_CACHE,0L,0,0))

674

goto err;

675

if (merge_buffers(&sort_param, file, outfile, sort_buffer, file_ptr,

676

file_ptr, file_ptr+maxbuffer,0))

677

goto err;

678

error=0;

679

err:

680

if (sort_buffer)

681

free(sort_buffer);

682

if (flush_io_cache(outfile))

683

error=1;

684

685

/* Setup io_cache for reading */

686

save_pos=outfile->pos_in_file;

687

if (reinit_io_cache(outfile,internal::READ_CACHE,0L,0,0))

688

error=1;

689

outfile->end_of_file=save_pos;

690

return error;

219

/* Not enough memory */

220

return 1;

691

221

}

692

222

693

223

} /* namespace drizzled */

Older »