23
23
Handler-calling-functions
30
#include "drizzled/my_hash.h"
26
#include "drizzled/server_includes.h"
27
#include "mysys/hash.h"
31
28
#include "drizzled/error.h"
32
29
#include "drizzled/gettext.h"
30
#include "drizzled/data_home.h"
33
31
#include "drizzled/probes.h"
34
32
#include "drizzled/sql_parse.h"
35
#include "drizzled/optimizer/cost_vector.h"
33
#include "drizzled/cost_vect.h"
36
34
#include "drizzled/session.h"
37
35
#include "drizzled/sql_base.h"
38
36
#include "drizzled/transaction_services.h"
39
37
#include "drizzled/lock.h"
40
38
#include "drizzled/item/int.h"
41
39
#include "drizzled/item/empty_string.h"
40
#include "drizzled/unireg.h" // for mysql_frm_type
42
41
#include "drizzled/field/timestamp.h"
43
42
#include "drizzled/message/table.pb.h"
44
#include "drizzled/plugin/client.h"
45
#include "drizzled/internal/my_sys.h"
46
#include "drizzled/plugin/event_observer.h"
48
44
using namespace std;
46
extern drizzled::TransactionServices transaction_services;
48
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NULL,0}, {NULL,0} };
50
/* number of entries in storage_engines[] */
52
/* number of storage engines (from storage_engines[]) that support 2pc */
53
uint32_t total_ha_2pc= 0;
54
/* size of savepoint storage area (see ha_init) */
55
uint32_t savepoint_alloc_size= 0;
57
const char *ha_row_type[] = {
58
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
61
const char *tx_isolation_names[] =
62
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
65
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
66
tx_isolation_names, NULL};
70
Register handler error messages for use with my_error().
78
int ha_init_errors(void)
80
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
83
/* Allocate a pointer array for the error message strings. */
84
/* Zerofill it to avoid uninitialized gaps. */
85
if (! (errmsgs= (const char**) malloc(HA_ERR_ERRORS * sizeof(char*))))
87
memset(errmsgs, 0, HA_ERR_ERRORS * sizeof(char *));
89
/* Set the dedicated error messages. */
90
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
91
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
92
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
93
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
94
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
95
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
96
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
97
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
98
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
99
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
100
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
101
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
102
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
103
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
104
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
105
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
106
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
107
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
108
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
109
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
110
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
111
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
112
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
113
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
114
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
115
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
116
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
117
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
118
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
119
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
120
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
121
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
122
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
123
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
124
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
125
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
126
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
127
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
128
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
129
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
130
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
132
/* Register the error messages for use with my_error(). */
133
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
138
Unregister handler error messages.
145
static int ha_finish_errors(void)
147
const char **errmsgs;
149
/* Allocate a pointer array for the error message strings. */
150
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
152
free((unsigned char*) errmsgs);
160
assert(total_ha < MAX_HA);
162
Check if there is a transaction-capable storage engine besides the
163
binary log (which is considered a transaction-capable storage engine in
166
savepoint_alloc_size+= sizeof(SAVEPOINT);
175
This should eventually be based on the graceful shutdown flag.
176
So if flag is equal to HA_PANIC_CLOSE, the deallocate
179
if (ha_finish_errors())
187
/* ========================================================================
188
======================= TRANSACTIONS ===================================*/
191
Transaction handling in the server
192
==================================
194
In each client connection, MySQL maintains two transactional
196
- a statement transaction,
197
- a standard, also called normal transaction.
201
"Statement transaction" is a non-standard term that comes
202
from the times when MySQL supported BerkeleyDB storage engine.
204
First of all, it should be said that in BerkeleyDB auto-commit
205
mode auto-commits operations that are atomic to the storage
206
engine itself, such as a write of a record, and are too
207
high-granular to be atomic from the application perspective
208
(MySQL). One SQL statement could involve many BerkeleyDB
209
auto-committed operations and thus BerkeleyDB auto-commit was of
212
Secondly, instead of SQL standard savepoints, BerkeleyDB
213
provided the concept of "nested transactions". In a nutshell,
214
transactions could be arbitrarily nested, but when the parent
215
transaction was committed or aborted, all its child (nested)
216
transactions were committed or aborted as well.
217
Commit of a nested transaction, in turn, made its changes
218
visible, but not durable: it destroyed the nested transaction,
219
all its changes would become available to the parent and
220
currently active nested transactions of this parent.
222
So the mechanism of nested transactions was employed to
223
provide "all or nothing" guarantee of SQL statements
224
required by the standard.
225
A nested transaction would be created at start of each SQL
226
statement, and destroyed (committed or aborted) at statement
227
end. Such nested transaction was internally referred to as
228
a "statement transaction" and gave birth to the term.
230
<Historical note ends>
232
Since then a statement transaction is started for each statement
233
that accesses transactional tables or uses the binary log. If
234
the statement succeeds, the statement transaction is committed.
235
If the statement fails, the transaction is rolled back. Commits
236
of statement transactions are not durable -- each such
237
transaction is nested in the normal transaction, and if the
238
normal transaction is rolled back, the effects of all enclosed
239
statement transactions are undone as well. Technically,
240
a statement transaction can be viewed as a savepoint which is
241
maintained automatically in order to make effects of one
244
The normal transaction is started by the user and is ended
245
usually upon a user request as well. The normal transaction
246
encloses transactions of all statements issued between
247
its beginning and its end.
248
In autocommit mode, the normal transaction is equivalent
249
to the statement transaction.
251
Since MySQL supports PSEA (pluggable storage engine
252
architecture), more than one transactional engine can be
253
active at a time. Hence transactions, from the server
254
point of view, are always distributed. In particular,
255
transactional state is maintained independently for each
256
engine. In order to commit a transaction the two phase
257
commit protocol is employed.
259
Not all statements are executed in context of a transaction.
260
Administrative and status information statements do not modify
261
engine data, and thus do not start a statement transaction and
262
also have no effect on the normal transaction. Examples of such
263
statements are SHOW STATUS and RESET SLAVE.
265
Similarly DDL statements are not transactional,
266
and therefore a transaction is [almost] never started for a DDL
267
statement. The difference between a DDL statement and a purely
268
administrative statement though is that a DDL statement always
269
commits the current transaction before proceeding, if there is
272
At last, SQL statements that work with non-transactional
273
engines also have no effect on the transaction state of the
274
connection. Even though they are written to the binary log,
275
and the binary log is, overall, transactional, the writes
276
are done in "write-through" mode, directly to the binlog
277
file, followed by an OS cache sync, in other words,
278
bypassing the binlog undo log (translog).
279
They do not commit the current normal transaction.
280
A failure of a statement that uses non-transactional tables
281
would cause a rollback of the statement transaction, but
282
in case there no non-transactional tables are used,
283
no statement transaction is started.
288
The server stores its transaction-related data in
289
session->transaction. This structure has two members of type
290
Session_TRANS. These members correspond to the statement and
291
normal transactions respectively:
293
- session->transaction.stmt contains a list of engines
294
that are participating in the given statement
295
- session->transaction.all contains a list of engines that
296
have participated in any of the statement transactions started
297
within the context of the normal transaction.
298
Each element of the list contains a pointer to the storage
299
engine, engine-specific transactional data, and engine-specific
302
In autocommit mode session->transaction.all is empty.
303
Instead, data of session->transaction.stmt is
304
used to commit/rollback the normal transaction.
306
The list of registered engines has a few important properties:
307
- no engine is registered in the list twice
308
- engines are present in the list in reverse temporal order --
309
new participants are always added to the beginning of the list.
311
Transaction life cycle
312
----------------------
314
When a new connection is established, session->transaction
315
members are initialized to an empty state.
316
If a statement uses any tables, all affected engines
317
are registered in the statement engine list. In
318
non-autocommit mode, the same engines are registered in
319
the normal transaction list.
320
At the end of the statement, the server issues a commit
321
or a roll back for all engines in the statement list.
322
At this point transaction flags of an engine, if any, are
323
propagated from the statement list to the list of the normal
325
When commit/rollback is finished, the statement list is
326
cleared. It will be filled in again by the next statement,
327
and emptied again at the next statement's end.
329
The normal transaction is committed in a similar way
330
(by going over all engines in session->transaction.all list)
331
but at different times:
332
- when a COMMIT SQL statement is issued by the user
333
- implicitly, by the server, at the beginning of a DDL statement
334
or SET AUTOCOMMIT={0|1} statement.
336
The normal transaction can be rolled back as well:
337
- if the user has requested so, by issuing ROLLBACK SQL
339
- if one of the storage engines requested a rollback
340
by setting session->transaction_rollback_request. This may
341
happen in case, e.g., when the transaction in the engine was
342
chosen as a victim of the internal deadlock resolution algorithm
343
and rolled back internally. When such a situation happens, there
344
is little the server can do and the only option is to rollback
345
transactions in all other participating engines. In this case
346
the rollback is accompanied by an error sent to the user.
348
As follows from the use cases above, the normal transaction
349
is never committed when there is an outstanding statement
350
transaction. In most cases there is no conflict, since
351
commits of the normal transaction are issued by a stand-alone
352
administrative or DDL statement, thus no outstanding statement
353
transaction of the previous statement exists. Besides,
354
all statements that manipulate the normal transaction
355
are prohibited in stored functions and triggers, therefore
356
no conflicting situation can occur in a sub-statement either.
357
The remaining rare cases when the server explicitly has
358
to commit the statement transaction prior to committing the normal
359
one cover error-handling scenarios (see for example
362
When committing a statement or a normal transaction, the server
363
either uses the two-phase commit protocol, or issues a commit
364
in each engine independently. The two-phase commit protocol
366
- all participating engines support two-phase commit (provide
367
StorageEngine::prepare PSEA API call) and
368
- transactions in at least two engines modify data (i.e. are
371
Note that the two phase commit is used for
372
statement transactions, even though they are not durable anyway.
373
This is done to ensure logical consistency of data in a multiple-
375
For example, imagine that some day MySQL supports unique
376
constraint checks deferred till the end of statement. In such
377
case a commit in one of the engines may yield ER_DUP_KEY,
378
and MySQL should be able to gracefully abort statement
379
transactions of other participants.
381
After the normal transaction has been committed,
382
session->transaction.all list is cleared.
384
When a connection is closed, the current normal transaction, if
387
Roles and responsibilities
388
--------------------------
390
The server has no way to know that an engine participates in
391
the statement and a transaction has been started
392
in it unless the engine says so. Thus, in order to be
393
a part of a transaction, the engine must "register" itself.
394
This is done by invoking trans_register_ha() server call.
395
Normally the engine registers itself whenever handler::external_lock()
396
is called. trans_register_ha() can be invoked many times: if
397
an engine is already registered, the call does nothing.
398
In case autocommit is not set, the engine must register itself
399
twice -- both in the statement list and in the normal transaction
401
In which list to register is a parameter of trans_register_ha().
403
Note that although the registration interface in itself is
404
fairly clear, the current usage practice often leads to undesired
405
effects. E.g. since a call to trans_register_ha() in most engines
406
is embedded into implementation of handler::external_lock(), some
407
DDL statements start a transaction (at least from the server
408
point of view) even though they are not expected to. E.g.
409
CREATE TABLE does not start a transaction, since
410
handler::external_lock() is never called during CREATE TABLE. But
411
CREATE TABLE ... SELECT does, since handler::external_lock() is
412
called for the table that is being selected from. This has no
413
practical effects currently, but must be kept in mind
416
Once an engine is registered, the server will do the rest
419
During statement execution, whenever any of data-modifying
420
PSEA API methods is used, e.g. handler::write_row() or
421
handler::update_row(), the read-write flag is raised in the
422
statement transaction for the involved engine.
423
Currently all PSEA calls are "traced", and the data cannot be
424
changed in a way other than issuing a PSEA call. Important:
425
unless this invariant is preserved the server will not know that
426
a transaction in a given engine is read-write and will not
427
involve the two-phase commit protocol!
429
At the end of a statement, server call
430
ha_autocommit_or_rollback() is invoked. This call in turn
431
invokes StorageEngine::prepare() for every involved engine.
432
Prepare is followed by a call to StorageEngine::commit_one_phase().
433
If a one-phase commit will suffice, StorageEngine::prepare() is not
434
invoked and the server only calls StorageEngine::commit_one_phase().
435
At statement commit, the statement-related read-write engine
436
flag is propagated to the corresponding flag in the normal
437
transaction. When the commit is complete, the list of registered
440
Rollback is handled in a similar fashion.
442
Additional notes on DDL and the normal transaction.
443
---------------------------------------------------
445
DDLs and operations with non-transactional engines
446
do not "register" in session->transaction lists, and thus do not
447
modify the transaction state. Besides, each DDL in
448
MySQL is prefixed with an implicit normal transaction commit
449
(a call to Session::endActiveTransaction()), and thus leaves nothing
451
However, as it has been pointed out with CREATE TABLE .. SELECT,
452
some DDL statements can start a *new* transaction.
454
Behaviour of the server in this case is currently badly
456
DDL statements use a form of "semantic" logging
457
to maintain atomicity: if CREATE TABLE .. SELECT failed,
458
the newly created table is deleted.
459
In addition, some DDL statements issue interim transaction
460
commits: e.g. ALTER Table issues a commit after data is copied
461
from the original table to the internal temporary table. Other
462
statements, e.g. CREATE TABLE ... SELECT do not always commit
464
And finally there is a group of DDL statements such as
465
RENAME/DROP Table that doesn't start a new transaction
468
This diversity makes it hard to say what will happen if
469
by chance a stored function is invoked during a DDL --
470
whether any modifications it makes will be committed or not
471
is not clear. Fortunately, the SQL grammar of only a few DDLs allows
472
invocation of a stored function.
474
A consistent behaviour is perhaps to always commit the normal
475
transaction after all DDLs, just like the statement transaction
476
is always committed at the end of all statements.
480
Register a storage engine for a transaction.
482
Every storage engine MUST call this function when it starts
483
a transaction or a statement (that is it must be called both for the
484
"beginning of transaction" and "beginning of statement").
485
Only storage engines registered for the transaction/statement
486
will know when to commit/rollback it.
489
trans_register_ha is idempotent - storage engine may register many
490
times per transaction.
493
void trans_register_ha(Session *session, bool all, StorageEngine *engine)
495
Session_TRANS *trans;
496
Ha_trx_info *ha_info;
500
trans= &session->transaction.all;
501
session->server_status|= SERVER_STATUS_IN_TRANS;
504
trans= &session->transaction.stmt;
506
ha_info= session->ha_data[engine->getSlot()].ha_info + static_cast<unsigned>(all);
508
if (ha_info->is_started())
509
return; /* already registered, return */
511
ha_info->register_ha(trans, engine);
513
trans->no_2pc|= not engine->has_2pc();
514
if (session->transaction.xid_state.xid.is_null())
515
session->transaction.xid_state.xid.set(session->query_id);
519
Check if we can skip the two-phase commit.
521
A helper function to evaluate if two-phase commit is mandatory.
522
As a side effect, propagates the read-only/read-write flags
523
of the statement transaction to its enclosing normal transaction.
525
@retval true we must run a two-phase commit. Returned
526
if we have at least two engines with read-write changes.
527
@retval false Don't need two-phase commit. Even if we have two
528
transactional engines, we can run two independent
529
commits if changes in one of the engines are read-only.
534
ha_check_and_coalesce_trx_read_only(Session *session, Ha_trx_info *ha_list,
537
/* The number of storage engines that have actual changes. */
538
unsigned rw_ha_count= 0;
539
Ha_trx_info *ha_info;
541
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
543
if (ha_info->is_trx_read_write())
548
Ha_trx_info *ha_info_all= &session->ha_data[ha_info->engine()->getSlot()].ha_info[1];
549
assert(ha_info != ha_info_all);
551
Merge read-only/read-write information about statement
552
transaction to its enclosing normal transaction. Do this
553
only if in a real transaction -- that is, if we know
554
that ha_info_all is registered in session->transaction.all.
555
Since otherwise we only clutter the normal transaction flags.
557
if (ha_info_all->is_started()) /* false if autocommit. */
558
ha_info_all->coalesce_trx_with(ha_info);
560
else if (rw_ha_count > 1)
563
It is a normal transaction, so we don't need to merge read/write
564
information up, and the need for two-phase commit has been
565
already established. Break the loop prematurely.
570
return rw_ha_count > 1;
578
1 transaction was rolled back
580
2 error during commit, data may be inconsistent
583
Since we don't support nested statement transactions in 5.0,
584
we can't commit or rollback stmt transactions while we are inside
585
stored functions or triggers. So we simply do nothing now.
586
TODO: This should be fixed in later ( >= 5.1) releases.
588
int ha_commit_trans(Session *session, bool all)
590
int error= 0, cookie= 0;
592
'all' means that this is either an explicit commit issued by
593
user, or an implicit commit issued by a DDL.
595
Session_TRANS *trans= all ? &session->transaction.all : &session->transaction.stmt;
596
bool is_real_trans= all || session->transaction.all.ha_list == 0;
597
Ha_trx_info *ha_info= trans->ha_list;
600
We must not commit the normal transaction if a statement
601
transaction is pending. Otherwise statement transaction
602
flags will not get propagated to its normal transaction's
605
assert(session->transaction.stmt.ha_list == NULL ||
606
trans == &session->transaction.stmt);
612
if (is_real_trans && wait_if_global_read_lock(session, 0, 0))
614
ha_rollback_trans(session, all);
618
must_2pc= ha_check_and_coalesce_trx_read_only(session, ha_info, all);
620
if (!trans->no_2pc && must_2pc)
622
for (; ha_info && !error; ha_info= ha_info->next())
625
StorageEngine *engine= ha_info->engine();
627
Do not call two-phase commit if this particular
628
transaction is read-only. This allows for simpler
629
implementation in engines that are always read-only.
631
if (! ha_info->is_trx_read_write())
634
Sic: we know that prepare() is not NULL since otherwise
635
trans->no_2pc would have been set.
637
if ((err= engine->prepare(session, all)))
639
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
642
status_var_increment(session->status_var.ha_prepare_count);
646
ha_rollback_trans(session, all);
651
error=ha_commit_one_phase(session, all) ? (cookie ? 2 : 1) : 0;
654
start_waiting_global_read_lock(session);
661
This function does not care about global read lock. A caller should.
663
int ha_commit_one_phase(Session *session, bool all)
666
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
667
bool is_real_trans=all || session->transaction.all.ha_list == 0;
668
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
671
for (; ha_info; ha_info= ha_info_next)
674
StorageEngine *engine= ha_info->engine();
675
if ((err= engine->commit(session, all)))
677
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
680
status_var_increment(session->status_var.ha_commit_count);
681
ha_info_next= ha_info->next();
682
ha_info->reset(); /* keep it conveniently zero-filled */
687
session->transaction.xid_state.xid.null();
690
session->variables.tx_isolation=session->session_tx_isolation;
691
session->transaction.cleanup();
698
int ha_rollback_trans(Session *session, bool all)
701
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
702
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
703
bool is_real_trans=all || session->transaction.all.ha_list == 0;
706
We must not rollback the normal transaction if a statement
707
transaction is pending.
709
assert(session->transaction.stmt.ha_list == NULL ||
710
trans == &session->transaction.stmt);
714
for (; ha_info; ha_info= ha_info_next)
717
StorageEngine *engine= ha_info->engine();
718
if ((err= engine->rollback(session, all)))
720
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
723
status_var_increment(session->status_var.ha_rollback_count);
724
ha_info_next= ha_info->next();
725
ha_info->reset(); /* keep it conveniently zero-filled */
730
session->transaction.xid_state.xid.null();
733
session->variables.tx_isolation=session->session_tx_isolation;
734
session->transaction.cleanup();
738
session->transaction_rollback_request= false;
741
If a non-transactional table was updated, warn; don't warn if this is a
742
slave thread (because when a slave thread executes a ROLLBACK, it has
743
been read from the binary log, so it's 100% sure and normal to produce
744
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
745
slave SQL thread, it would not stop the thread but just be printed in
746
the error log; but we don't want users to wonder why they have this
747
message in the error log, so we don't send it.
749
if (is_real_trans && session->transaction.all.modified_non_trans_table && session->killed != Session::KILL_CONNECTION)
750
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
751
ER_WARNING_NOT_COMPLETE_ROLLBACK,
752
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
757
This is used to commit or rollback a single statement depending on
761
Note that if the autocommit is on, then the following call inside
762
InnoDB will commit or rollback the whole transaction (= the statement). The
763
autocommit mechanism built into InnoDB is based on counting locks, but if
764
the user has used LOCK TABLES then that mechanism does not know to do the
767
int ha_autocommit_or_rollback(Session *session, int error)
769
if (session->transaction.stmt.ha_list)
773
if (ha_commit_trans(session, 0))
778
(void) ha_rollback_trans(session, 0);
779
if (session->transaction_rollback_request)
780
(void) ha_rollback(session);
783
session->variables.tx_isolation=session->session_tx_isolation;
793
return the list of XID's to a client, the same way SHOW commands do.
796
I didn't find in XA specs that an RM cannot return the same XID twice,
797
so mysql_xa_recover does not filter XID's to ensure uniqueness.
798
It can be easily fixed later, if necessary.
800
bool mysql_xa_recover(Session *session)
802
List<Item> field_list;
803
Protocol *protocol= session->protocol;
807
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
808
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
809
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
810
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
812
if (protocol->sendFields(&field_list,
813
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
816
pthread_mutex_lock(&LOCK_xid_cache);
817
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
819
if (xs->xa_state==XA_PREPARED)
821
protocol->prepareForResend();
822
protocol->store((int64_t)xs->xid.formatID);
823
protocol->store((int64_t)xs->xid.gtrid_length);
824
protocol->store((int64_t)xs->xid.bqual_length);
825
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length);
826
if (protocol->write())
828
pthread_mutex_unlock(&LOCK_xid_cache);
834
pthread_mutex_unlock(&LOCK_xid_cache);
840
int ha_rollback_to_savepoint(Session *session, SAVEPOINT *sv)
843
Session_TRANS *trans= &session->transaction.all;
844
Ha_trx_info *ha_info, *ha_info_next;
848
rolling back to savepoint in all storage engines that were part of the
849
transaction when the savepoint was set
851
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
854
StorageEngine *engine= ha_info->engine();
856
if ((err= engine->savepoint_rollback(session,
859
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
862
status_var_increment(session->status_var.ha_savepoint_rollback_count);
863
trans->no_2pc|= not engine->has_2pc();
866
rolling back the transaction in all storage engines that were not part of
867
the transaction when the savepoint was set
869
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
870
ha_info= ha_info_next)
873
StorageEngine *engine= ha_info->engine();
874
if ((err= engine->rollback(session, !(0))))
876
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
879
status_var_increment(session->status_var.ha_rollback_count);
880
ha_info_next= ha_info->next();
881
ha_info->reset(); /* keep it conveniently zero-filled */
883
trans->ha_list= sv->ha_list;
889
according to the sql standard (ISO/IEC 9075-2:2003)
890
section "4.33.4 SQL-statements and transaction states",
891
SAVEPOINT is *not* transaction-initiating SQL-statement
893
int ha_savepoint(Session *session, SAVEPOINT *sv)
896
Session_TRANS *trans= &session->transaction.all;
897
Ha_trx_info *ha_info= trans->ha_list;
898
for (; ha_info; ha_info= ha_info->next())
901
StorageEngine *engine= ha_info->engine();
903
#ifdef NOT_IMPLEMENTED /*- TODO (examine this againt the original code base) */
904
if (! engine->savepoint_set)
906
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
911
if ((err= engine->savepoint_set(session, (void *)(sv+1))))
913
my_error(ER_GET_ERRNO, MYF(0), err);
916
status_var_increment(session->status_var.ha_savepoint_count);
919
Remember the list of registered storage engines. All new
920
engines are prepended to the beginning of the list.
922
sv->ha_list= trans->ha_list;
926
int ha_release_savepoint(Session *session, SAVEPOINT *sv)
929
Ha_trx_info *ha_info= sv->ha_list;
931
for (; ha_info; ha_info= ha_info->next())
934
StorageEngine *engine= ha_info->engine();
935
/* Savepoint life time is enclosed into transaction life time. */
937
if ((err= engine->savepoint_release(session,
940
my_error(ER_GET_ERRNO, MYF(0), err);
53
951
/****************************************************************************
54
** General Cursor functions
952
** General handler functions
55
953
****************************************************************************/
56
Cursor::Cursor(plugin::StorageEngine &engine_arg,
60
estimation_rows_to_insert(0),
62
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
63
ref_length(sizeof(internal::my_off_t)),
66
next_insert_id(0), insert_id_for_cur_row(0)
954
handler::~handler(void)
71
956
assert(locked == false);
72
957
/* TODO: assert(inited == NONE); */
77
* @note this only used in
78
* optimizer::QuickRangeSelect::init_ror_merged_scan(bool reuse_handler) as
79
* of the writing of this comment. -Brian
81
Cursor *Cursor::clone(memory::Root *mem_root)
961
handler *handler::clone(MEM_ROOT *mem_root)
83
Cursor *new_handler= getTable()->getMutableShare()->db_type()->getCursor(*getTable());
963
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
86
Allocate Cursor->ref here because otherwise ha_open will allocate it
965
Allocate handler->ref here because otherwise ha_open will allocate it
87
966
on this->table->mem_root and we will not be able to reclaim that memory
88
when the clone Cursor object is destroyed.
967
when the clone handler object is destroyed.
90
if (!(new_handler->ref= (unsigned char*) mem_root->alloc_root(ALIGN_SIZE(ref_length)*2)))
969
if (!(new_handler->ref= (unsigned char*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
93
TableIdentifier identifier(getTable()->getShare()->getSchemaName(),
94
getTable()->getShare()->getTableName(),
95
getTable()->getShare()->getType());
97
if (new_handler && !new_handler->ha_open(identifier,
98
getTable()->getDBStat(),
971
if (new_handler && !new_handler->ha_open(table,
972
table->s->normalized_path.str,
99
974
HA_OPEN_IGNORE_IF_LOCKED))
100
975
return new_handler;
106
given a buffer with a key value, and a map of keyparts
107
that are present in this value, returns the length of the value
109
uint32_t Cursor::calculate_key_len(uint32_t key_position, key_part_map keypart_map_arg)
111
/* works only with key prefixes */
112
assert(((keypart_map_arg + 1) & keypart_map_arg) == 0);
114
const KeyPartInfo *key_part_found= getTable()->getShare()->getKeyInfo(key_position).key_part;
115
const KeyPartInfo *end_key_part_found= key_part_found + getTable()->getShare()->getKeyInfo(key_position).key_parts;
118
while (key_part_found < end_key_part_found && keypart_map_arg)
120
length+= key_part_found->store_length;
121
keypart_map_arg >>= 1;
127
int Cursor::startIndexScan(uint32_t idx, bool sorted)
979
int handler::ha_index_init(uint32_t idx, bool sorted)
130
982
assert(inited == NONE);
131
if (!(result= doStartIndexScan(idx, sorted)))
983
if (!(result= index_init(idx, sorted)))
137
int Cursor::endIndexScan()
989
int handler::ha_index_end()
139
991
assert(inited==INDEX);
142
return(doEndIndexScan());
145
int Cursor::startTableScan(bool scan)
997
int handler::ha_rnd_init(bool scan)
148
1000
assert(inited==NONE || (inited==RND && scan));
149
inited= (result= doStartTableScan(scan)) ? NONE: RND;
1001
inited= (result= rnd_init(scan)) ? NONE: RND;
154
int Cursor::endTableScan()
1006
int handler::ha_rnd_end()
156
1008
assert(inited==RND);
158
return(doEndTableScan());
161
int Cursor::ha_index_or_rnd_end()
163
return inited == INDEX ? endIndexScan() : inited == RND ? endTableScan() : 0;
166
void Cursor::ha_start_bulk_insert(ha_rows rows)
1013
int handler::ha_index_or_rnd_end()
1015
return inited == INDEX ? ha_index_end() : inited == RND ? ha_rnd_end() : 0;
1018
handler::Table_flags handler::ha_table_flags() const
1020
return cached_table_flags;
1023
void handler::ha_start_bulk_insert(ha_rows rows)
168
1025
estimation_rows_to_insert= rows;
169
1026
start_bulk_insert(rows);
172
int Cursor::ha_end_bulk_insert()
1029
int handler::ha_end_bulk_insert()
174
1031
estimation_rows_to_insert= 0;
175
1032
return end_bulk_insert();
178
const key_map *Cursor::keys_to_use_for_scanning()
1035
void handler::change_table_ptr(Table *table_arg, TableShare *share)
1041
const key_map *handler::keys_to_use_for_scanning()
180
1043
return &key_map_empty;
183
bool Cursor::has_transactions()
185
return (getTable()->getShare()->db_type()->check_flag(HTON_BIT_DOES_TRANSACTIONS));
188
void Cursor::ha_statistic_increment(uint64_t system_status_var::*offset) const
190
(getTable()->in_use->status_var.*offset)++;
193
void **Cursor::ha_data(Session *session) const
195
return session->getEngineData(getEngine());
198
bool Cursor::is_fatal_error(int error, uint32_t flags)
1046
bool handler::has_transactions()
1048
return (ha_table_flags() & HA_NO_TRANSACTIONS) == 0;
1051
void handler::ha_statistic_increment(ulong SSV::*offset) const
1053
status_var_increment(table->in_use->status_var.*offset);
1056
void **handler::ha_data(Session *session) const
1058
return session_ha_data(session, engine);
1061
Session *handler::ha_session(void) const
1063
assert(!table || !table->in_use || table->in_use == current_session);
1064
return (table && table->in_use) ? table->in_use : current_session;
1068
bool handler::is_fatal_error(int error, uint32_t flags)
201
1071
((flags & HA_CHECK_DUP_KEY) &&
1149
2535
/* **************************************************************************
2536
* DS-MRR implementation
2537
***************************************************************************/
2540
DS-MRR: Initialize and start MRR scan
2542
Initialize and start the MRR scan. Depending on the mode parameter, this
2543
may use default or DS-MRR implementation.
2545
@param h Table handler to be used
2546
@param key Index to be used
2547
@param seq_funcs Interval sequence enumeration functions
2548
@param seq_init_param Interval sequence enumeration parameter
2549
@param n_ranges Number of ranges in the sequence.
2550
@param mode HA_MRR_* modes to use
2551
@param buf INOUT Buffer to use
2553
@retval 0 Ok, Scan started.
2557
int DsMrr_impl::dsmrr_init(handler *h_in, KEY *key,
2558
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
2559
uint32_t n_ranges, uint32_t mode, HANDLER_BUFFER *buf)
2563
Item *pushed_cond= NULL;
2565
keyno= h_in->active_index;
2567
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
2569
use_default_impl= true;
2570
return(h_in->handler::multi_range_read_init(seq_funcs, seq_init_param,
2571
n_ranges, mode, buf));
2573
rowids_buf= buf->buffer;
2574
//psergey-todo: don't add key_length as it is not needed anymore
2575
rowids_buf += key->key_length + h_in->ref_length;
2577
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
2578
rowids_buf_end= buf->buffer_end;
2580
elem_size= h_in->ref_length + (int)is_mrr_assoc * sizeof(void*);
2581
rowids_buf_last= rowids_buf +
2582
((rowids_buf_end - rowids_buf)/ elem_size)*
2584
rowids_buf_end= rowids_buf_last;
2586
/* Create a separate handler object to do rndpos() calls. */
2587
Session *session= current_session;
2588
if (!(new_h2= h_in->clone(session->mem_root)) ||
2589
new_h2->ha_external_lock(session, F_RDLCK))
2595
if (keyno == h_in->pushed_idx_cond_keyno)
2596
pushed_cond= h_in->pushed_idx_cond;
2597
if (h_in->ha_index_end())
2604
table->prepare_for_position();
2605
new_h2->extra(HA_EXTRA_KEYREAD);
2607
if (h2->ha_index_init(keyno, false) ||
2608
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
2611
use_default_impl= false;
2614
h2->idx_cond_push(keyno, pushed_cond);
2615
if (dsmrr_fill_buffer(new_h2))
2619
If the above call has scanned through all intervals in *seq, then
2620
adjust *buf to indicate that the remaining buffer space will not be used.
2623
buf->end_of_used_area= rowids_buf_last;
2625
if (h_in->ha_rnd_init(false))
2630
h2->ha_index_or_rnd_end();
2631
h2->ha_external_lock(session, F_UNLCK);
2638
void DsMrr_impl::dsmrr_close()
2642
h2->ha_external_lock(current_session, F_UNLCK);
2647
use_default_impl= true;
2651
static int rowid_cmp(void *h, unsigned char *a, unsigned char *b)
2653
return ((handler*)h)->cmp_ref(a, b);
2658
DS-MRR: Fill the buffer with rowids and sort it by rowid
2660
{This is an internal function of DiskSweep MRR implementation}
2661
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
2662
buffer. When the buffer is full or scan is completed, sort the buffer by
2665
The function assumes that rowids buffer is empty when it is invoked.
2667
@param h Table handler
2669
@retval 0 OK, the next portion of rowids is in the buffer,
2674
int DsMrr_impl::dsmrr_fill_buffer(handler *)
2679
rowids_buf_cur= rowids_buf;
2680
while ((rowids_buf_cur < rowids_buf_end) &&
2681
!(res= h2->handler::multi_range_read_next(&range_info)))
2683
/* Put rowid, or {rowid, range_id} pair into the buffer */
2684
h2->position(table->record[0]);
2685
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
2686
rowids_buf_cur += h->ref_length;
2690
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
2691
rowids_buf_cur += sizeof(void*);
2695
if (res && res != HA_ERR_END_OF_FILE)
2697
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
2699
/* Sort the buffer contents by rowid */
2700
uint32_t elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
2701
uint32_t n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
2703
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
2705
rowids_buf_last= rowids_buf_cur;
2706
rowids_buf_cur= rowids_buf;
2712
DS-MRR implementation: multi_range_read_next() function
2715
int DsMrr_impl::dsmrr_next(handler *h_in, char **range_info)
2719
if (use_default_impl)
2720
return h_in->handler::multi_range_read_next(range_info);
2722
if (rowids_buf_cur == rowids_buf_last)
2726
res= HA_ERR_END_OF_FILE;
2729
res= dsmrr_fill_buffer(h);
2734
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
2735
if (rowids_buf_cur == rowids_buf_last)
2737
res= HA_ERR_END_OF_FILE;
2741
res= h_in->rnd_pos(table->record[0], rowids_buf_cur);
2742
rowids_buf_cur += h_in->ref_length;
2745
memcpy(range_info, rowids_buf_cur, sizeof(void*));
2746
rowids_buf_cur += sizeof(void*);
2757
DS-MRR implementation: multi_range_read_info() function
2759
int DsMrr_impl::dsmrr_info(uint32_t keyno, uint32_t n_ranges, uint32_t rows, uint32_t *bufsz,
2760
uint32_t *flags, COST_VECT *cost)
2763
uint32_t def_flags= *flags;
2764
uint32_t def_bufsz= *bufsz;
2766
/* Get cost/flags/mem_usage of default MRR implementation */
2767
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
2771
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
2772
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
2774
/* Default implementation is chosen */
2783
DS-MRR Implementation: multi_range_read_info_const() function
2786
ha_rows DsMrr_impl::dsmrr_info_const(uint32_t keyno, RANGE_SEQ_IF *seq,
2787
void *seq_init_param, uint32_t n_ranges,
2788
uint32_t *bufsz, uint32_t *flags, COST_VECT *cost)
2791
uint32_t def_flags= *flags;
2792
uint32_t def_bufsz= *bufsz;
2793
/* Get cost/flags/mem_usage of default MRR implementation */
2794
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
2795
n_ranges, &def_bufsz,
2797
if (rows == HA_POS_ERROR)
2799
/* Default implementation can't perform MRR scan => we can't either */
2804
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
2805
use the default MRR implementation (we need it for UPDATE/DELETE).
2806
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
2808
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
2809
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
2816
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
2823
Check if key has partially-covered columns
2825
We can't use DS-MRR to perform range scans when the ranges are over
2826
partially-covered keys, because we'll not have full key part values
2827
(we'll have their prefixes from the index) and will not be able to check
2828
if we've reached the end of the range.
2830
@param keyno Key to check
2833
Allow use of DS-MRR in cases where the index has partially-covered
2834
components but they are not used for scanning.
2840
bool DsMrr_impl::key_uses_partial_cols(uint32_t keyno)
2842
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
2843
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
2844
for (; kp != kp_end; kp++)
2846
if (!kp->field->part_of_key.test(keyno))
2854
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
2856
Make the choice between using Default MRR implementation and DS-MRR.
2857
This function contains common functionality factored out of dsmrr_info()
2858
and dsmrr_info_const(). The function assumes that the default MRR
2859
implementation's applicability requirements are satisfied.
2861
@param keyno Index number
2862
@param rows E(full rows to be retrieved)
2863
@param flags IN MRR flags provided by the MRR user
2864
OUT If DS-MRR is chosen, flags of DS-MRR implementation
2865
else the value is not modified
2866
@param bufsz IN If DS-MRR is chosen, buffer use of DS-MRR implementation
2867
else the value is not modified
2868
@param cost IN Cost of default MRR implementation
2869
OUT If DS-MRR is chosen, cost of DS-MRR scan
2870
else the value is not modified
2872
@retval true Default MRR implementation should be used
2873
@retval false DS-MRR implementation should be used
2876
bool DsMrr_impl::choose_mrr_impl(uint32_t keyno, ha_rows rows, uint32_t *flags,
2877
uint32_t *bufsz, COST_VECT *cost)
2879
COST_VECT dsmrr_cost;
2881
Session *session= current_session;
2882
if ((session->variables.optimizer_use_mrr == 2) ||
2883
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
2884
(keyno == table->s->primary_key &&
2885
h->primary_key_is_clustered()) ||
2886
key_uses_partial_cols(keyno))
2888
/* Use the default implementation */
2889
*flags |= HA_MRR_USE_DEFAULT_IMPL;
2893
uint32_t add_len= table->key_info[keyno].key_length + h->ref_length;
2895
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
2901
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
2902
DS-MRR and Default implementations cost. This allows one to force use of
2903
DS-MRR whenever it is applicable without affecting other cost-based
2906
if ((force_dsmrr= (session->variables.optimizer_use_mrr == 1)) &&
2907
dsmrr_cost.total_cost() > cost->total_cost())
2910
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
2912
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
2913
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
2919
/* Use the default MRR implementation */
2926
static void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost);
2930
Get cost of DS-MRR scan
2932
@param keynr Index to be used
2933
@param rows E(Number of rows to be scanned)
2934
@param flags Scan parameters (HA_MRR_* flags)
2935
@param buffer_size INOUT Buffer size
2936
@param cost OUT The cost
2939
@retval true Error, DS-MRR cannot be used (the buffer is too small
2943
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint32_t keynr, ha_rows rows, uint32_t flags,
2944
uint32_t *buffer_size, COST_VECT *cost)
2946
uint32_t max_buff_entries, elem_size;
2947
ha_rows rows_in_full_step, rows_in_last_step;
2948
uint32_t n_full_steps;
2949
double index_read_cost;
2951
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
2952
max_buff_entries = *buffer_size / elem_size;
2954
if (!max_buff_entries)
2955
return true; /* Buffer has not enough space for even 1 rowid */
2957
/* Number of iterations we'll make with full buffer */
2958
n_full_steps= (uint32_t)floor(rows2double(rows) / max_buff_entries);
2961
Get numbers of rows we'll be processing in
2962
- non-last sweep, with full buffer
2963
- last iteration, with non-full buffer
2965
rows_in_full_step= max_buff_entries;
2966
rows_in_last_step= rows % max_buff_entries;
2968
/* Adjust buffer size if we expect to use only part of the buffer */
2971
get_sort_and_sweep_cost(table, rows, cost);
2972
cost->multiply(n_full_steps);
2977
*buffer_size= cmax((ulong)*buffer_size,
2978
(size_t)(1.2*rows_in_last_step) * elem_size +
2979
h->ref_length + table->key_info[keynr].key_length);
2982
COST_VECT last_step_cost;
2983
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
2984
cost->add(&last_step_cost);
2986
if (n_full_steps != 0)
2987
cost->mem_cost= *buffer_size;
2989
cost->mem_cost= (double)rows_in_last_step * elem_size;
2991
/* Total cost of all index accesses */
2992
index_read_cost= h->index_only_read_time(keynr, (double)rows);
2993
cost->add_io(index_read_cost, 1 /* Random seeks */);
2999
Get cost of one sort-and-sweep step
3002
get_sort_and_sweep_cost()
3003
table Table being accessed
3004
nrows Number of rows to be sorted and retrieved
3008
Get cost of these operations:
3009
- sort an array of #nrows ROWIDs using qsort
3010
- read #nrows records from table in a sweep.
3014
void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost)
3018
get_sweep_read_cost(table, nrows, false, cost);
3019
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
3020
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
3023
cost->cpu_cost += cmp_op * log2(cmp_op);
3031
Get cost of reading nrows table records in a "disk sweep"
3033
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made
3034
for an ordered sequence of rowids.
3036
We assume hard disk IO. The read is performed as follows:
3038
1. The disk head is moved to the needed cylinder
3039
2. The controller waits for the plate to rotate
3040
3. The data is transferred
3042
Time to do #3 is insignificant compared to #2+#1.
3044
Time to move the disk head is proportional to head travel distance.
3046
Time to wait for the plate to rotate depends on whether the disk head
3049
If disk head wasn't moved, the wait time is proportional to distance
3050
between the previous block and the block we're reading.
3052
If the head was moved, we don't know how much we'll need to wait for the
3053
plate to rotate. We assume the wait time to be a variate with a mean of
3054
0.5 of full rotation time.
3056
Our cost units are "random disk seeks". The cost of random disk seek is
3057
actually not a constant, it depends on the range of cylinders we're going
3058
to access. We make it constant by introducing a fuzzy concept of "typical
3059
datafile length" (it's fuzzy as it's hard to tell whether it should
3060
include index file, temp.tables etc). Then random seek cost is:
3062
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
3064
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
3066
@param table Table to be accessed
3067
@param nrows Number of rows to retrieve
3068
@param interrupted true <=> Assume that the disk sweep will be
3069
interrupted by other disk IO. false - otherwise.
3070
@param cost OUT The cost.
3073
void get_sweep_read_cost(Table *table, ha_rows nrows, bool interrupted,
3077
if (table->file->primary_key_is_clustered())
3079
cost->io_count= table->file->read_time(table->s->primary_key,
3080
(uint32_t) nrows, nrows);
3085
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
3087
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
3088
if (busy_blocks < 1.0)
3091
cost->io_count= busy_blocks;
3095
/* Assume reading is done in one 'sweep' */
3096
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
3097
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);
3103
/* **************************************************************************
1150
3104
* DS-MRR implementation ends
1151
3105
***************************************************************************/