23
23
Handler-calling-functions
26
#include "drizzled/server_includes.h"
27
#include "mysys/hash.h"
30
28
#include "drizzled/error.h"
31
#include "drizzled/field/epoch.h"
32
29
#include "drizzled/gettext.h"
33
#include "drizzled/internal/my_sys.h"
34
#include "drizzled/item/empty_string.h"
35
#include "drizzled/item/int.h"
36
#include "drizzled/lock.h"
37
#include "drizzled/message/table.h"
38
#include "drizzled/my_hash.h"
39
#include "drizzled/optimizer/cost_vector.h"
40
#include "drizzled/plugin/client.h"
41
#include "drizzled/plugin/event_observer.h"
42
#include "drizzled/plugin/storage_engine.h"
43
30
#include "drizzled/probes.h"
31
#include "drizzled/sql_parse.h"
32
#include "drizzled/cost_vect.h"
44
33
#include "drizzled/session.h"
45
34
#include "drizzled/sql_base.h"
46
#include "drizzled/sql_parse.h"
47
35
#include "drizzled/transaction_services.h"
36
#include "drizzled/lock.h"
37
#include "drizzled/item/int.h"
38
#include "drizzled/item/empty_string.h"
39
#include "drizzled/unireg.h" // for mysql_frm_type
40
#include "drizzled/field/timestamp.h"
41
#include "drizzled/message/table.pb.h"
49
43
using namespace std;
45
extern drizzled::TransactionServices transaction_services;
47
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NULL,0}, {NULL,0} };
49
/* number of entries in storage_engines[] */
51
/* number of storage engines (from storage_engines[]) that support 2pc */
52
uint32_t total_ha_2pc= 0;
53
/* size of savepoint storage area (see ha_init) */
54
uint32_t savepoint_alloc_size= 0;
56
const char *ha_row_type[] = {
57
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
60
const char *tx_isolation_names[] =
61
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
64
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
65
tx_isolation_names, NULL};
69
Register handler error messages for use with my_error().
77
int ha_init_errors(void)
79
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
82
/* Allocate a pointer array for the error message strings. */
83
/* Zerofill it to avoid uninitialized gaps. */
84
if (! (errmsgs= (const char**) malloc(HA_ERR_ERRORS * sizeof(char*))))
86
memset(errmsgs, 0, HA_ERR_ERRORS * sizeof(char *));
88
/* Set the dedicated error messages. */
89
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
90
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
91
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
92
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
93
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
94
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
95
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
96
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
97
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
98
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
99
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
100
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
101
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
102
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
103
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
104
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
105
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
106
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
107
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
108
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
109
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
110
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
111
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
112
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
113
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
114
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
115
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
116
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
117
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
118
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
119
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
120
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
121
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
122
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
123
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
124
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
125
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
126
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
127
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
128
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
129
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
131
/* Register the error messages for use with my_error(). */
132
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
137
Unregister handler error messages.
144
static int ha_finish_errors(void)
146
const char **errmsgs;
148
/* Unregister the handler error messages and get back the pointer array that held them. */
149
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
151
free((unsigned char*) errmsgs);
159
assert(total_ha < MAX_HA);
161
Check if there is a transaction-capable storage engine besides the
162
binary log (which is considered a transaction-capable storage engine in
165
savepoint_alloc_size+= sizeof(SAVEPOINT);
174
This should eventually be based on the graceful shutdown flag.
175
So if the flag is equal to HA_PANIC_CLOSE, then deallocate
178
if (ha_finish_errors())
186
/* ========================================================================
187
======================= TRANSACTIONS ===================================*/
190
Transaction handling in the server
191
==================================
193
In each client connection, MySQL maintains two transactional
195
- a statement transaction,
196
- a standard, also called normal transaction.
200
"Statement transaction" is a non-standard term that comes
201
from the times when MySQL supported BerkeleyDB storage engine.
203
First of all, it should be said that in BerkeleyDB auto-commit
204
mode auto-commits operations that are atomic to the storage
205
engine itself, such as a write of a record, and are too
206
high-granular to be atomic from the application perspective
207
(MySQL). One SQL statement could involve many BerkeleyDB
208
auto-committed operations and thus BerkeleyDB auto-commit was of
211
Secondly, instead of SQL standard savepoints, BerkeleyDB
212
provided the concept of "nested transactions". In a nutshell,
213
transactions could be arbitrarily nested, but when the parent
214
transaction was committed or aborted, all its child (nested)
215
transactions were committed or aborted as well.
216
Commit of a nested transaction, in turn, made its changes
217
visible, but not durable: it destroyed the nested transaction,
218
all its changes would become available to the parent and
219
currently active nested transactions of this parent.
221
So the mechanism of nested transactions was employed to
222
provide "all or nothing" guarantee of SQL statements
223
required by the standard.
224
A nested transaction would be created at start of each SQL
225
statement, and destroyed (committed or aborted) at statement
226
end. Such nested transaction was internally referred to as
227
a "statement transaction" and gave birth to the term.
229
<Historical note ends>
231
Since then a statement transaction is started for each statement
232
that accesses transactional tables or uses the binary log. If
233
the statement succeeds, the statement transaction is committed.
234
If the statement fails, the transaction is rolled back. Commits
235
of statement transactions are not durable -- each such
236
transaction is nested in the normal transaction, and if the
237
normal transaction is rolled back, the effects of all enclosed
238
statement transactions are undone as well. Technically,
239
a statement transaction can be viewed as a savepoint which is
240
maintained automatically in order to make effects of one
243
The normal transaction is started by the user and is ended
244
usually upon a user request as well. The normal transaction
245
encloses transactions of all statements issued between
246
its beginning and its end.
247
In autocommit mode, the normal transaction is equivalent
248
to the statement transaction.
250
Since MySQL supports PSEA (pluggable storage engine
251
architecture), more than one transactional engine can be
252
active at a time. Hence transactions, from the server
253
point of view, are always distributed. In particular,
254
transactional state is maintained independently for each
255
engine. In order to commit a transaction the two phase
256
commit protocol is employed.
258
Not all statements are executed in context of a transaction.
259
Administrative and status information statements do not modify
260
engine data, and thus do not start a statement transaction and
261
also have no effect on the normal transaction. Examples of such
262
statements are SHOW STATUS and RESET SLAVE.
264
Similarly DDL statements are not transactional,
265
and therefore a transaction is [almost] never started for a DDL
266
statement. The difference between a DDL statement and a purely
267
administrative statement though is that a DDL statement always
268
commits the current transaction before proceeding, if there is
271
At last, SQL statements that work with non-transactional
272
engines also have no effect on the transaction state of the
273
connection. Even though they are written to the binary log,
274
and the binary log is, overall, transactional, the writes
275
are done in "write-through" mode, directly to the binlog
276
file, followed by an OS cache sync, in other words,
277
bypassing the binlog undo log (translog).
278
They do not commit the current normal transaction.
279
A failure of a statement that uses non-transactional tables
280
would cause a rollback of the statement transaction, but
281
in case only non-transactional tables are used,
282
no statement transaction is started.
287
The server stores its transaction-related data in
288
session->transaction. This structure has two members of type
289
Session_TRANS. These members correspond to the statement and
290
normal transactions respectively:
292
- session->transaction.stmt contains a list of engines
293
that are participating in the given statement
294
- session->transaction.all contains a list of engines that
295
have participated in any of the statement transactions started
296
within the context of the normal transaction.
297
Each element of the list contains a pointer to the storage
298
engine, engine-specific transactional data, and engine-specific
301
In autocommit mode session->transaction.all is empty.
302
Instead, data of session->transaction.stmt is
303
used to commit/rollback the normal transaction.
305
The list of registered engines has a few important properties:
306
- no engine is registered in the list twice
307
- engines are present in the list in reverse temporal order --
308
new participants are always added to the beginning of the list.
310
Transaction life cycle
311
----------------------
313
When a new connection is established, session->transaction
314
members are initialized to an empty state.
315
If a statement uses any tables, all affected engines
316
are registered in the statement engine list. In
317
non-autocommit mode, the same engines are registered in
318
the normal transaction list.
319
At the end of the statement, the server issues a commit
320
or a roll back for all engines in the statement list.
321
At this point transaction flags of an engine, if any, are
322
propagated from the statement list to the list of the normal
324
When commit/rollback is finished, the statement list is
325
cleared. It will be filled in again by the next statement,
326
and emptied again at the next statement's end.
328
The normal transaction is committed in a similar way
329
(by going over all engines in session->transaction.all list)
330
but at different times:
331
- when a COMMIT SQL statement is issued by the user
332
- implicitly, by the server, at the beginning of a DDL statement
333
or SET AUTOCOMMIT={0|1} statement.
335
The normal transaction can be rolled back as well:
336
- if the user has requested so, by issuing ROLLBACK SQL
338
- if one of the storage engines requested a rollback
339
by setting session->transaction_rollback_request. This may
340
happen, e.g., when the transaction in the engine was
341
chosen as a victim of the internal deadlock resolution algorithm
342
and rolled back internally. When such a situation happens, there
343
is little the server can do and the only option is to rollback
344
transactions in all other participating engines. In this case
345
the rollback is accompanied by an error sent to the user.
347
As follows from the use cases above, the normal transaction
348
is never committed when there is an outstanding statement
349
transaction. In most cases there is no conflict, since
350
commits of the normal transaction are issued by a stand-alone
351
administrative or DDL statement, thus no outstanding statement
352
transaction of the previous statement exists. Besides,
353
all statements that manipulate the normal transaction
354
are prohibited in stored functions and triggers, therefore
355
no conflicting situation can occur in a sub-statement either.
356
The remaining rare cases when the server explicitly has
357
to commit the statement transaction prior to committing the normal
358
one cover error-handling scenarios (see for example
361
When committing a statement or a normal transaction, the server
362
either uses the two-phase commit protocol, or issues a commit
363
in each engine independently. The two-phase commit protocol
365
- all participating engines support two-phase commit (provide
366
StorageEngine::prepare PSEA API call) and
367
- transactions in at least two engines modify data (i.e. are
370
Note that the two phase commit is used for
371
statement transactions, even though they are not durable anyway.
372
This is done to ensure logical consistency of data in a multiple-
374
For example, imagine that some day MySQL supports unique
375
constraint checks deferred till the end of statement. In such
376
case a commit in one of the engines may yield ER_DUP_KEY,
377
and MySQL should be able to gracefully abort statement
378
transactions of other participants.
380
After the normal transaction has been committed,
381
session->transaction.all list is cleared.
383
When a connection is closed, the current normal transaction, if
386
Roles and responsibilities
387
--------------------------
389
The server has no way to know that an engine participates in
390
the statement and a transaction has been started
391
in it unless the engine says so. Thus, in order to be
392
a part of a transaction, the engine must "register" itself.
393
This is done by invoking trans_register_ha() server call.
394
Normally the engine registers itself whenever handler::external_lock()
395
is called. trans_register_ha() can be invoked many times: if
396
an engine is already registered, the call does nothing.
397
In case autocommit is not set, the engine must register itself
398
twice -- both in the statement list and in the normal transaction
400
In which list to register is a parameter of trans_register_ha().
402
Note, that although the registration interface in itself is
403
fairly clear, the current usage practice often leads to undesired
404
effects. E.g. since a call to trans_register_ha() in most engines
405
is embedded into implementation of handler::external_lock(), some
406
DDL statements start a transaction (at least from the server
407
point of view) even though they are not expected to. E.g.
408
CREATE TABLE does not start a transaction, since
409
handler::external_lock() is never called during CREATE TABLE. But
410
CREATE TABLE ... SELECT does, since handler::external_lock() is
411
called for the table that is being selected from. This has no
412
practical effects currently, but must be kept in mind
415
Once an engine is registered, the server will do the rest
418
During statement execution, whenever any of data-modifying
419
PSEA API methods is used, e.g. handler::write_row() or
420
handler::update_row(), the read-write flag is raised in the
421
statement transaction for the involved engine.
422
Currently all PSEA calls are "traced", and the data cannot be
423
changed in a way other than issuing a PSEA call. Important:
424
unless this invariant is preserved the server will not know that
425
a transaction in a given engine is read-write and will not
426
involve the two-phase commit protocol!
428
At the end of a statement, server call
429
ha_autocommit_or_rollback() is invoked. This call in turn
430
invokes StorageEngine::prepare() for every involved engine.
431
Prepare is followed by a call to StorageEngine::commit_one_phase().
432
If a one-phase commit will suffice, StorageEngine::prepare() is not
433
invoked and the server only calls StorageEngine::commit_one_phase().
434
At statement commit, the statement-related read-write engine
435
flag is propagated to the corresponding flag in the normal
436
transaction. When the commit is complete, the list of registered
439
Rollback is handled in a similar fashion.
441
Additional notes on DDL and the normal transaction.
442
---------------------------------------------------
444
DDLs and operations with non-transactional engines
445
do not "register" in session->transaction lists, and thus do not
446
modify the transaction state. Besides, each DDL in
447
MySQL is prefixed with an implicit normal transaction commit
448
(a call to Session::endActiveTransaction()), and thus leaves nothing
450
However, as it has been pointed out with CREATE TABLE .. SELECT,
451
some DDL statements can start a *new* transaction.
453
Behaviour of the server in this case is currently badly
455
DDL statements use a form of "semantic" logging
456
to maintain atomicity: if CREATE TABLE .. SELECT failed,
457
the newly created table is deleted.
458
In addition, some DDL statements issue interim transaction
459
commits: e.g. ALTER Table issues a commit after data is copied
460
from the original table to the internal temporary table. Other
461
statements, e.g. CREATE TABLE ... SELECT do not always commit
463
And finally there is a group of DDL statements such as
464
RENAME/DROP Table that doesn't start a new transaction
467
This diversity makes it hard to say what will happen if
468
by chance a stored function is invoked during a DDL --
469
whether any modifications it makes will be committed or not
470
is not clear. Fortunately, the SQL grammar of only a few DDLs allows
471
invocation of a stored function.
473
A consistent behaviour is perhaps to always commit the normal
474
transaction after all DDLs, just like the statement transaction
475
is always committed at the end of all statements.
479
Register a storage engine for a transaction.
481
Every storage engine MUST call this function when it starts
482
a transaction or a statement (that is it must be called both for the
483
"beginning of transaction" and "beginning of statement").
484
Only storage engines registered for the transaction/statement
485
will know when to commit/rollback it.
488
trans_register_ha is idempotent - storage engine may register many
489
times per transaction.
492
void trans_register_ha(Session *session, bool all, StorageEngine *engine)
494
Session_TRANS *trans;
495
Ha_trx_info *ha_info;
499
trans= &session->transaction.all;
500
session->server_status|= SERVER_STATUS_IN_TRANS;
503
trans= &session->transaction.stmt;
505
ha_info= session->ha_data[engine->getSlot()].ha_info + static_cast<unsigned>(all);
507
if (ha_info->is_started())
508
return; /* already registered, return */
510
ha_info->register_ha(trans, engine);
512
trans->no_2pc|= not engine->has_2pc();
513
if (session->transaction.xid_state.xid.is_null())
514
session->transaction.xid_state.xid.set(session->query_id);
518
Check if we can skip the two-phase commit.
520
A helper function to evaluate if two-phase commit is mandatory.
521
As a side effect, propagates the read-only/read-write flags
522
of the statement transaction to its enclosing normal transaction.
524
@retval true we must run a two-phase commit. Returned
525
if we have at least two engines with read-write changes.
526
@retval false Don't need two-phase commit. Even if we have two
527
transactional engines, we can run two independent
528
commits if changes in one of the engines are read-only.
533
ha_check_and_coalesce_trx_read_only(Session *session, Ha_trx_info *ha_list,
536
/* The number of storage engines that have actual changes. */
537
unsigned rw_ha_count= 0;
538
Ha_trx_info *ha_info;
540
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
542
if (ha_info->is_trx_read_write())
547
Ha_trx_info *ha_info_all= &session->ha_data[ha_info->engine()->getSlot()].ha_info[1];
548
assert(ha_info != ha_info_all);
550
Merge read-only/read-write information about statement
551
transaction to its enclosing normal transaction. Do this
552
only if in a real transaction -- that is, if we know
553
that ha_info_all is registered in session->transaction.all.
554
Since otherwise we only clutter the normal transaction flags.
556
if (ha_info_all->is_started()) /* false if autocommit. */
557
ha_info_all->coalesce_trx_with(ha_info);
559
else if (rw_ha_count > 1)
562
It is a normal transaction, so we don't need to merge read/write
563
information up, and the need for two-phase commit has been
564
already established. Break the loop prematurely.
569
return rw_ha_count > 1;
577
1 transaction was rolled back
579
2 error during commit, data may be inconsistent
582
Since we don't support nested statement transactions in 5.0,
583
we can't commit or rollback stmt transactions while we are inside
584
stored functions or triggers. So we simply do nothing now.
585
TODO: This should be fixed in later ( >= 5.1) releases.
587
int ha_commit_trans(Session *session, bool all)
589
int error= 0, cookie= 0;
591
'all' means that this is either an explicit commit issued by
592
user, or an implicit commit issued by a DDL.
594
Session_TRANS *trans= all ? &session->transaction.all : &session->transaction.stmt;
595
bool is_real_trans= all || session->transaction.all.ha_list == 0;
596
Ha_trx_info *ha_info= trans->ha_list;
599
We must not commit the normal transaction if a statement
600
transaction is pending. Otherwise statement transaction
601
flags will not get propagated to its normal transaction's
604
assert(session->transaction.stmt.ha_list == NULL ||
605
trans == &session->transaction.stmt);
611
if (is_real_trans && wait_if_global_read_lock(session, 0, 0))
613
ha_rollback_trans(session, all);
617
must_2pc= ha_check_and_coalesce_trx_read_only(session, ha_info, all);
619
if (!trans->no_2pc && must_2pc)
621
for (; ha_info && !error; ha_info= ha_info->next())
624
StorageEngine *engine= ha_info->engine();
626
Do not call two-phase commit if this particular
627
transaction is read-only. This allows for simpler
628
implementation in engines that are always read-only.
630
if (! ha_info->is_trx_read_write())
633
Sic: we know that prepare() is not NULL since otherwise
634
trans->no_2pc would have been set.
636
if ((err= engine->prepare(session, all)))
638
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
641
status_var_increment(session->status_var.ha_prepare_count);
645
ha_rollback_trans(session, all);
650
error=ha_commit_one_phase(session, all) ? (cookie ? 2 : 1) : 0;
653
start_waiting_global_read_lock(session);
660
This function does not care about global read lock. A caller should.
662
int ha_commit_one_phase(Session *session, bool all)
665
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
666
bool is_real_trans=all || session->transaction.all.ha_list == 0;
667
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
670
for (; ha_info; ha_info= ha_info_next)
673
StorageEngine *engine= ha_info->engine();
674
if ((err= engine->commit(session, all)))
676
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
679
status_var_increment(session->status_var.ha_commit_count);
680
ha_info_next= ha_info->next();
681
ha_info->reset(); /* keep it conveniently zero-filled */
686
session->transaction.xid_state.xid.null();
689
session->variables.tx_isolation=session->session_tx_isolation;
690
session->transaction.cleanup();
697
int ha_rollback_trans(Session *session, bool all)
700
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
701
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
702
bool is_real_trans=all || session->transaction.all.ha_list == 0;
705
We must not rollback the normal transaction if a statement
706
transaction is pending.
708
assert(session->transaction.stmt.ha_list == NULL ||
709
trans == &session->transaction.stmt);
713
for (; ha_info; ha_info= ha_info_next)
716
StorageEngine *engine= ha_info->engine();
717
if ((err= engine->rollback(session, all)))
719
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
722
status_var_increment(session->status_var.ha_rollback_count);
723
ha_info_next= ha_info->next();
724
ha_info->reset(); /* keep it conveniently zero-filled */
729
session->transaction.xid_state.xid.null();
732
session->variables.tx_isolation=session->session_tx_isolation;
733
session->transaction.cleanup();
737
session->transaction_rollback_request= false;
740
If a non-transactional table was updated, warn; don't warn if this is a
741
slave thread (because when a slave thread executes a ROLLBACK, it has
742
been read from the binary log, so it's 100% sure and normal to produce
743
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
744
slave SQL thread, it would not stop the thread but just be printed in
745
the error log; but we don't want users to wonder why they have this
746
message in the error log, so we don't send it.
748
if (is_real_trans && session->transaction.all.modified_non_trans_table && session->killed != Session::KILL_CONNECTION)
749
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
750
ER_WARNING_NOT_COMPLETE_ROLLBACK,
751
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
756
This is used to commit or rollback a single statement depending on
760
Note that if the autocommit is on, then the following call inside
761
InnoDB will commit or rollback the whole transaction (= the statement). The
762
autocommit mechanism built into InnoDB is based on counting locks, but if
763
the user has used LOCK TABLES then that mechanism does not know to do the
766
int ha_autocommit_or_rollback(Session *session, int error)
768
if (session->transaction.stmt.ha_list)
772
if (ha_commit_trans(session, 0))
777
(void) ha_rollback_trans(session, 0);
778
if (session->transaction_rollback_request)
779
(void) ha_rollback(session);
782
session->variables.tx_isolation=session->session_tx_isolation;
792
return the list of XID's to a client, the same way SHOW commands do.
795
I didn't find in XA specs that an RM cannot return the same XID twice,
796
so mysql_xa_recover does not filter XID's to ensure uniqueness.
797
It can be easily fixed later, if necessary.
799
bool mysql_xa_recover(Session *session)
801
List<Item> field_list;
802
Protocol *protocol= session->protocol;
806
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
807
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
808
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
809
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
811
if (protocol->sendFields(&field_list,
812
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
815
pthread_mutex_lock(&LOCK_xid_cache);
816
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
818
if (xs->xa_state==XA_PREPARED)
820
protocol->prepareForResend();
821
protocol->store((int64_t)xs->xid.formatID);
822
protocol->store((int64_t)xs->xid.gtrid_length);
823
protocol->store((int64_t)xs->xid.bqual_length);
824
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length);
825
if (protocol->write())
827
pthread_mutex_unlock(&LOCK_xid_cache);
833
pthread_mutex_unlock(&LOCK_xid_cache);
839
int ha_rollback_to_savepoint(Session *session, SAVEPOINT *sv)
842
Session_TRANS *trans= &session->transaction.all;
843
Ha_trx_info *ha_info, *ha_info_next;
847
rolling back to savepoint in all storage engines that were part of the
848
transaction when the savepoint was set
850
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
853
StorageEngine *engine= ha_info->engine();
855
if ((err= engine->savepoint_rollback(session,
858
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
861
status_var_increment(session->status_var.ha_savepoint_rollback_count);
862
trans->no_2pc|= not engine->has_2pc();
865
rolling back the transaction in all storage engines that were not part of
866
the transaction when the savepoint was set
868
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
869
ha_info= ha_info_next)
872
StorageEngine *engine= ha_info->engine();
873
if ((err= engine->rollback(session, !(0))))
875
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
878
status_var_increment(session->status_var.ha_rollback_count);
879
ha_info_next= ha_info->next();
880
ha_info->reset(); /* keep it conveniently zero-filled */
882
trans->ha_list= sv->ha_list;
888
according to the sql standard (ISO/IEC 9075-2:2003)
889
section "4.33.4 SQL-statements and transaction states",
890
SAVEPOINT is *not* transaction-initiating SQL-statement
892
int ha_savepoint(Session *session, SAVEPOINT *sv)
895
Session_TRANS *trans= &session->transaction.all;
896
Ha_trx_info *ha_info= trans->ha_list;
897
for (; ha_info; ha_info= ha_info->next())
900
StorageEngine *engine= ha_info->engine();
902
#ifdef NOT_IMPLEMENTED /*- TODO (examine this againt the original code base) */
903
if (! engine->savepoint_set)
905
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
910
if ((err= engine->savepoint_set(session, (void *)(sv+1))))
912
my_error(ER_GET_ERRNO, MYF(0), err);
915
status_var_increment(session->status_var.ha_savepoint_count);
918
Remember the list of registered storage engines. All new
919
engines are prepended to the beginning of the list.
921
sv->ha_list= trans->ha_list;
925
int ha_release_savepoint(Session *session, SAVEPOINT *sv)
928
Ha_trx_info *ha_info= sv->ha_list;
930
for (; ha_info; ha_info= ha_info->next())
933
StorageEngine *engine= ha_info->engine();
934
/* Savepoint life time is enclosed into transaction life time. */
936
if ((err= engine->savepoint_release(session,
939
my_error(ER_GET_ERRNO, MYF(0), err);
54
950
/****************************************************************************
55
** General Cursor functions
951
** General handler functions
56
952
****************************************************************************/
57
Cursor::Cursor(plugin::StorageEngine &engine_arg,
61
estimation_rows_to_insert(0),
63
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
64
ref_length(sizeof(internal::my_off_t)),
67
next_insert_id(0), insert_id_for_cur_row(0)
953
handler::~handler(void)
72
955
assert(locked == false);
73
956
/* TODO: assert(inited == NONE); */
78
* @note this is only used in
79
* optimizer::QuickRangeSelect::init_ror_merged_scan(bool reuse_handler) as
80
* of the writing of this comment. -Brian
82
Cursor *Cursor::clone(memory::Root *mem_root)
960
handler *handler::clone(MEM_ROOT *mem_root)
84
Cursor *new_handler= getTable()->getMutableShare()->db_type()->getCursor(*getTable());
962
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
87
Allocate Cursor->ref here because otherwise ha_open will allocate it
964
Allocate handler->ref here because otherwise ha_open will allocate it
88
965
on this->table->mem_root and we will not be able to reclaim that memory
89
when the clone Cursor object is destroyed.
966
when the clone handler object is destroyed.
91
if (!(new_handler->ref= (unsigned char*) mem_root->alloc_root(ALIGN_SIZE(ref_length)*2)))
968
if (!(new_handler->ref= (unsigned char*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
94
identifier::Table identifier(getTable()->getShare()->getSchemaName(),
95
getTable()->getShare()->getTableName(),
96
getTable()->getShare()->getType());
98
if (new_handler && !new_handler->ha_open(identifier,
99
getTable()->getDBStat(),
970
if (new_handler && !new_handler->ha_open(table,
971
table->s->normalized_path.str,
100
973
HA_OPEN_IGNORE_IF_LOCKED))
101
974
return new_handler;
107
given a buffer with a key value, and a map of keyparts
108
that are present in this value, returns the length of the value
110
uint32_t Cursor::calculate_key_len(uint32_t key_position, key_part_map keypart_map_arg)
112
/* works only with key prefixes */
113
assert(((keypart_map_arg + 1) & keypart_map_arg) == 0);
115
const KeyPartInfo *key_part_found= getTable()->getShare()->getKeyInfo(key_position).key_part;
116
const KeyPartInfo *end_key_part_found= key_part_found + getTable()->getShare()->getKeyInfo(key_position).key_parts;
119
while (key_part_found < end_key_part_found && keypart_map_arg)
121
length+= key_part_found->store_length;
122
keypart_map_arg >>= 1;
128
int Cursor::startIndexScan(uint32_t idx, bool sorted)
978
int handler::ha_index_init(uint32_t idx, bool sorted)
131
981
assert(inited == NONE);
132
if (!(result= doStartIndexScan(idx, sorted)))
982
if (!(result= index_init(idx, sorted)))
138
int Cursor::endIndexScan()
988
int handler::ha_index_end()
140
990
assert(inited==INDEX);
143
return(doEndIndexScan());
146
int Cursor::startTableScan(bool scan)
996
int handler::ha_rnd_init(bool scan)
149
999
assert(inited==NONE || (inited==RND && scan));
150
inited= (result= doStartTableScan(scan)) ? NONE: RND;
1000
inited= (result= rnd_init(scan)) ? NONE: RND;
155
int Cursor::endTableScan()
1005
int handler::ha_rnd_end()
157
1007
assert(inited==RND);
159
return(doEndTableScan());
162
int Cursor::ha_index_or_rnd_end()
164
return inited == INDEX ? endIndexScan() : inited == RND ? endTableScan() : 0;
167
void Cursor::ha_start_bulk_insert(ha_rows rows)
1012
int handler::ha_index_or_rnd_end()
1014
return inited == INDEX ? ha_index_end() : inited == RND ? ha_rnd_end() : 0;
1017
handler::Table_flags handler::ha_table_flags() const
1019
return cached_table_flags;
1022
void handler::ha_start_bulk_insert(ha_rows rows)
169
1024
estimation_rows_to_insert= rows;
170
1025
start_bulk_insert(rows);
173
int Cursor::ha_end_bulk_insert()
1028
int handler::ha_end_bulk_insert()
175
1030
estimation_rows_to_insert= 0;
176
1031
return end_bulk_insert();
179
const key_map *Cursor::keys_to_use_for_scanning()
1034
void handler::change_table_ptr(Table *table_arg, TableShare *share)
1040
const key_map *handler::keys_to_use_for_scanning()
181
1042
return &key_map_empty;
184
bool Cursor::has_transactions()
186
return (getTable()->getShare()->db_type()->check_flag(HTON_BIT_DOES_TRANSACTIONS));
189
void Cursor::ha_statistic_increment(uint64_t system_status_var::*offset) const
191
(getTable()->in_use->status_var.*offset)++;
194
void **Cursor::ha_data(Session *session) const
196
return session->getEngineData(getEngine());
199
bool Cursor::is_fatal_error(int error, uint32_t flags)
1045
bool handler::has_transactions()
1047
return (ha_table_flags() & HA_NO_TRANSACTIONS) == 0;
1050
void handler::ha_statistic_increment(ulong SSV::*offset) const
1052
status_var_increment(table->in_use->status_var.*offset);
1055
void **handler::ha_data(Session *session) const
1057
return session_ha_data(session, engine);
1060
Session *handler::ha_session(void) const
1062
assert(!table || !table->in_use || table->in_use == current_session);
1063
return (table && table->in_use) ? table->in_use : current_session;
1067
bool handler::is_fatal_error(int error, uint32_t flags)
202
1070
((flags & HA_CHECK_DUP_KEY) &&
1156
2498
/* **************************************************************************
2499
* DS-MRR implementation
2500
***************************************************************************/
2503
DS-MRR: Initialize and start MRR scan
2505
Initialize and start the MRR scan. Depending on the mode parameter, this
2506
may use default or DS-MRR implementation.
2508
@param h Table handler to be used
2509
@param key Index to be used
2510
@param seq_funcs Interval sequence enumeration functions
2511
@param seq_init_param Interval sequence enumeration parameter
2512
@param n_ranges Number of ranges in the sequence.
2513
@param mode HA_MRR_* modes to use
2514
@param buf INOUT Buffer to use
2516
@retval 0 Ok, Scan started.
2520
int DsMrr_impl::dsmrr_init(handler *h_in, KEY *key,
2521
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
2522
uint32_t n_ranges, uint32_t mode, HANDLER_BUFFER *buf)
2526
Item *pushed_cond= NULL;
2528
keyno= h_in->active_index;
2530
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
2532
use_default_impl= true;
2533
return(h_in->handler::multi_range_read_init(seq_funcs, seq_init_param,
2534
n_ranges, mode, buf));
2536
rowids_buf= buf->buffer;
2537
//psergey-todo: don't add key_length as it is not needed anymore
2538
rowids_buf += key->key_length + h_in->ref_length;
2540
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
2541
rowids_buf_end= buf->buffer_end;
2543
elem_size= h_in->ref_length + (int)is_mrr_assoc * sizeof(void*);
2544
rowids_buf_last= rowids_buf +
2545
((rowids_buf_end - rowids_buf)/ elem_size)*
2547
rowids_buf_end= rowids_buf_last;
2549
/* Create a separate handler object to do rndpos() calls. */
2550
Session *session= current_session;
2551
if (!(new_h2= h_in->clone(session->mem_root)) ||
2552
new_h2->ha_external_lock(session, F_RDLCK))
2558
if (keyno == h_in->pushed_idx_cond_keyno)
2559
pushed_cond= h_in->pushed_idx_cond;
2560
if (h_in->ha_index_end())
2567
table->prepare_for_position();
2568
new_h2->extra(HA_EXTRA_KEYREAD);
2570
if (h2->ha_index_init(keyno, false) ||
2571
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
2574
use_default_impl= false;
2577
h2->idx_cond_push(keyno, pushed_cond);
2578
if (dsmrr_fill_buffer(new_h2))
2582
If the above call has scanned through all intervals in *seq, then
2583
adjust *buf to indicate that the remaining buffer space will not be used.
2586
buf->end_of_used_area= rowids_buf_last;
2588
if (h_in->ha_rnd_init(false))
2593
h2->ha_index_or_rnd_end();
2594
h2->ha_external_lock(session, F_UNLCK);
2601
void DsMrr_impl::dsmrr_close()
2605
h2->ha_external_lock(current_session, F_UNLCK);
2610
use_default_impl= true;
2614
static int rowid_cmp(void *h, unsigned char *a, unsigned char *b)
2616
return ((handler*)h)->cmp_ref(a, b);
2621
DS-MRR: Fill the buffer with rowids and sort it by rowid
2623
{This is an internal function of DiskSweep MRR implementation}
2624
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
2625
buffer. When the buffer is full or scan is completed, sort the buffer by
2628
The function assumes that rowids buffer is empty when it is invoked.
2630
@param h Table handler
2632
@retval 0 OK, the next portion of rowids is in the buffer,
2637
int DsMrr_impl::dsmrr_fill_buffer(handler *)
2642
rowids_buf_cur= rowids_buf;
2643
while ((rowids_buf_cur < rowids_buf_end) &&
2644
!(res= h2->handler::multi_range_read_next(&range_info)))
2646
/* Put rowid, or {rowid, range_id} pair into the buffer */
2647
h2->position(table->record[0]);
2648
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
2649
rowids_buf_cur += h->ref_length;
2653
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
2654
rowids_buf_cur += sizeof(void*);
2658
if (res && res != HA_ERR_END_OF_FILE)
2660
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
2662
/* Sort the buffer contents by rowid */
2663
uint32_t elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
2664
uint32_t n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
2666
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
2668
rowids_buf_last= rowids_buf_cur;
2669
rowids_buf_cur= rowids_buf;
2675
DS-MRR implementation: multi_range_read_next() function
2678
int DsMrr_impl::dsmrr_next(handler *h_in, char **range_info)
2682
if (use_default_impl)
2683
return h_in->handler::multi_range_read_next(range_info);
2685
if (rowids_buf_cur == rowids_buf_last)
2689
res= HA_ERR_END_OF_FILE;
2692
res= dsmrr_fill_buffer(h);
2697
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
2698
if (rowids_buf_cur == rowids_buf_last)
2700
res= HA_ERR_END_OF_FILE;
2704
res= h_in->rnd_pos(table->record[0], rowids_buf_cur);
2705
rowids_buf_cur += h_in->ref_length;
2708
memcpy(range_info, rowids_buf_cur, sizeof(void*));
2709
rowids_buf_cur += sizeof(void*);
2720
DS-MRR implementation: multi_range_read_info() function
2722
int DsMrr_impl::dsmrr_info(uint32_t keyno, uint32_t n_ranges, uint32_t rows, uint32_t *bufsz,
2723
uint32_t *flags, COST_VECT *cost)
2726
uint32_t def_flags= *flags;
2727
uint32_t def_bufsz= *bufsz;
2729
/* Get cost/flags/mem_usage of default MRR implementation */
2730
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
2734
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
2735
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
2737
/* Default implementation is chosen */
2746
DS-MRR Implementation: multi_range_read_info_const() function
2749
ha_rows DsMrr_impl::dsmrr_info_const(uint32_t keyno, RANGE_SEQ_IF *seq,
2750
void *seq_init_param, uint32_t n_ranges,
2751
uint32_t *bufsz, uint32_t *flags, COST_VECT *cost)
2754
uint32_t def_flags= *flags;
2755
uint32_t def_bufsz= *bufsz;
2756
/* Get cost/flags/mem_usage of default MRR implementation */
2757
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
2758
n_ranges, &def_bufsz,
2760
if (rows == HA_POS_ERROR)
2762
/* Default implementation can't perform MRR scan => we can't either */
2767
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
2768
use the default MRR implementation (we need it for UPDATE/DELETE).
2769
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
2771
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
2772
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
2779
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
2786
Check if key has partially-covered columns
2788
We can't use DS-MRR to perform range scans when the ranges are over
2789
partially-covered keys, because we'll not have full key part values
2790
(we'll have their prefixes from the index) and will not be able to check
2791
if we've reached the end of the range.
2793
@param keyno Key to check
2796
Allow use of DS-MRR in cases where the index has partially-covered
2797
components but they are not used for scanning.
2803
bool DsMrr_impl::key_uses_partial_cols(uint32_t keyno)
2805
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
2806
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
2807
for (; kp != kp_end; kp++)
2809
if (!kp->field->part_of_key.test(keyno))
2817
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
2819
Make the choice between using Default MRR implementation and DS-MRR.
2820
This function contains common functionality factored out of dsmrr_info()
2821
and dsmrr_info_const(). The function assumes that the default MRR
2822
implementation's applicability requirements are satisfied.
2824
@param keyno Index number
2825
@param rows E(full rows to be retrieved)
2826
@param flags IN MRR flags provided by the MRR user
2827
OUT If DS-MRR is chosen, flags of DS-MRR implementation
2828
else the value is not modified
2829
@param bufsz IN If DS-MRR is chosen, buffer use of DS-MRR implementation
2830
else the value is not modified
2831
@param cost IN Cost of default MRR implementation
2832
OUT If DS-MRR is chosen, cost of DS-MRR scan
2833
else the value is not modified
2835
@retval true Default MRR implementation should be used
2836
@retval false DS-MRR implementation should be used
2839
bool DsMrr_impl::choose_mrr_impl(uint32_t keyno, ha_rows rows, uint32_t *flags,
2840
uint32_t *bufsz, COST_VECT *cost)
2842
COST_VECT dsmrr_cost;
2844
Session *session= current_session;
2845
if ((session->variables.optimizer_use_mrr == 2) ||
2846
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
2847
(keyno == table->s->primary_key &&
2848
h->primary_key_is_clustered()) ||
2849
key_uses_partial_cols(keyno))
2851
/* Use the default implementation */
2852
*flags |= HA_MRR_USE_DEFAULT_IMPL;
2856
uint32_t add_len= table->key_info[keyno].key_length + h->ref_length;
2858
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
2864
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
2865
DS-MRR and Default implementations cost. This allows one to force use of
2866
DS-MRR whenever it is applicable without affecting other cost-based
2869
if ((force_dsmrr= (session->variables.optimizer_use_mrr == 1)) &&
2870
dsmrr_cost.total_cost() > cost->total_cost())
2873
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
2875
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
2876
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
2882
/* Use the default MRR implementation */
2889
static void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost);
2893
Get cost of DS-MRR scan
2895
@param keynr Index to be used
2896
@param rows E(Number of rows to be scanned)
2897
@param flags Scan parameters (HA_MRR_* flags)
2898
@param buffer_size INOUT Buffer size
2899
@param cost OUT The cost
2902
@retval true Error, DS-MRR cannot be used (the buffer is too small
2906
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint32_t keynr, ha_rows rows, uint32_t flags,
2907
uint32_t *buffer_size, COST_VECT *cost)
2909
uint32_t max_buff_entries, elem_size;
2910
ha_rows rows_in_full_step, rows_in_last_step;
2911
uint32_t n_full_steps;
2912
double index_read_cost;
2914
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
2915
max_buff_entries = *buffer_size / elem_size;
2917
if (!max_buff_entries)
2918
return true; /* Buffer has not enough space for even 1 rowid */
2920
/* Number of iterations we'll make with full buffer */
2921
n_full_steps= (uint32_t)floor(rows2double(rows) / max_buff_entries);
2924
Get numbers of rows we'll be processing in
2925
- non-last sweep, with full buffer
2926
- last iteration, with non-full buffer
2928
rows_in_full_step= max_buff_entries;
2929
rows_in_last_step= rows % max_buff_entries;
2931
/* Adjust buffer size if we expect to use only part of the buffer */
2934
get_sort_and_sweep_cost(table, rows, cost);
2935
cost->multiply(n_full_steps);
2940
*buffer_size= max(*buffer_size,
2941
(uint32_t)(1.2*rows_in_last_step) * elem_size +
2942
h->ref_length + table->key_info[keynr].key_length);
2945
COST_VECT last_step_cost;
2946
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
2947
cost->add(&last_step_cost);
2949
if (n_full_steps != 0)
2950
cost->mem_cost= *buffer_size;
2952
cost->mem_cost= (double)rows_in_last_step * elem_size;
2954
/* Total cost of all index accesses */
2955
index_read_cost= h->index_only_read_time(keynr, (double)rows);
2956
cost->add_io(index_read_cost, 1 /* Random seeks */);
2962
Get cost of one sort-and-sweep step
2965
get_sort_and_sweep_cost()
2966
table Table being accessed
2967
nrows Number of rows to be sorted and retrieved
2971
Get cost of these operations:
2972
- sort an array of #nrows ROWIDs using qsort
2973
- read #nrows records from table in a sweep.
2977
void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost)
2981
get_sweep_read_cost(table, nrows, false, cost);
2982
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
2983
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
2986
cost->cpu_cost += cmp_op * log2(cmp_op);
2994
Get cost of reading nrows table records in a "disk sweep"
2996
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made
2997
for an ordered sequence of rowids.
2999
We assume hard disk IO. The read is performed as follows:
3001
1. The disk head is moved to the needed cylinder
3002
2. The controller waits for the plate to rotate
3003
3. The data is transferred
3005
Time to do #3 is insignificant compared to #2+#1.
3007
Time to move the disk head is proportional to head travel distance.
3009
Time to wait for the plate to rotate depends on whether the disk head
3012
If disk head wasn't moved, the wait time is proportional to distance
3013
between the previous block and the block we're reading.
3015
If the head was moved, we don't know how much we'll need to wait for the
3016
plate to rotate. We assume the wait time to be a variate with a mean of
3017
0.5 of full rotation time.
3019
Our cost units are "random disk seeks". The cost of random disk seek is
3020
actually not a constant, it depends on the range of cylinders we're going
3021
to access. We make it constant by introducing a fuzzy concept of "typical
3022
datafile length" (it's fuzzy as it's hard to tell whether it should
3023
include index file, temp.tables etc). Then random seek cost is:
3025
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
3027
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
3029
@param table Table to be accessed
3030
@param nrows Number of rows to retrieve
3031
@param interrupted true <=> Assume that the disk sweep will be
3032
interrupted by other disk IO. false - otherwise.
3033
@param cost OUT The cost.
3036
void get_sweep_read_cost(Table *table, ha_rows nrows, bool interrupted,
3040
if (table->file->primary_key_is_clustered())
3042
cost->io_count= table->file->read_time(table->s->primary_key,
3043
(uint32_t) nrows, nrows);
3048
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
3050
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
3051
if (busy_blocks < 1.0)
3054
cost->io_count= busy_blocks;
3058
/* Assume reading is done in one 'sweep' */
3059
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
3060
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);
3066
/* **************************************************************************
1157
3067
* DS-MRR implementation ends
1158
3068
***************************************************************************/