Handler-calling-functions

#include "drizzled/my_hash.h"
#include "drizzled/server_includes.h"
#include "mysys/hash.h"
#include "drizzled/error.h"
#include "drizzled/gettext.h"
#include "drizzled/data_home.h"
#include "drizzled/probes.h"
#include "drizzled/sql_parse.h"
#include "drizzled/optimizer/cost_vector.h"
#include "drizzled/cost_vect.h"
#include "drizzled/session.h"
#include "drizzled/sql_base.h"
#include "drizzled/transaction_services.h"
#include "drizzled/lock.h"
#include "drizzled/item/int.h"
#include "drizzled/item/empty_string.h"
#include "drizzled/unireg.h" // for mysql_frm_type
#include "drizzled/field/timestamp.h"
#include "drizzled/message/table.pb.h"
#include "drizzled/plugin/client.h"
#include "drizzled/internal/my_sys.h"
#include "drizzled/plugin/event_observer.h"

using namespace std;
extern drizzled::TransactionServices transaction_services;

KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NULL,0}, {NULL,0} };

/* number of entries in storage_engines[] */
/* number of storage engines (from storage_engines[]) that support 2pc */
uint32_t total_ha_2pc= 0;
/* size of savepoint storage area (see ha_init) */
uint32_t savepoint_alloc_size= 0;

const char *ha_row_type[] = {
  "", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"

const char *tx_isolation_names[] =
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
                               tx_isolation_names, NULL};
Register handler error messages for use with my_error().

int ha_init_errors(void)
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
  /* Allocate a pointer array for the error message strings. */
  /* Zerofill it to avoid uninitialized gaps. */
  if (! (errmsgs= (const char**) malloc(HA_ERR_ERRORS * sizeof(char*))))
  memset(errmsgs, 0, HA_ERR_ERRORS * sizeof(char *));

  /* Set the dedicated error messages. */
  SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
  SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
  SETMSG(HA_ERR_RECORD_CHANGED, "Update which is recoverable");
  SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
  SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
  SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
  SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
  SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
  SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
  SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
  SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
  SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
  SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
  SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
  SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
  SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
  SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
  SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
  SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
  SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
  SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
  SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
  SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
  SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
  SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
  SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
  SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
  SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
  SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
  SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
  SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
  SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
  SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
  SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
  SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
  SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
  SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
  SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
  SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
  SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
  SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));

  /* Register the error messages for use with my_error(). */
  return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
Unregister handler error messages.

static int ha_finish_errors(void)
  const char **errmsgs;

  /* Retrieve the registered array of error message strings. */
  if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
  free((unsigned char*) errmsgs);

  assert(total_ha < MAX_HA);
  Check if there is a transaction-capable storage engine besides the
  binary log (which is considered a transaction-capable storage engine in
  savepoint_alloc_size+= sizeof(SAVEPOINT);

  This should eventually be based on the graceful shutdown flag.
  So if the flag is equal to HA_PANIC_CLOSE, deallocate
  if (ha_finish_errors())
/* ========================================================================
 ======================= TRANSACTIONS ===================================*/

  Transaction handling in the server
  ==================================

  In each client connection, MySQL maintains two transactional
  - a statement transaction,
  - a standard, also called normal transaction.

  "Statement transaction" is a non-standard term that comes
  from the times when MySQL supported the BerkeleyDB storage engine.

  First of all, it should be said that in BerkeleyDB auto-commit
  mode auto-commits operations that are atomic to the storage
  engine itself, such as a write of a record, and are too
  high-granular to be atomic from the application perspective
  (MySQL). One SQL statement could involve many BerkeleyDB
  auto-committed operations and thus BerkeleyDB auto-commit was of

  Secondly, instead of SQL standard savepoints, BerkeleyDB
  provided the concept of "nested transactions". In a nutshell,
  transactions could be arbitrarily nested, but when the parent
  transaction was committed or aborted, all its child (nested)
  transactions were committed or aborted as well.
  Commit of a nested transaction, in turn, made its changes
  visible, but not durable: it destroyed the nested transaction,
  and all its changes became available to the parent and to the
  currently active nested transactions of this parent.

  So the mechanism of nested transactions was employed to
  provide the "all or nothing" guarantee of SQL statements
  required by the standard.
  A nested transaction would be created at the start of each SQL
  statement, and destroyed (committed or aborted) at statement
  end. Such a nested transaction was internally referred to as
  a "statement transaction" and gave birth to the term.

  <Historical note ends>

  Since then a statement transaction is started for each statement
  that accesses transactional tables or uses the binary log. If
  the statement succeeds, the statement transaction is committed.
  If the statement fails, the transaction is rolled back. Commits
  of statement transactions are not durable -- each such
  transaction is nested in the normal transaction, and if the
  normal transaction is rolled back, the effects of all enclosed
  statement transactions are undone as well. Technically,
  a statement transaction can be viewed as a savepoint which is
  maintained automatically in order to make effects of one

  The normal transaction is started by the user and is ended
  usually upon a user request as well. The normal transaction
  encloses transactions of all statements issued between
  its beginning and its end.
  In autocommit mode, the normal transaction is equivalent
  to the statement transaction.

  Since MySQL supports PSEA (pluggable storage engine
  architecture), more than one transactional engine can be
  active at a time. Hence transactions, from the server
  point of view, are always distributed. In particular,
  transactional state is maintained independently for each
  engine. In order to commit a transaction the two-phase
  commit protocol is employed.

  Not all statements are executed in the context of a transaction.
  Administrative and status information statements do not modify
  engine data, and thus do not start a statement transaction and
  also have no effect on the normal transaction. Examples of such
  statements are SHOW STATUS and RESET SLAVE.

  Similarly DDL statements are not transactional,
  and therefore a transaction is [almost] never started for a DDL
  statement. The difference between a DDL statement and a purely
  administrative statement though is that a DDL statement always
  commits the current transaction before proceeding, if there is

  Finally, SQL statements that work with non-transactional
  engines also have no effect on the transaction state of the
  connection. Even though they are written to the binary log,
  and the binary log is, overall, transactional, the writes
  are done in "write-through" mode, directly to the binlog
  file, followed by an OS cache sync, in other words,
  bypassing the binlog undo log (translog).
  They do not commit the current normal transaction.
  A failure of a statement that uses non-transactional tables
  would cause a rollback of the statement transaction, but
  if only non-transactional tables are used,
  no statement transaction is started at all.
  The server stores its transaction-related data in
  session->transaction. This structure has two members of type
  Session_TRANS. These members correspond to the statement and
  normal transactions respectively:

  - session->transaction.stmt contains a list of engines
  that are participating in the given statement
  - session->transaction.all contains a list of engines that
  have participated in any of the statement transactions started
  within the context of the normal transaction.
  Each element of the list contains a pointer to the storage
  engine, engine-specific transactional data, and engine-specific

  In autocommit mode session->transaction.all is empty.
  Instead, data of session->transaction.stmt is
  used to commit/rollback the normal transaction.

  The list of registered engines has a few important properties:
  - no engine is registered in the list twice
  - engines are present in the list in reverse temporal order --
  new participants are always added to the beginning of the list.

  Transaction life cycle
  ----------------------

  When a new connection is established, session->transaction
  members are initialized to an empty state.
  If a statement uses any tables, all affected engines
  are registered in the statement engine list. In
  non-autocommit mode, the same engines are registered in
  the normal transaction list.
  At the end of the statement, the server issues a commit
  or a rollback for all engines in the statement list.
  At this point transaction flags of an engine, if any, are
  propagated from the statement list to the list of the normal
  When commit/rollback is finished, the statement list is
  cleared. It will be filled in again by the next statement,
  and emptied again at the next statement's end.

  The normal transaction is committed in a similar way
  (by going over all engines in the session->transaction.all list)
  but at different times:
  - when a COMMIT SQL statement is issued by the user
  - implicitly, by the server, at the beginning of a DDL statement
  or a SET AUTOCOMMIT={0|1} statement.

  The normal transaction can be rolled back as well:
  - if the user has requested so, by issuing ROLLBACK SQL
  - if one of the storage engines requested a rollback
  by setting session->transaction_rollback_request. This may
  happen, e.g., when the transaction in the engine was
  chosen as a victim of the internal deadlock resolution algorithm
  and rolled back internally. When such a situation happens, there
  is little the server can do and the only option is to roll back
  transactions in all other participating engines. In this case
  the rollback is accompanied by an error sent to the user.

  As follows from the use cases above, the normal transaction
  is never committed when there is an outstanding statement
  transaction. In most cases there is no conflict, since
  commits of the normal transaction are issued by a stand-alone
  administrative or DDL statement, thus no outstanding statement
  transaction of the previous statement exists. Besides,
  all statements that manipulate the normal transaction
  are prohibited in stored functions and triggers, therefore
  no conflicting situation can occur in a sub-statement either.
  The remaining rare cases when the server explicitly has
  to commit the statement transaction prior to committing the normal
  one cover error-handling scenarios (see for example

  When committing a statement or a normal transaction, the server
  either uses the two-phase commit protocol, or issues a commit
  in each engine independently. The two-phase commit protocol is used only if:
  - all participating engines support two-phase commit (provide
  the StorageEngine::prepare PSEA API call) and
  - transactions in at least two engines modify data (i.e. are read-write).

  Note that the two-phase commit is used for
  statement transactions, even though they are not durable anyway.
  This is done to ensure logical consistency of data in a multiple-engine transaction.
  For example, imagine that some day MySQL supports unique
  constraint checks deferred till the end of the statement. In such
  a case a commit in one of the engines may yield ER_DUP_KEY,
  and MySQL should be able to gracefully abort statement
  transactions of other participants.
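
  A minimal sketch of the decision rule described above (illustrative
  only, not part of this file; count_rw_participants() is a hypothetical
  helper -- the real logic lives in ha_check_and_coalesce_trx_read_only()
  and ha_commit_trans() below):

    static bool should_use_two_phase_commit(Session_TRANS *trans)
    {
      /* 2PC pays off only if every participant can prepare() and at
         least two participants actually wrote data. */
      unsigned rw_engines= count_rw_participants(trans->ha_list);
      return (! trans->no_2pc) && rw_engines > 1;
    }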
  After the normal transaction has been committed,
  the session->transaction.all list is cleared.

  When a connection is closed, the current normal transaction, if
  any, is rolled back.

  Roles and responsibilities
  --------------------------

  The server has no way to know that an engine participates in
  the statement and a transaction has been started
  in it unless the engine says so. Thus, in order to be
  a part of a transaction, the engine must "register" itself.
  This is done by invoking the trans_register_ha() server call.
  Normally the engine registers itself whenever handler::external_lock()
  is called. trans_register_ha() can be invoked many times: if
  an engine is already registered, the call does nothing.
  In case autocommit is not set, the engine must register itself
  twice -- both in the statement list and in the normal transaction
  list. In which list to register is a parameter of trans_register_ha().

  Note that although the registration interface in itself is
  fairly clear, the current usage practice often leads to undesired
  effects. E.g. since a call to trans_register_ha() in most engines
  is embedded into the implementation of handler::external_lock(), some
  DDL statements start a transaction (at least from the server
  point of view) even though they are not expected to. E.g.
  CREATE TABLE does not start a transaction, since
  handler::external_lock() is never called during CREATE TABLE. But
  CREATE TABLE ... SELECT does, since handler::external_lock() is
  called for the table that is being selected from. This has no
  practical effects currently, but must be kept in mind.
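
  A minimal sketch of the registration pattern described above
  (illustrative only; ExampleCursor and in_explicit_transaction() are
  hypothetical, real engines derive this from the session options):

    int ExampleCursor::external_lock(Session *session, int lock_type)
    {
      if (lock_type != F_UNLCK)
      {
        /* Always join the statement transaction (all == false). */
        trans_register_ha(session, false, getEngine());
        /* Outside autocommit, also join the normal transaction (all == true). */
        if (in_explicit_transaction(session))
          trans_register_ha(session, true, getEngine());
      }
      return 0;
    }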
  Once an engine is registered, the server will do the rest

  During statement execution, whenever any of the data-modifying
  PSEA API methods is used, e.g. handler::write_row() or
  handler::update_row(), the read-write flag is raised in the
  statement transaction for the involved engine.
  Currently all PSEA calls are "traced", and the data cannot be
  changed in a way other than issuing a PSEA call. Important:
  unless this invariant is preserved the server will not know that
  a transaction in a given engine is read-write and will not
  involve the two-phase commit protocol!

  At the end of a statement, the server call
  ha_autocommit_or_rollback() is invoked. This call in turn
  invokes StorageEngine::prepare() for every involved engine.
  Prepare is followed by a call to StorageEngine::commit_one_phase().
  If a one-phase commit will suffice, StorageEngine::prepare() is not
  invoked and the server only calls StorageEngine::commit_one_phase().
  At statement commit, the statement-related read-write engine
  flag is propagated to the corresponding flag in the normal
  transaction. When the commit is complete, the list of registered
  engines is cleared.

  Rollback is handled in a similar fashion.

  Additional notes on DDL and the normal transaction.
  ---------------------------------------------------

  DDLs and operations with non-transactional engines
  do not "register" in session->transaction lists, and thus do not
  modify the transaction state. Besides, each DDL in
  MySQL is prefixed with an implicit normal transaction commit
  (a call to Session::endActiveTransaction()), and thus leaves nothing
  However, as it has been pointed out with CREATE TABLE .. SELECT,
  some DDL statements can start a *new* transaction.

  Behaviour of the server in this case is currently badly defined.
  DDL statements use a form of "semantic" logging
  to maintain atomicity: if CREATE TABLE .. SELECT failed,
  the newly created table is deleted.
  In addition, some DDL statements issue interim transaction
  commits: e.g. ALTER Table issues a commit after data is copied
  from the original table to the internal temporary table. Other
  statements, e.g. CREATE TABLE ... SELECT do not always commit
  And finally there is a group of DDL statements such as
  RENAME/DROP Table that doesn't start a new transaction

  This diversity makes it hard to say what will happen if
  by chance a stored function is invoked during a DDL --
  whether any modifications it makes will be committed or not
  is not clear. Fortunately, the SQL grammar of only a few DDLs allows
  invocation of a stored function.

  A consistent behaviour is perhaps to always commit the normal
  transaction after all DDLs, just like the statement transaction
  is always committed at the end of all statements.
  Register a storage engine for a transaction.

  Every storage engine MUST call this function when it starts
  a transaction or a statement (that is, it must be called both for the
  "beginning of transaction" and "beginning of statement").
  Only storage engines registered for the transaction/statement
  will know when to commit/rollback it.

  trans_register_ha() is idempotent -- a storage engine may register many
  times per transaction.

void trans_register_ha(Session *session, bool all, StorageEngine *engine)
  Session_TRANS *trans;
  Ha_trx_info *ha_info;

    trans= &session->transaction.all;
    session->server_status|= SERVER_STATUS_IN_TRANS;
    trans= &session->transaction.stmt;

  ha_info= session->ha_data[engine->getSlot()].ha_info + static_cast<unsigned>(all);

  if (ha_info->is_started())
    return; /* already registered, return */

  ha_info->register_ha(trans, engine);

  trans->no_2pc|= not engine->has_2pc();
  if (session->transaction.xid_state.xid.is_null())
    session->transaction.xid_state.xid.set(session->query_id);
    1  error, transaction was rolled back

int ha_prepare(Session *session)
  Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
  Ha_trx_info *ha_info= trans->ha_list;

  for (; ha_info; ha_info= ha_info->next())
    StorageEngine *engine= ha_info->engine();
    status_var_increment(session->status_var.ha_prepare_count);
    if ((err= engine->prepare(session, all)))
      my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
      ha_rollback_trans(session, all);
    push_warning_printf(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
                        ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
                        engine->getName().c_str());
  Check if we can skip the two-phase commit.

  A helper function to evaluate if two-phase commit is mandatory.
  As a side effect, propagates the read-only/read-write flags
  of the statement transaction to its enclosing normal transaction.

  @retval true   we must run a two-phase commit. Returned
                 if we have at least two engines with read-write changes.
  @retval false  Don't need two-phase commit. Even if we have two
                 transactional engines, we can run two independent
                 commits if changes in one of the engines are read-only.

ha_check_and_coalesce_trx_read_only(Session *session, Ha_trx_info *ha_list,
  /* The number of storage engines that have actual changes. */
  unsigned rw_ha_count= 0;
  Ha_trx_info *ha_info;

  for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
    if (ha_info->is_trx_read_write())
      Ha_trx_info *ha_info_all= &session->ha_data[ha_info->engine()->getSlot()].ha_info[1];
      assert(ha_info != ha_info_all);
        Merge read-only/read-write information about the statement
        transaction into its enclosing normal transaction. Do this
        only if in a real transaction -- that is, if we know
        that ha_info_all is registered in session->transaction.all.
        Otherwise we would only clutter the normal transaction flags.
      if (ha_info_all->is_started()) /* false if autocommit. */
        ha_info_all->coalesce_trx_with(ha_info);
    else if (rw_ha_count > 1)
        It is a normal transaction, so we don't need to merge read/write
        information up, and the need for two-phase commit has been
        already established. Break the loop prematurely.

  return rw_ha_count > 1;
    1  transaction was rolled back
    2  error during commit, data may be inconsistent

    Since we don't support nested statement transactions in 5.0,
    we can't commit or rollback stmt transactions while we are inside
    stored functions or triggers. So we simply do nothing now.
    TODO: This should be fixed in later ( >= 5.1) releases.

int ha_commit_trans(Session *session, bool all)
  int error= 0, cookie= 0;
    'all' means that this is either an explicit commit issued by
    the user, or an implicit commit issued by a DDL.
  Session_TRANS *trans= all ? &session->transaction.all : &session->transaction.stmt;
  bool is_real_trans= all || session->transaction.all.ha_list == 0;
  Ha_trx_info *ha_info= trans->ha_list;

    We must not commit the normal transaction if a statement
    transaction is pending. Otherwise statement transaction
    flags will not get propagated to its normal transaction's counterpart.
  assert(session->transaction.stmt.ha_list == NULL ||
         trans == &session->transaction.stmt);

    if (is_real_trans && wait_if_global_read_lock(session, 0, 0))
      ha_rollback_trans(session, all);

    must_2pc= ha_check_and_coalesce_trx_read_only(session, ha_info, all);

    if (!trans->no_2pc && must_2pc)
      for (; ha_info && !error; ha_info= ha_info->next())
        StorageEngine *engine= ha_info->engine();
          Do not call two-phase commit if this particular
          transaction is read-only. This allows for simpler
          implementation in engines that are always read-only.
        if (! ha_info->is_trx_read_write())
          Sic: we know that prepare() is not NULL since otherwise
          trans->no_2pc would have been set.
        if ((err= engine->prepare(session, all)))
          my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
        status_var_increment(session->status_var.ha_prepare_count);
        ha_rollback_trans(session, all);
    error=ha_commit_one_phase(session, all) ? (cookie ? 2 : 1) : 0;
      start_waiting_global_read_lock(session);

  This function does not care about global read lock. A caller should.

int ha_commit_one_phase(Session *session, bool all)
  Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
  bool is_real_trans=all || session->transaction.all.ha_list == 0;
  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;

    for (; ha_info; ha_info= ha_info_next)
      StorageEngine *engine= ha_info->engine();
      if ((err= engine->commit(session, all)))
        my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
      status_var_increment(session->status_var.ha_commit_count);
      ha_info_next= ha_info->next();
      ha_info->reset(); /* keep it conveniently zero-filled */
        session->transaction.xid_state.xid.null();
      session->variables.tx_isolation=session->session_tx_isolation;
      session->transaction.cleanup();
int ha_rollback_trans(Session *session, bool all)
  Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
  bool is_real_trans=all || session->transaction.all.ha_list == 0;

    We must not rollback the normal transaction if a statement
    transaction is pending.
  assert(session->transaction.stmt.ha_list == NULL ||
         trans == &session->transaction.stmt);

    for (; ha_info; ha_info= ha_info_next)
      StorageEngine *engine= ha_info->engine();
      if ((err= engine->rollback(session, all)))
        my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
      status_var_increment(session->status_var.ha_rollback_count);
      ha_info_next= ha_info->next();
      ha_info->reset(); /* keep it conveniently zero-filled */
        session->transaction.xid_state.xid.null();
      session->variables.tx_isolation=session->session_tx_isolation;
      session->transaction.cleanup();
    session->transaction_rollback_request= false;

    If a non-transactional table was updated, warn; don't warn if this is a
    slave thread (because when a slave thread executes a ROLLBACK, it has
    been read from the binary log, so it is expected and normal to produce
    the error ER_WARNING_NOT_COMPLETE_ROLLBACK). If we sent the warning to the
    slave SQL thread, it would not stop the thread but just be printed in
    the error log; but we don't want users to wonder why they have this
    message in the error log, so we don't send it.
  if (is_real_trans && session->transaction.all.modified_non_trans_table && session->killed != Session::KILL_CONNECTION)
    push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
                 ER_WARNING_NOT_COMPLETE_ROLLBACK,
                 ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
  This is used to commit or rollback a single statement depending on
  the value of error.

  Note that if autocommit is on, then the following call inside
  InnoDB will commit or rollback the whole transaction (= the statement). The
  autocommit mechanism built into InnoDB is based on counting locks, but if
  the user has used LOCK TABLES then that mechanism does not know to do the

int ha_autocommit_or_rollback(Session *session, int error)
  if (session->transaction.stmt.ha_list)
      if (ha_commit_trans(session, 0))
      (void) ha_rollback_trans(session, 0);
      if (session->transaction_rollback_request)
        (void) ha_rollback(session);
    session->variables.tx_isolation=session->session_tx_isolation;
  return the list of XID's to a client, the same way SHOW commands do.

    I didn't find in XA specs that an RM cannot return the same XID twice,
    so mysql_xa_recover does not filter XID's to ensure uniqueness.
    It can be easily fixed later, if necessary.

bool mysql_xa_recover(Session *session)
  List<Item> field_list;
  Protocol *protocol= session->protocol;

  field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
  field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
  field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
  field_list.push_back(new Item_empty_string("data",XIDDATASIZE));

  if (protocol->sendFields(&field_list,
                           Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))

  pthread_mutex_lock(&LOCK_xid_cache);
  while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
    if (xs->xa_state==XA_PREPARED)
      protocol->prepareForResend();
      protocol->store((int64_t)xs->xid.formatID);
      protocol->store((int64_t)xs->xid.gtrid_length);
      protocol->store((int64_t)xs->xid.bqual_length);
      protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
      if (protocol->write())
        pthread_mutex_unlock(&LOCK_xid_cache);
  pthread_mutex_unlock(&LOCK_xid_cache);
int ha_rollback_to_savepoint(Session *session, SAVEPOINT *sv)
  Session_TRANS *trans= &session->transaction.all;
  Ha_trx_info *ha_info, *ha_info_next;

    rolling back to savepoint in all storage engines that were part of the
    transaction when the savepoint was set
  for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
    StorageEngine *engine= ha_info->engine();
    if ((err= engine->savepoint_rollback(session,
      my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
    status_var_increment(session->status_var.ha_savepoint_rollback_count);
    trans->no_2pc|= not engine->has_2pc();

    rolling back the transaction in all storage engines that were not part of
    the transaction when the savepoint was set
  for (ha_info= trans->ha_list; ha_info != sv->ha_list;
       ha_info= ha_info_next)
    StorageEngine *engine= ha_info->engine();
    if ((err= engine->rollback(session, !(0))))
      my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
    status_var_increment(session->status_var.ha_rollback_count);
    ha_info_next= ha_info->next();
    ha_info->reset(); /* keep it conveniently zero-filled */
  trans->ha_list= sv->ha_list;

  According to the SQL standard (ISO/IEC 9075-2:2003)
  section "4.33.4 SQL-statements and transaction states",
  SAVEPOINT is *not* a transaction-initiating SQL statement
int ha_savepoint(Session *session, SAVEPOINT *sv)
  Session_TRANS *trans= &session->transaction.all;
  Ha_trx_info *ha_info= trans->ha_list;
  for (; ha_info; ha_info= ha_info->next())
    StorageEngine *engine= ha_info->engine();
/*    if (! engine->savepoint_set)
      my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
    if ((err= engine->savepoint_set(session, (void *)(sv+1))))
      my_error(ER_GET_ERRNO, MYF(0), err);
    status_var_increment(session->status_var.ha_savepoint_count);
    Remember the list of registered storage engines. All new
    engines are prepended to the beginning of the list.
  sv->ha_list= trans->ha_list;

int ha_release_savepoint(Session *session, SAVEPOINT *sv)
  Ha_trx_info *ha_info= sv->ha_list;

  for (; ha_info; ha_info= ha_info->next())
    StorageEngine *engine= ha_info->engine();
    /* Savepoint life time is enclosed into transaction life time. */
    if ((err= engine->savepoint_release(session,
      my_error(ER_GET_ERRNO, MYF(0), err);
/****************************************************************************
** General Cursor functions
** General handler functions
****************************************************************************/
Cursor::Cursor(plugin::StorageEngine &engine_arg,
  estimation_rows_to_insert(0),
  key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
  ref_length(sizeof(internal::my_off_t)),
  next_insert_id(0), insert_id_for_cur_row(0)

handler::~handler(void)
  assert(locked == false);
  /* TODO: assert(inited == NONE); */

 * @note this is only used in
 * optimizer::QuickRangeSelect::init_ror_merged_scan(bool reuse_handler) as
 * of the writing of this comment. -Brian
Cursor *Cursor::clone(memory::Root *mem_root)
handler *handler::clone(MEM_ROOT *mem_root)
  Cursor *new_handler= getTable()->getMutableShare()->db_type()->getCursor(*getTable());
  handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());

    Allocate Cursor->ref here because otherwise ha_open will allocate it
    Allocate handler->ref here because otherwise ha_open will allocate it
    on this->table->mem_root and we will not be able to reclaim that memory
    when the clone Cursor object is destroyed.
    when the clone handler object is destroyed.
  if (!(new_handler->ref= (unsigned char*) mem_root->alloc_root(ALIGN_SIZE(ref_length)*2)))
  if (!(new_handler->ref= (unsigned char*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))

  TableIdentifier identifier(getTable()->getShare()->getSchemaName(),
                             getTable()->getShare()->getTableName(),
                             getTable()->getShare()->getType());

  if (new_handler && !new_handler->ha_open(identifier,
                                           getTable()->getDBStat(),
  if (new_handler && !new_handler->ha_open(table,
                                           table->s->normalized_path.str,
                                           HA_OPEN_IGNORE_IF_LOCKED))
    return new_handler;

  given a buffer with a key value, and a map of keyparts
  that are present in this value, returns the length of the value
uint32_t Cursor::calculate_key_len(uint32_t key_position, key_part_map keypart_map_arg)
  /* works only with key prefixes */
  assert(((keypart_map_arg + 1) & keypart_map_arg) == 0);

  const KeyPartInfo *key_part_found= getTable()->getShare()->getKeyInfo(key_position).key_part;
  const KeyPartInfo *end_key_part_found= key_part_found + getTable()->getShare()->getKeyInfo(key_position).key_parts;

  while (key_part_found < end_key_part_found && keypart_map_arg)
    length+= key_part_found->store_length;
    keypart_map_arg >>= 1;
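/*
  Worked example (illustrative, not from the original source): for a
  three-part key and keypart_map_arg == 0x3 (binary 011 -- a valid prefix,
  since ((0x3 + 1) & 0x3) == 0), the loop above adds the store_length of
  the first two key parts and stops once the shifted map becomes zero, so
  the third key part does not contribute to the returned length.
*/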
int Cursor::startIndexScan(uint32_t idx, bool sorted)
int handler::ha_index_init(uint32_t idx, bool sorted)
  assert(inited == NONE);
  if (!(result= doStartIndexScan(idx, sorted)))
  assert(inited==NONE);
  if (!(result= index_init(idx, sorted)))
  end_range= NULL;

int Cursor::endIndexScan()
int handler::ha_index_end()
  assert(inited==INDEX);
  end_range= NULL;
  return(doEndIndexScan());
  return(index_end());

int Cursor::startTableScan(bool scan)
int handler::ha_rnd_init(bool scan)
  assert(inited==NONE || (inited==RND && scan));
  inited= (result= doStartTableScan(scan)) ? NONE: RND;
  inited= (result= rnd_init(scan)) ? NONE: RND;

int Cursor::endTableScan()
int handler::ha_rnd_end()
  assert(inited==RND);
  return(doEndTableScan());

int Cursor::ha_index_or_rnd_end()
  return inited == INDEX ? endIndexScan() : inited == RND ? endTableScan() : 0;

void Cursor::ha_start_bulk_insert(ha_rows rows)
int handler::ha_index_or_rnd_end()
  return inited == INDEX ? ha_index_end() : inited == RND ? ha_rnd_end() : 0;

handler::Table_flags handler::ha_table_flags() const
  return cached_table_flags;

void handler::ha_start_bulk_insert(ha_rows rows)
  estimation_rows_to_insert= rows;
  start_bulk_insert(rows);

int Cursor::ha_end_bulk_insert()
int handler::ha_end_bulk_insert()
  estimation_rows_to_insert= 0;
  return end_bulk_insert();

const key_map *Cursor::keys_to_use_for_scanning()
void handler::change_table_ptr(Table *table_arg, TableShare *share)
const key_map *handler::keys_to_use_for_scanning()
  return &key_map_empty;

bool Cursor::has_transactions()
  return (getTable()->getShare()->db_type()->check_flag(HTON_BIT_DOES_TRANSACTIONS));

void Cursor::ha_statistic_increment(uint64_t system_status_var::*offset) const
  (getTable()->in_use->status_var.*offset)++;

void **Cursor::ha_data(Session *session) const
  return session->getEngineData(getEngine());

bool Cursor::is_fatal_error(int error, uint32_t flags)
bool handler::has_transactions()
  return (ha_table_flags() & HA_NO_TRANSACTIONS) == 0;

void handler::ha_statistic_increment(ulong SSV::*offset) const
  status_var_increment(table->in_use->status_var.*offset);

void **handler::ha_data(Session *session) const
  return session_ha_data(session, engine);

Session *handler::ha_session(void) const
  assert(!table || !table->in_use || table->in_use == current_session);
  return (table && table->in_use) ? table->in_use : current_session;

bool handler::is_fatal_error(int error, uint32_t flags)
          ((flags & HA_CHECK_DUP_KEY) &&

    this statement used forced auto_increment values if there were some,
    wipe them away for other statements.
  getTable()->in_use->auto_inc_intervals_forced.empty();

void Cursor::drop_table(const char *)
  table->in_use->auto_inc_intervals_forced.empty();

void handler::print_keydup_error(uint32_t key_nr, const char *msg)
  /* Write the duplicated key in the error message */
  char key[MAX_KEY_LENGTH];
  String str(key,sizeof(key),system_charset_info);

  if (key_nr == MAX_KEY)
    /* Key is unknown */
    str.copy("", 0, system_charset_info);
    my_printf_error(ER_DUP_ENTRY, msg, MYF(0), str.c_ptr(), "*UNKNOWN*");
    /* Table is opened and defined at this point */
    key_unpack(&str,table,(uint32_t) key_nr);
    uint32_t max_length=DRIZZLE_ERRMSG_SIZE-(uint32_t) strlen(msg);
    if (str.length() >= max_length)
      str.length(max_length-4);
      str.append(STRING_WITH_LEN("..."));
    my_printf_error(ER_DUP_ENTRY, msg,
                    MYF(0), str.c_ptr(), table->key_info[key_nr].name);
  Print error that we got from handler function.

    In case of delete table it's only safe to use the following parts of
    the 'table' structure:

void handler::print_error(int error, myf errflag)
  int textno=ER_GET_ERRNO;
    textno=ER_OPEN_AS_READONLY;
    textno=ER_FILE_USED;
    textno=ER_FILE_NOT_FOUND;
  case HA_ERR_KEY_NOT_FOUND:
  case HA_ERR_NO_ACTIVE_RECORD:
  case HA_ERR_END_OF_FILE:
    textno=ER_KEY_NOT_FOUND;
  case HA_ERR_WRONG_MRG_TABLE_DEF:
    textno=ER_WRONG_MRG_TABLE;
  case HA_ERR_FOUND_DUPP_KEY:
    uint32_t key_nr=get_dup_key(error);
    if ((int) key_nr >= 0)
      print_keydup_error(key_nr, ER(ER_DUP_ENTRY_WITH_KEY_NAME));
  case HA_ERR_FOREIGN_DUPLICATE_KEY:
    uint32_t key_nr= get_dup_key(error);
    if ((int) key_nr >= 0)
      uint32_t max_length;
      /* Write the key in the error message */
      char key[MAX_KEY_LENGTH];
      String str(key,sizeof(key),system_charset_info);
      /* Table is opened and defined at this point */
      key_unpack(&str,table,(uint32_t) key_nr);
      max_length= (DRIZZLE_ERRMSG_SIZE-
                   (uint32_t) strlen(ER(ER_FOREIGN_DUPLICATE_KEY)));
      if (str.length() >= max_length)
        str.length(max_length-4);
        str.append(STRING_WITH_LEN("..."));
      my_error(ER_FOREIGN_DUPLICATE_KEY, MYF(0), table_share->table_name.str,
               str.c_ptr(), key_nr+1);
  case HA_ERR_FOUND_DUPP_UNIQUE:
    textno=ER_DUP_UNIQUE;
  case HA_ERR_RECORD_CHANGED:
    textno=ER_CHECKREAD;
  case HA_ERR_CRASHED:
    textno=ER_NOT_KEYFILE;
  case HA_ERR_WRONG_IN_RECORD:
    textno= ER_CRASHED_ON_USAGE;
  case HA_ERR_CRASHED_ON_USAGE:
    textno=ER_CRASHED_ON_USAGE;
  case HA_ERR_NOT_A_TABLE:
  case HA_ERR_CRASHED_ON_REPAIR:
    textno=ER_CRASHED_ON_REPAIR;
  case HA_ERR_OUT_OF_MEM:
    textno=ER_OUT_OF_RESOURCES;
  case HA_ERR_WRONG_COMMAND:
    textno=ER_ILLEGAL_HA;
  case HA_ERR_OLD_FILE:
    textno=ER_OLD_KEYFILE;
  case HA_ERR_UNSUPPORTED:
    textno=ER_UNSUPPORTED_EXTENSION;
  case HA_ERR_RECORD_FILE_FULL:
  case HA_ERR_INDEX_FILE_FULL:
    textno=ER_RECORD_FILE_FULL;
  case HA_ERR_LOCK_WAIT_TIMEOUT:
    textno=ER_LOCK_WAIT_TIMEOUT;
  case HA_ERR_LOCK_TABLE_FULL:
    textno=ER_LOCK_TABLE_FULL;
  case HA_ERR_LOCK_DEADLOCK:
    textno=ER_LOCK_DEADLOCK;
  case HA_ERR_READ_ONLY_TRANSACTION:
    textno=ER_READ_ONLY_TRANSACTION;
  case HA_ERR_CANNOT_ADD_FOREIGN:
    textno=ER_CANNOT_ADD_FOREIGN;
  case HA_ERR_ROW_IS_REFERENCED:
    get_error_message(error, &str);
    my_error(ER_ROW_IS_REFERENCED_2, MYF(0), str.c_ptr_safe());
  case HA_ERR_NO_REFERENCED_ROW:
    get_error_message(error, &str);
    my_error(ER_NO_REFERENCED_ROW_2, MYF(0), str.c_ptr_safe());
  case HA_ERR_TABLE_DEF_CHANGED:
    textno=ER_TABLE_DEF_CHANGED;
  case HA_ERR_NO_SUCH_TABLE:
    my_error(ER_NO_SUCH_TABLE, MYF(0), table_share->db.str,
             table_share->table_name.str);
  case HA_ERR_RBR_LOGGING_FAILED:
    textno= ER_BINLOG_ROW_LOGGING_FAILED;
  case HA_ERR_DROP_INDEX_FK:
    const char *ptr= "???";
    uint32_t key_nr= get_dup_key(error);
    if ((int) key_nr >= 0)
      ptr= table->key_info[key_nr].name;
    my_error(ER_DROP_INDEX_FK, MYF(0), ptr);
  case HA_ERR_TABLE_NEEDS_UPGRADE:
    textno=ER_TABLE_NEEDS_UPGRADE;
  case HA_ERR_TABLE_READONLY:
    textno= ER_OPEN_AS_READONLY;
  case HA_ERR_AUTOINC_READ_FAILED:
    textno= ER_AUTOINC_READ_FAILED;
  case HA_ERR_AUTOINC_ERANGE:
    textno= ER_WARN_DATA_OUT_OF_RANGE;
  case HA_ERR_LOCK_OR_ACTIVE_TRANSACTION:
    my_message(ER_LOCK_OR_ACTIVE_TRANSACTION,
               ER(ER_LOCK_OR_ACTIVE_TRANSACTION), MYF(0));
    /* The error was "unknown" to this function.
       Ask handler if it has got a message for this error */
    bool temporary= false;
    temporary= get_error_message(error, &str);
    if (!str.is_empty())
      const char* engine_name= engine->getName().c_str();
        my_error(ER_GET_TEMPORARY_ERRMSG, MYF(0), error, str.ptr(),
        my_error(ER_GET_ERRMSG, MYF(0), error, str.ptr(), engine_name);
      my_error(ER_GET_ERRNO,errflag,error);
  my_error(textno, errflag, table_share->table_name.str, error);
  Return an error message specific to this handler.

  @param error  error code previously returned by handler
  @param buf    pointer to String where to add error message

  Returns true if this is a temporary error

bool handler::get_error_message(int ,

/* Code left, but Drizzle has no legacy yet (while MySQL did) */
int handler::check_old_types()

    key if error because of duplicated keys

uint32_t handler::get_dup_key(int error)
  table->file->errkey = (uint32_t) -1;
  if (error == HA_ERR_FOUND_DUPP_KEY || error == HA_ERR_FOREIGN_DUPLICATE_KEY ||
      error == HA_ERR_FOUND_DUPP_UNIQUE ||
      error == HA_ERR_DROP_INDEX_FK)
    info(HA_STATUS_ERRKEY | HA_STATUS_NO_LOCK);
  return(table->file->errkey);

  Delete all files with extension from bas_ext().

  @param name   Base name of table

    We assume that the handler may return more extensions than
    were actually used for the file.

    0  If we successfully deleted at least one file from base_ext and
       didn't get any other errors than ENOENT

int handler::delete_table(const char *name)
  int enoent_or_zero= ENOENT;                   // Error if no file was deleted
  char buff[FN_REFLEN];

  for (const char **ext=bas_ext(); *ext ; ext++)
    fn_format(buff, name, "", *ext, MY_UNPACK_FILENAME|MY_APPEND_EXT);
    if (my_delete_with_symlink(buff, MYF(0)))
      if ((error= my_errno) != ENOENT)
      enoent_or_zero= 0;                        // No error for ENOENT
    error= enoent_or_zero;

int handler::rename_table(const char * from, const char * to)
  for (const char **ext= bas_ext(); *ext ; ext++)
    if (rename_file_ext(from, to, *ext))
      if ((error=my_errno) != ENOENT)

void handler::drop_table(const char *name)
  while ((result == HA_ERR_END_OF_FILE) && !range_res);

  *range_info= mrr_cur_range.ptr;

/* **************************************************************************
 * DS-MRR implementation
 ***************************************************************************/

  DS-MRR: Initialize and start MRR scan

  Initialize and start the MRR scan. Depending on the mode parameter, this
  may use default or DS-MRR implementation.

  @param h               Table handler to be used
  @param key             Index to be used
  @param seq_funcs       Interval sequence enumeration functions
  @param seq_init_param  Interval sequence enumeration parameter
  @param n_ranges        Number of ranges in the sequence.
  @param mode            HA_MRR_* modes to use
  @param buf             INOUT Buffer to use

  @retval 0     Ok, Scan started.

int DsMrr_impl::dsmrr_init(handler *h_in, KEY *key,
                           RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
                           uint32_t n_ranges, uint32_t mode, HANDLER_BUFFER *buf)
  Item *pushed_cond= NULL;
  keyno= h_in->active_index;
  if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
    use_default_impl= true;
    return(h_in->handler::multi_range_read_init(seq_funcs, seq_init_param,
                                                n_ranges, mode, buf));
  rowids_buf= buf->buffer;
  //psergey-todo: don't add key_length as it is not needed anymore
  rowids_buf += key->key_length + h_in->ref_length;

  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
  rowids_buf_end= buf->buffer_end;
  elem_size= h_in->ref_length + (int)is_mrr_assoc * sizeof(void*);
  rowids_buf_last= rowids_buf +
                   ((rowids_buf_end - rowids_buf)/ elem_size)*
  rowids_buf_end= rowids_buf_last;

  /* Create a separate handler object to do rndpos() calls. */
  Session *session= current_session;
  if (!(new_h2= h_in->clone(session->mem_root)) ||
      new_h2->ha_external_lock(session, F_RDLCK))

  if (keyno == h_in->pushed_idx_cond_keyno)
    pushed_cond= h_in->pushed_idx_cond;
  if (h_in->ha_index_end())

  table->prepare_for_position();
  new_h2->extra(HA_EXTRA_KEYREAD);

  if (h2->ha_index_init(keyno, false) ||
      h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
  use_default_impl= false;
    h2->idx_cond_push(keyno, pushed_cond);
  if (dsmrr_fill_buffer(new_h2))
    If the above call has scanned through all intervals in *seq, then
    adjust *buf to indicate that the remaining buffer space will not be used.
    buf->end_of_used_area= rowids_buf_last;

  if (h_in->ha_rnd_init(false))

  h2->ha_index_or_rnd_end();
  h2->ha_external_lock(session, F_UNLCK);

void DsMrr_impl::dsmrr_close()
    h2->ha_external_lock(current_session, F_UNLCK);
  use_default_impl= true;

static int rowid_cmp(void *h, unsigned char *a, unsigned char *b)
  return ((handler*)h)->cmp_ref(a, b);

  DS-MRR: Fill the buffer with rowids and sort it by rowid

  {This is an internal function of DiskSweep MRR implementation}
  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
  buffer. When the buffer is full or scan is completed, sort the buffer by

  The function assumes that rowids buffer is empty when it is invoked.

  @param h  Table handler

  @retval 0      OK, the next portion of rowids is in the buffer,

int DsMrr_impl::dsmrr_fill_buffer(handler *)
  rowids_buf_cur= rowids_buf;
  while ((rowids_buf_cur < rowids_buf_end) &&
         !(res= h2->handler::multi_range_read_next(&range_info)))
    /* Put rowid, or {rowid, range_id} pair into the buffer */
    h2->position(table->record[0]);
    memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
    rowids_buf_cur += h->ref_length;
      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
      rowids_buf_cur += sizeof(void*);

  if (res && res != HA_ERR_END_OF_FILE)

  dsmrr_eof= test(res == HA_ERR_END_OF_FILE);

  /* Sort the buffer contents by rowid */
  uint32_t elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
  uint32_t n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;

  my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
  rowids_buf_last= rowids_buf_cur;
  rowids_buf_cur= rowids_buf;
  DS-MRR implementation: multi_range_read_next() function

int DsMrr_impl::dsmrr_next(handler *h_in, char **range_info)
  if (use_default_impl)
    return h_in->handler::multi_range_read_next(range_info);

  if (rowids_buf_cur == rowids_buf_last)
      res= HA_ERR_END_OF_FILE;
    res= dsmrr_fill_buffer(h);

  /* Return EOF if there are no rowids in the buffer after re-fill attempt */
  if (rowids_buf_cur == rowids_buf_last)
    res= HA_ERR_END_OF_FILE;

  res= h_in->rnd_pos(table->record[0], rowids_buf_cur);
  rowids_buf_cur += h_in->ref_length;
    memcpy(range_info, rowids_buf_cur, sizeof(void*));
    rowids_buf_cur += sizeof(void*);

  DS-MRR implementation: multi_range_read_info() function

int DsMrr_impl::dsmrr_info(uint32_t keyno, uint32_t n_ranges, uint32_t rows, uint32_t *bufsz,
                           uint32_t *flags, COST_VECT *cost)
  uint32_t def_flags= *flags;
  uint32_t def_bufsz= *bufsz;

  /* Get cost/flags/mem_usage of default MRR implementation */
  res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,

  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
      choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
    /* Default implementation is chosen */

  DS-MRR Implementation: multi_range_read_info_const() function

ha_rows DsMrr_impl::dsmrr_info_const(uint32_t keyno, RANGE_SEQ_IF *seq,
                                     void *seq_init_param, uint32_t n_ranges,
                                     uint32_t *bufsz, uint32_t *flags, COST_VECT *cost)
  uint32_t def_flags= *flags;
  uint32_t def_bufsz= *bufsz;
  /* Get cost/flags/mem_usage of default MRR implementation */
  rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
                                                n_ranges, &def_bufsz,
  if (rows == HA_POS_ERROR)
    /* Default implementation can't perform MRR scan => we can't either */

    If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
    use the default MRR implementation (we need it for UPDATE/DELETE).
    Otherwise, make a choice based on cost and @@optimizer_use_mrr.
  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
      choose_mrr_impl(keyno, rows, flags, bufsz, cost))
    *flags &= ~HA_MRR_USE_DEFAULT_IMPL;
  Check if key has partially-covered columns

  We can't use DS-MRR to perform range scans when the ranges are over
  partially-covered keys, because we'll not have full key part values
  (we'll have their prefixes from the index) and will not be able to check
  if we've reached the end of the range.

  @param keyno  Key to check

    Allow use of DS-MRR in cases where the index has partially-covered
    components but they are not used for scanning.

bool DsMrr_impl::key_uses_partial_cols(uint32_t keyno)
  KEY_PART_INFO *kp= table->key_info[keyno].key_part;
  KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
  for (; kp != kp_end; kp++)
    if (!kp->field->part_of_key.test(keyno))

  DS-MRR Internals: Choose between Default MRR implementation and DS-MRR

  Make the choice between using Default MRR implementation and DS-MRR.
  This function contains common functionality factored out of dsmrr_info()
  and dsmrr_info_const(). The function assumes that the default MRR
  implementation's applicability requirements are satisfied.

  @param keyno       Index number
  @param rows        E(full rows to be retrieved)
  @param flags  IN   MRR flags provided by the MRR user
                OUT  If DS-MRR is chosen, flags of DS-MRR implementation
                     else the value is not modified
  @param bufsz  IN   If DS-MRR is chosen, buffer use of DS-MRR implementation
                     else the value is not modified
  @param cost   IN   Cost of default MRR implementation
                OUT  If DS-MRR is chosen, cost of DS-MRR scan
                     else the value is not modified

  @retval true   Default MRR implementation should be used
  @retval false  DS-MRR implementation should be used

bool DsMrr_impl::choose_mrr_impl(uint32_t keyno, ha_rows rows, uint32_t *flags,
                                 uint32_t *bufsz, COST_VECT *cost)
  COST_VECT dsmrr_cost;
  Session *session= current_session;
  if ((session->variables.optimizer_use_mrr == 2) ||
      (*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
      (keyno == table->s->primary_key &&
       h->primary_key_is_clustered()) ||
      key_uses_partial_cols(keyno))
    /* Use the default implementation */
    *flags |= HA_MRR_USE_DEFAULT_IMPL;

  uint32_t add_len= table->key_info[keyno].key_length + h->ref_length;
  if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))

    If @@optimizer_use_mrr==force, then set the cost of DS-MRR to be the minimum of
    the DS-MRR and default implementation costs. This allows one to force use of
    DS-MRR whenever it is applicable without affecting other cost-based
  if ((force_dsmrr= (session->variables.optimizer_use_mrr == 1)) &&
      dsmrr_cost.total_cost() > cost->total_cost())

  if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
    *flags &= ~HA_MRR_USE_DEFAULT_IMPL;  /* Use the DS-MRR implementation */
    *flags &= ~HA_MRR_SORTED;            /* We will return unordered output */
    /* Use the default MRR implementation */

static void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost);
3067
Get cost of DS-MRR scan
3069
@param keynr Index to be used
3070
@param rows E(Number of rows to be scanned)
3071
@param flags Scan parameters (HA_MRR_* flags)
3072
@param buffer_size INOUT Buffer size
3073
@param cost OUT The cost
3076
@retval true Error, DS-MRR cannot be used (the buffer is too small
3080
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint32_t keynr, ha_rows rows, uint32_t flags,
3081
uint32_t *buffer_size, COST_VECT *cost)
3083
uint32_t max_buff_entries, elem_size;
3084
ha_rows rows_in_full_step, rows_in_last_step;
3085
uint32_t n_full_steps;
3086
double index_read_cost;
3088
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
3089
max_buff_entries = *buffer_size / elem_size;
3091
if (!max_buff_entries)
3092
return true; /* Buffer has not enough space for even 1 rowid */
3094
/* Number of iterations we'll make with full buffer */
3095
n_full_steps= (uint32_t)floor(rows2double(rows) / max_buff_entries);
3098
Get numbers of rows we'll be processing in
3099
- non-last sweep, with full buffer
3100
- last iteration, with non-full buffer
3102
rows_in_full_step= max_buff_entries;
3103
rows_in_last_step= rows % max_buff_entries;
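  /*
    Worked example (illustrative): with rows == 1000 and
    max_buff_entries == 300, n_full_steps == floor(1000/300) == 3,
    rows_in_full_step == 300 and rows_in_last_step == 1000 % 300 == 100,
    i.e. three sort-and-sweep passes with a full buffer plus one smaller
    final pass.
  */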
  /* Adjust buffer size if we expect to use only part of the buffer */
    get_sort_and_sweep_cost(table, rows, cost);
    cost->multiply(n_full_steps);
    *buffer_size= cmax((ulong)*buffer_size,
                       (size_t)(1.2*rows_in_last_step) * elem_size +
                       h->ref_length + table->key_info[keynr].key_length);

  COST_VECT last_step_cost;
  get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
  cost->add(&last_step_cost);

  if (n_full_steps != 0)
    cost->mem_cost= *buffer_size;
    cost->mem_cost= (double)rows_in_last_step * elem_size;

  /* Total cost of all index accesses */
  index_read_cost= h->index_only_read_time(keynr, (double)rows);
  cost->add_io(index_read_cost, 1 /* Random seeks */);

  Get cost of one sort-and-sweep step

    get_sort_and_sweep_cost()
      table   Table being accessed
      nrows   Number of rows to be sorted and retrieved

    Get cost of these operations:
    - sort an array of #nrows ROWIDs using qsort
    - read #nrows records from table in a sweep.

void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost)
    get_sweep_read_cost(table, nrows, false, cost);
    /* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
    double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
    cost->cpu_cost += cmp_op * log2(cmp_op);
  Get cost of reading nrows table records in a "disk sweep"

  A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made
  for an ordered sequence of rowids.

  We assume hard disk IO. The read is performed as follows:

   1. The disk head is moved to the needed cylinder
   2. The controller waits for the plate to rotate
   3. The data is transferred

  Time to do #3 is insignificant compared to #2+#1.

  Time to move the disk head is proportional to head travel distance.

  Time to wait for the plate to rotate depends on whether the disk head
  was moved before.
  If the disk head wasn't moved, the wait time is proportional to the distance
  between the previous block and the block we're reading.
  If the head was moved, we don't know how much we'll need to wait for the
  plate to rotate. We assume the wait time to be a variate with a mean of
  0.5 of full rotation time.

  Our cost units are "random disk seeks". The cost of a random disk seek is
  actually not a constant, it depends on the range of cylinders we're going
  to access. We make it constant by introducing a fuzzy concept of "typical
  datafile length" (it's fuzzy as it's hard to tell whether it should
  include index file, temp.tables etc). Then random seek cost is:

    1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length

  We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
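
  Illustrative reading of the formula (an interpretation, not from the
  original source): with DISK_SEEK_BASE_COST = 0.9, the remaining 0.1 of
  the unit seek cost is attributed to moving the head across, on average,
  one third of the typical data file length, which is what the
  DISK_SEEK_PROP_COST term in get_sweep_read_cost() below accounts for.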
  @param table        Table to be accessed
  @param nrows        Number of rows to retrieve
  @param interrupted  true <=> Assume that the disk sweep will be
                      interrupted by other disk IO. false - otherwise.
  @param cost         OUT  The cost.

void get_sweep_read_cost(Table *table, ha_rows nrows, bool interrupted,
  if (table->file->primary_key_is_clustered())
    cost->io_count= table->file->read_time(table->s->primary_key,
                                           (uint32_t) nrows, nrows);
      ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
      n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
    if (busy_blocks < 1.0)
    cost->io_count= busy_blocks;
      /* Assume reading is done in one 'sweep' */
      cost->avg_io_cost= (DISK_SEEK_BASE_COST +
                          DISK_SEEK_PROP_COST*n_blocks/busy_blocks);