23
23
Handler-calling-functions
30
#include "drizzled/my_hash.h"
26
#include "drizzled/server_includes.h"
27
#include "mysys/hash.h"
31
28
#include "drizzled/error.h"
32
29
#include "drizzled/gettext.h"
30
#include "drizzled/data_home.h"
33
31
#include "drizzled/probes.h"
34
32
#include "drizzled/sql_parse.h"
35
#include "drizzled/optimizer/cost_vector.h"
33
#include "drizzled/cost_vect.h"
36
34
#include "drizzled/session.h"
37
35
#include "drizzled/sql_base.h"
38
#include "drizzled/transaction_services.h"
39
#include "drizzled/replication_services.h"
36
#include "drizzled/replicator.h"
40
37
#include "drizzled/lock.h"
41
38
#include "drizzled/item/int.h"
42
39
#include "drizzled/item/empty_string.h"
40
#include "drizzled/unireg.h" // for mysql_frm_type
43
41
#include "drizzled/field/timestamp.h"
44
#include "drizzled/message/table.pb.h"
45
#include "drizzled/plugin/client.h"
46
#include "drizzled/internal/my_sys.h"
47
#include "drizzled/transaction_services.h"
42
#include "drizzled/serialize/table.pb.h"
49
44
using namespace std;
46
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NULL,0}, {NULL,0} };
48
/* number of entries in storage_engines[] */
50
/* number of storage engines (from storage_engines[]) that support 2pc */
51
uint32_t total_ha_2pc= 0;
52
/* size of savepoint storage area (see ha_init) */
53
uint32_t savepoint_alloc_size= 0;
55
const char *ha_row_type[] = {
56
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
59
const char *tx_isolation_names[] =
60
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
63
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
64
tx_isolation_names, NULL};
66
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
67
uint32_t known_extensions_id= 0;
71
Register handler error messages for use with my_error().
79
int ha_init_errors(void)
81
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
84
/* Allocate a pointer array for the error message strings. */
85
/* Zerofill it to avoid uninitialized gaps. */
86
if (! (errmsgs= (const char**) malloc(HA_ERR_ERRORS * sizeof(char*))))
88
memset(errmsgs, 0, HA_ERR_ERRORS * sizeof(char *));
90
/* Set the dedicated error messages. */
91
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
92
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
93
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
94
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
95
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
96
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
97
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
98
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
99
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
100
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
101
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
102
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
103
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
104
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
105
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
106
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
107
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
108
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
109
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
110
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
111
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
112
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
113
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
114
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
115
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
116
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
117
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
118
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
119
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
120
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
121
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
122
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
123
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
124
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
125
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
126
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
127
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
128
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
129
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
130
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
131
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
133
/* Register the error messages for use with my_error(). */
134
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
139
Unregister handler error messages.
146
static int ha_finish_errors(void)
148
const char **errmsgs;
150
/* Unregister the error messages; the returned pointer array is freed below. */
151
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
153
free((unsigned char*) errmsgs);
161
assert(total_ha < MAX_HA);
163
Check if there is a transaction-capable storage engine besides the
164
binary log (which is considered a transaction-capable storage engine in
167
savepoint_alloc_size+= sizeof(SAVEPOINT);
176
This should eventually be based on the graceful shutdown flag.
177
So if flag is equal to HA_PANIC_CLOSE, the deallocate
180
if (ha_finish_errors())
186
static bool dropdb_storage_engine(Session *,
190
StorageEngine *engine= plugin_data(plugin, StorageEngine *);
191
if (engine->is_enabled())
192
engine->drop_database((char *)path);
197
void ha_drop_database(char* path)
199
plugin_foreach(NULL, dropdb_storage_engine, DRIZZLE_STORAGE_ENGINE_PLUGIN, path);
203
static bool closecon_storage_engine(Session *session, plugin_ref plugin,
206
StorageEngine *engine= plugin_data(plugin, StorageEngine *);
208
there's no need to rollback here as all transactions must
209
be rolled back already
211
if (engine->is_enabled() &&
212
session_get_ha_data(session, engine))
213
engine->close_connection(session);
220
don't bother to rollback here, it's done already
222
void ha_close_connection(Session* session)
224
plugin_foreach(session, closecon_storage_engine, DRIZZLE_STORAGE_ENGINE_PLUGIN, 0);
227
/* ========================================================================
228
======================= TRANSACTIONS ===================================*/
231
Transaction handling in the server
232
==================================
234
In each client connection, MySQL maintains two transactional
236
- a statement transaction,
237
- a standard, also called normal transaction.
241
"Statement transaction" is a non-standard term that comes
242
from the times when MySQL supported BerkeleyDB storage engine.
244
First of all, it should be said that in BerkeleyDB auto-commit
245
mode auto-commits operations that are atomic to the storage
246
engine itself, such as a write of a record, and are too
247
high-granular to be atomic from the application perspective
248
(MySQL). One SQL statement could involve many BerkeleyDB
249
auto-committed operations and thus BerkeleyDB auto-commit was of
252
Secondly, instead of SQL standard savepoints, BerkeleyDB
253
provided the concept of "nested transactions". In a nutshell,
254
transactions could be arbitrarily nested, but when the parent
255
transaction was committed or aborted, all its child (nested)
256
transactions were committed or aborted as well.
257
Commit of a nested transaction, in turn, made its changes
258
visible, but not durable: it destroyed the nested transaction,
259
all its changes would become available to the parent and
260
currently active nested transactions of this parent.
262
So the mechanism of nested transactions was employed to
263
provide "all or nothing" guarantee of SQL statements
264
required by the standard.
265
A nested transaction would be created at start of each SQL
266
statement, and destroyed (committed or aborted) at statement
267
end. Such nested transaction was internally referred to as
268
a "statement transaction" and gave birth to the term.
270
<Historical note ends>
272
Since then a statement transaction is started for each statement
273
that accesses transactional tables or uses the binary log. If
274
the statement succeeds, the statement transaction is committed.
275
If the statement fails, the transaction is rolled back. Commits
276
of statement transactions are not durable -- each such
277
transaction is nested in the normal transaction, and if the
278
normal transaction is rolled back, the effects of all enclosed
279
statement transactions are undone as well. Technically,
280
a statement transaction can be viewed as a savepoint which is
281
maintained automatically in order to make effects of one
284
The normal transaction is started by the user and is ended
285
usually upon a user request as well. The normal transaction
286
encloses transactions of all statements issued between
287
its beginning and its end.
288
In autocommit mode, the normal transaction is equivalent
289
to the statement transaction.
291
Since MySQL supports PSEA (pluggable storage engine
292
architecture), more than one transactional engine can be
293
active at a time. Hence transactions, from the server
294
point of view, are always distributed. In particular,
295
transactional state is maintained independently for each
296
engine. In order to commit a transaction the two phase
297
commit protocol is employed.
299
Not all statements are executed in context of a transaction.
300
Administrative and status information statements do not modify
301
engine data, and thus do not start a statement transaction and
302
also have no effect on the normal transaction. Examples of such
303
statements are SHOW STATUS and RESET SLAVE.
305
Similarly DDL statements are not transactional,
306
and therefore a transaction is [almost] never started for a DDL
307
statement. The difference between a DDL statement and a purely
308
administrative statement though is that a DDL statement always
309
commits the current transaction before proceeding, if there is
312
At last, SQL statements that work with non-transactional
313
engines also have no effect on the transaction state of the
314
connection. Even though they are written to the binary log,
315
and the binary log is, overall, transactional, the writes
316
are done in "write-through" mode, directly to the binlog
317
file, followed by an OS cache sync, in other words,
318
bypassing the binlog undo log (translog).
319
They do not commit the current normal transaction.
320
A failure of a statement that uses non-transactional tables
321
would cause a rollback of the statement transaction, but
322
in case no transactional tables are used,
323
no statement transaction is started.
328
The server stores its transaction-related data in
329
session->transaction. This structure has two members of type
330
Session_TRANS. These members correspond to the statement and
331
normal transactions respectively:
333
- session->transaction.stmt contains a list of engines
334
that are participating in the given statement
335
- session->transaction.all contains a list of engines that
336
have participated in any of the statement transactions started
337
within the context of the normal transaction.
338
Each element of the list contains a pointer to the storage
339
engine, engine-specific transactional data, and engine-specific
342
In autocommit mode session->transaction.all is empty.
343
Instead, data of session->transaction.stmt is
344
used to commit/rollback the normal transaction.
346
The list of registered engines has a few important properties:
347
- no engine is registered in the list twice
348
- engines are present in the list in reverse temporal order --
349
new participants are always added to the beginning of the list.
351
Transaction life cycle
352
----------------------
354
When a new connection is established, session->transaction
355
members are initialized to an empty state.
356
If a statement uses any tables, all affected engines
357
are registered in the statement engine list. In
358
non-autocommit mode, the same engines are registered in
359
the normal transaction list.
360
At the end of the statement, the server issues a commit
361
or a roll back for all engines in the statement list.
362
At this point transaction flags of an engine, if any, are
363
propagated from the statement list to the list of the normal
365
When commit/rollback is finished, the statement list is
366
cleared. It will be filled in again by the next statement,
367
and emptied again at the next statement's end.
369
The normal transaction is committed in a similar way
370
(by going over all engines in session->transaction.all list)
371
but at different times:
372
- when a COMMIT SQL statement is issued by the user
373
- implicitly, by the server, at the beginning of a DDL statement
374
or SET AUTOCOMMIT={0|1} statement.
376
The normal transaction can be rolled back as well:
377
- if the user has requested so, by issuing ROLLBACK SQL
379
- if one of the storage engines requested a rollback
380
by setting session->transaction_rollback_request. This may
381
happen in case, e.g., when the transaction in the engine was
382
chosen as a victim of the internal deadlock resolution algorithm
383
and rolled back internally. When such a situation happens, there
384
is little the server can do and the only option is to rollback
385
transactions in all other participating engines. In this case
386
the rollback is accompanied by an error sent to the user.
388
As follows from the use cases above, the normal transaction
389
is never committed when there is an outstanding statement
390
transaction. In most cases there is no conflict, since
391
commits of the normal transaction are issued by a stand-alone
392
administrative or DDL statement, thus no outstanding statement
393
transaction of the previous statement exists. Besides,
394
all statements that manipulate the normal transaction
395
are prohibited in stored functions and triggers, therefore
396
no conflicting situation can occur in a sub-statement either.
397
The remaining rare cases when the server explicitly has
398
to commit the statement transaction prior to committing the normal
399
one cover error-handling scenarios (see for example
402
When committing a statement or a normal transaction, the server
403
either uses the two-phase commit protocol, or issues a commit
404
in each engine independently. The two-phase commit protocol
406
- all participating engines support two-phase commit (provide
407
StorageEngine::prepare PSEA API call) and
408
- transactions in at least two engines modify data (i.e. are
411
Note that the two phase commit is used for
412
statement transactions, even though they are not durable anyway.
413
This is done to ensure logical consistency of data in a multiple-
415
For example, imagine that some day MySQL supports unique
416
constraint checks deferred till the end of statement. In such
417
case a commit in one of the engines may yield ER_DUP_KEY,
418
and MySQL should be able to gracefully abort statement
419
transactions of other participants.
421
After the normal transaction has been committed,
422
session->transaction.all list is cleared.
424
When a connection is closed, the current normal transaction, if
427
Roles and responsibilities
428
--------------------------
430
The server has no way to know that an engine participates in
431
the statement and a transaction has been started
432
in it unless the engine says so. Thus, in order to be
433
a part of a transaction, the engine must "register" itself.
434
This is done by invoking trans_register_ha() server call.
435
Normally the engine registers itself whenever handler::external_lock()
436
is called. trans_register_ha() can be invoked many times: if
437
an engine is already registered, the call does nothing.
438
In case autocommit is not set, the engine must register itself
439
twice -- both in the statement list and in the normal transaction
441
In which list to register is a parameter of trans_register_ha().
443
Note, that although the registration interface in itself is
444
fairly clear, the current usage practice often leads to undesired
445
effects. E.g. since a call to trans_register_ha() in most engines
446
is embedded into implementation of handler::external_lock(), some
447
DDL statements start a transaction (at least from the server
448
point of view) even though they are not expected to. E.g.
449
CREATE TABLE does not start a transaction, since
450
handler::external_lock() is never called during CREATE TABLE. But
451
CREATE TABLE ... SELECT does, since handler::external_lock() is
452
called for the table that is being selected from. This has no
453
practical effects currently, but must be kept in mind
456
Once an engine is registered, the server will do the rest
459
During statement execution, whenever any of data-modifying
460
PSEA API methods is used, e.g. handler::write_row() or
461
handler::update_row(), the read-write flag is raised in the
462
statement transaction for the involved engine.
463
Currently all PSEA calls are "traced", and the data cannot be
464
changed in a way other than issuing a PSEA call. Important:
465
unless this invariant is preserved the server will not know that
466
a transaction in a given engine is read-write and will not
467
involve the two-phase commit protocol!
469
At the end of a statement, server call
470
ha_autocommit_or_rollback() is invoked. This call in turn
471
invokes StorageEngine::prepare() for every involved engine.
472
Prepare is followed by a call to StorageEngine::commit_one_phase()
473
If a one-phase commit will suffice, StorageEngine::prepare() is not
474
invoked and the server only calls StorageEngine::commit_one_phase().
475
At statement commit, the statement-related read-write engine
476
flag is propagated to the corresponding flag in the normal
477
transaction. When the commit is complete, the list of registered
480
Rollback is handled in a similar fashion.
482
Additional notes on DDL and the normal transaction.
483
---------------------------------------------------
485
DDLs and operations with non-transactional engines
486
do not "register" in session->transaction lists, and thus do not
487
modify the transaction state. Besides, each DDL in
488
MySQL is prefixed with an implicit normal transaction commit
489
(a call to Session::endActiveTransaction()), and thus leaves nothing
491
However, as it has been pointed out with CREATE TABLE .. SELECT,
492
some DDL statements can start a *new* transaction.
494
Behaviour of the server in this case is currently badly
496
DDL statements use a form of "semantic" logging
497
to maintain atomicity: if CREATE TABLE .. SELECT failed,
498
the newly created table is deleted.
499
In addition, some DDL statements issue interim transaction
500
commits: e.g. ALTER Table issues a commit after data is copied
501
from the original table to the internal temporary table. Other
502
statements, e.g. CREATE TABLE ... SELECT do not always commit
504
And finally there is a group of DDL statements such as
505
RENAME/DROP Table that do not start a new transaction
508
This diversity makes it hard to say what will happen if
509
by chance a stored function is invoked during a DDL --
510
whether any modifications it makes will be committed or not
511
is not clear. Fortunately, the SQL grammar of only a few DDLs allows
512
invocation of a stored function.
514
A consistent behaviour is perhaps to always commit the normal
515
transaction after all DDLs, just like the statement transaction
516
is always committed at the end of all statements.
520
Register a storage engine for a transaction.
522
Every storage engine MUST call this function when it starts
523
a transaction or a statement (that is it must be called both for the
524
"beginning of transaction" and "beginning of statement").
525
Only storage engines registered for the transaction/statement
526
will know when to commit/rollback it.
529
trans_register_ha is idempotent - storage engine may register many
530
times per transaction.
533
void trans_register_ha(Session *session, bool all, StorageEngine *engine)
535
Session_TRANS *trans;
536
Ha_trx_info *ha_info;
540
trans= &session->transaction.all;
541
session->server_status|= SERVER_STATUS_IN_TRANS;
544
trans= &session->transaction.stmt;
546
ha_info= session->ha_data[engine->slot].ha_info + static_cast<unsigned>(all);
548
if (ha_info->is_started())
549
return; /* already registered, return */
551
ha_info->register_ha(trans, engine);
553
trans->no_2pc|= not engine->has_2pc();
554
if (session->transaction.xid_state.xid.is_null())
555
session->transaction.xid_state.xid.set(session->query_id);
564
1 error, transaction was rolled back
566
int ha_prepare(Session *session)
569
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
570
Ha_trx_info *ha_info= trans->ha_list;
573
for (; ha_info; ha_info= ha_info->next())
576
StorageEngine *engine= ha_info->engine();
577
status_var_increment(session->status_var.ha_prepare_count);
578
if ((err= engine->prepare(session, all)))
580
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
581
ha_rollback_trans(session, all);
587
push_warning_printf(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
588
ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
589
ha_resolve_storage_engine_name(engine));
597
Check if we can skip the two-phase commit.
599
A helper function to evaluate if two-phase commit is mandatory.
600
As a side effect, propagates the read-only/read-write flags
601
of the statement transaction to its enclosing normal transaction.
603
@retval true we must run a two-phase commit. Returned
604
if we have at least two engines with read-write changes.
605
@retval false Don't need two-phase commit. Even if we have two
606
transactional engines, we can run two independent
607
commits if changes in one of the engines are read-only.
612
ha_check_and_coalesce_trx_read_only(Session *session, Ha_trx_info *ha_list,
615
/* The number of storage engines that have actual changes. */
616
unsigned rw_ha_count= 0;
617
Ha_trx_info *ha_info;
619
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
621
if (ha_info->is_trx_read_write())
626
Ha_trx_info *ha_info_all= &session->ha_data[ha_info->engine()->slot].ha_info[1];
627
assert(ha_info != ha_info_all);
629
Merge read-only/read-write information about statement
630
transaction to its enclosing normal transaction. Do this
631
only if in a real transaction -- that is, if we know
632
that ha_info_all is registered in session->transaction.all.
633
Since otherwise we only clutter the normal transaction flags.
635
if (ha_info_all->is_started()) /* false if autocommit. */
636
ha_info_all->coalesce_trx_with(ha_info);
638
else if (rw_ha_count > 1)
641
It is a normal transaction, so we don't need to merge read/write
642
information up, and the need for two-phase commit has been
643
already established. Break the loop prematurely.
648
return rw_ha_count > 1;
656
1 transaction was rolled back
658
2 error during commit, data may be inconsistent
661
Since we don't support nested statement transactions in 5.0,
662
we can't commit or rollback stmt transactions while we are inside
663
stored functions or triggers. So we simply do nothing now.
664
TODO: This should be fixed in later ( >= 5.1) releases.
666
int ha_commit_trans(Session *session, bool all)
668
int error= 0, cookie= 0;
670
'all' means that this is either an explicit commit issued by
671
user, or an implicit commit issued by a DDL.
673
Session_TRANS *trans= all ? &session->transaction.all : &session->transaction.stmt;
674
bool is_real_trans= all || session->transaction.all.ha_list == 0;
675
Ha_trx_info *ha_info= trans->ha_list;
678
We must not commit the normal transaction if a statement
679
transaction is pending. Otherwise statement transaction
680
flags will not get propagated to its normal transaction's
683
assert(session->transaction.stmt.ha_list == NULL ||
684
trans == &session->transaction.stmt);
690
if (is_real_trans && wait_if_global_read_lock(session, 0, 0))
692
ha_rollback_trans(session, all);
696
must_2pc= ha_check_and_coalesce_trx_read_only(session, ha_info, all);
698
if (!trans->no_2pc && must_2pc)
700
for (; ha_info && !error; ha_info= ha_info->next())
703
StorageEngine *engine= ha_info->engine();
705
Do not call two-phase commit if this particular
706
transaction is read-only. This allows for simpler
707
implementation in engines that are always read-only.
709
if (! ha_info->is_trx_read_write())
712
Sic: we know that prepare() is not NULL since otherwise
713
trans->no_2pc would have been set.
715
if ((err= engine->prepare(session, all)))
717
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
720
status_var_increment(session->status_var.ha_prepare_count);
724
ha_rollback_trans(session, all);
729
error=ha_commit_one_phase(session, all) ? (cookie ? 2 : 1) : 0;
732
start_waiting_global_read_lock(session);
739
This function does not care about global read lock. A caller should.
741
int ha_commit_one_phase(Session *session, bool all)
744
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
745
bool is_real_trans=all || session->transaction.all.ha_list == 0;
746
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
749
for (; ha_info; ha_info= ha_info_next)
752
StorageEngine *engine= ha_info->engine();
753
if ((err= engine->commit(session, all)))
755
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
758
status_var_increment(session->status_var.ha_commit_count);
759
ha_info_next= ha_info->next();
760
ha_info->reset(); /* keep it conveniently zero-filled */
765
session->transaction.xid_state.xid.null();
768
session->variables.tx_isolation=session->session_tx_isolation;
769
session->transaction.cleanup();
776
int ha_rollback_trans(Session *session, bool all)
779
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
780
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
781
bool is_real_trans=all || session->transaction.all.ha_list == 0;
784
We must not rollback the normal transaction if a statement
785
transaction is pending.
787
assert(session->transaction.stmt.ha_list == NULL ||
788
trans == &session->transaction.stmt);
792
for (; ha_info; ha_info= ha_info_next)
795
StorageEngine *engine= ha_info->engine();
796
if ((err= engine->rollback(session, all)))
798
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
801
status_var_increment(session->status_var.ha_rollback_count);
802
ha_info_next= ha_info->next();
803
ha_info->reset(); /* keep it conveniently zero-filled */
808
session->transaction.xid_state.xid.null();
811
session->variables.tx_isolation=session->session_tx_isolation;
812
session->transaction.cleanup();
816
session->transaction_rollback_request= false;
819
If a non-transactional table was updated, warn; don't warn if this is a
820
slave thread (because when a slave thread executes a ROLLBACK, it has
821
been read from the binary log, so it's 100% sure and normal to produce
822
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
823
slave SQL thread, it would not stop the thread but just be printed in
824
the error log; but we don't want users to wonder why they have this
825
message in the error log, so we don't send it.
827
if (is_real_trans && session->transaction.all.modified_non_trans_table && session->killed != Session::KILL_CONNECTION)
828
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
829
ER_WARNING_NOT_COMPLETE_ROLLBACK,
830
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
835
This is used to commit or rollback a single statement depending on
839
Note that if the autocommit is on, then the following call inside
840
InnoDB will commit or rollback the whole transaction (= the statement). The
841
autocommit mechanism built into InnoDB is based on counting locks, but if
842
the user has used LOCK TABLES then that mechanism does not know to do the
845
int ha_autocommit_or_rollback(Session *session, int error)
847
if (session->transaction.stmt.ha_list)
851
if (ha_commit_trans(session, 0))
856
(void) ha_rollback_trans(session, 0);
857
if (session->transaction_rollback_request)
858
(void) ha_rollback(session);
861
session->variables.tx_isolation=session->session_tx_isolation;
872
static bool xacommit_storage_engine(Session *,
876
StorageEngine *engine= plugin_data(plugin, StorageEngine *);
877
if (engine->is_enabled())
879
engine->commit_by_xid(((struct xaengine_st *)arg)->xid);
880
((struct xaengine_st *)arg)->result= 0;
885
static bool xarollback_storage_engine(Session *,
889
StorageEngine *engine= plugin_data(plugin, StorageEngine *);
890
if (engine->is_enabled())
892
engine->rollback_by_xid(((struct xaengine_st *)arg)->xid);
893
((struct xaengine_st *)arg)->result= 0;
899
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
901
struct xaengine_st xaop;
905
plugin_foreach(NULL, commit ? xacommit_storage_engine : xarollback_storage_engine,
906
DRIZZLE_STORAGE_ENGINE_PLUGIN, &xaop);
912
recover() step of xa.
915
there are three modes of operation:
916
- automatic recover after a crash
917
in this case commit_list != 0, tc_heuristic_recover==0
918
all xids from commit_list are committed, others are rolled back
919
- manual (heuristic) recover
920
in this case commit_list==0, tc_heuristic_recover != 0
921
DBA has explicitly specified that all prepared transactions should
922
be committed (or rolled back).
923
- no recovery (MySQL did not detect a crash)
924
in this case commit_list==0, tc_heuristic_recover == 0
925
there should be no prepared transactions in this case.
929
int len, found_foreign_xids, found_my_xids;
935
static bool xarecover_storage_engine(Session *,
939
StorageEngine *engine= plugin_data(plugin, StorageEngine *);
940
struct xarecover_st *info= (struct xarecover_st *) arg;
943
if (engine->is_enabled())
945
while ((got= engine->recover(info->list, info->len)) > 0 )
947
errmsg_printf(ERRMSG_LVL_INFO, _("Found %d prepared transaction(s) in %s"),
948
got, ha_resolve_storage_engine_name(engine));
949
for (int i=0; i < got; i ++)
951
my_xid x=info->list[i].get_my_xid();
952
if (!x) // not "mine" - that is generated by external TM
954
xid_cache_insert(info->list+i, XA_PREPARED);
955
info->found_foreign_xids++;
960
info->found_my_xids++;
964
if (info->commit_list ?
965
hash_search(info->commit_list, (unsigned char *)&x, sizeof(x)) != 0 :
966
tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT)
968
engine->commit_by_xid(info->list+i);
972
engine->rollback_by_xid(info->list+i);
982
int ha_recover(HASH *commit_list)
984
struct xarecover_st info;
985
info.found_foreign_xids= info.found_my_xids= 0;
986
info.commit_list= commit_list;
987
info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
990
/* commit_list and tc_heuristic_recover cannot be set both */
991
assert(info.commit_list==0 || tc_heuristic_recover==0);
992
/* if either is set, total_ha_2pc must be set too */
993
assert(info.dry_run);
995
if (total_ha_2pc <= 1)
998
if (info.commit_list)
999
errmsg_printf(ERRMSG_LVL_INFO, _("Starting crash recovery..."));
1002
#ifndef WILL_BE_DELETED_LATER
1005
for now, only InnoDB supports 2pc. It means we can always safely
1006
rollback all pending transactions, without risking inconsistent data
1009
assert(total_ha_2pc == 2); // only InnoDB and binlog
1010
tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
1015
for (info.len= MAX_XID_LIST_SIZE ;
1016
info.list==0 && info.len > MIN_XID_LIST_SIZE; info.len/=2)
1018
info.list=(XID *)malloc(info.len*sizeof(XID));
1022
errmsg_printf(ERRMSG_LVL_ERROR, ER(ER_OUTOFMEMORY), info.len*sizeof(XID));
1026
plugin_foreach(NULL, xarecover_storage_engine,
1027
DRIZZLE_STORAGE_ENGINE_PLUGIN, &info);
1029
free((unsigned char*)info.list);
1030
if (info.found_foreign_xids)
1031
errmsg_printf(ERRMSG_LVL_WARN, _("Found %d prepared XA transactions"),
1032
info.found_foreign_xids);
1033
if (info.dry_run && info.found_my_xids)
1035
errmsg_printf(ERRMSG_LVL_ERROR,
1036
_("Found %d prepared transactions! It means that drizzled "
1037
"was not shut down properly last time and critical "
1038
"recovery information (last binlog or %s file) was "
1039
"manually deleted after a crash. You have to start "
1040
"drizzled with the --tc-heuristic-recover switch to "
1041
"commit or rollback pending transactions."),
1042
info.found_my_xids, opt_tc_log_file);
1045
if (info.commit_list)
1046
errmsg_printf(ERRMSG_LVL_INFO, _("Crash recovery finished."));
1051
return the list of XID's to a client, the same way SHOW commands do.
1054
I didn't find in XA specs that an RM cannot return the same XID twice,
1055
so mysql_xa_recover does not filter XID's to ensure uniqueness.
1056
It can be easily fixed later, if necessary.
1058
bool mysql_xa_recover(Session *session)
1060
List<Item> field_list;
1061
Protocol *protocol= session->protocol;
1065
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1066
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1067
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1068
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1070
if (protocol->send_fields(&field_list,
1071
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1074
pthread_mutex_lock(&LOCK_xid_cache);
1075
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1077
if (xs->xa_state==XA_PREPARED)
1079
protocol->prepare_for_resend();
1080
protocol->store_int64_t((int64_t)xs->xid.formatID, false);
1081
protocol->store_int64_t((int64_t)xs->xid.gtrid_length, false);
1082
protocol->store_int64_t((int64_t)xs->xid.bqual_length, false);
1083
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1085
if (protocol->write())
1087
pthread_mutex_unlock(&LOCK_xid_cache);
1093
pthread_mutex_unlock(&LOCK_xid_cache);
1100
This function should be called when MySQL sends rows of a SELECT result set
1101
or the EOF mark to the client. It releases a possible adaptive hash index
1102
S-latch held by session in InnoDB and also releases a possible InnoDB query
1103
FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a session to
1104
keep them over several calls of the InnoDB handler interface when a join
1105
is executed. But when we let control pass to the client they have
1106
to be released because if the application program uses mysql_use_result(),
1107
it may deadlock on the S-latch if the application on another connection
1108
performs another SQL query. In MySQL-4.1 this is even more important because
1109
there a connection can have several SELECT queries open at the same time.
1111
@param session the thread handle of the current connection
1116
static bool release_temporary_latches(Session *session, plugin_ref plugin,
1119
StorageEngine *engine= plugin_data(plugin, StorageEngine *);
1121
if (engine->is_enabled())
1122
engine->release_temporary_latches(session);
1128
int ha_release_temporary_latches(Session *session)
1130
plugin_foreach(session, release_temporary_latches, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1136
int ha_rollback_to_savepoint(Session *session, SAVEPOINT *sv)
1139
Session_TRANS *trans= &session->transaction.all;
1140
Ha_trx_info *ha_info, *ha_info_next;
1144
rolling back to savepoint in all storage engines that were part of the
1145
transaction when the savepoint was set
1147
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
1150
StorageEngine *engine= ha_info->engine();
1152
if ((err= engine->savepoint_rollback(session,
1155
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1158
status_var_increment(session->status_var.ha_savepoint_rollback_count);
1159
trans->no_2pc|= not engine->has_2pc();
1162
rolling back the transaction in all storage engines that were not part of
1163
the transaction when the savepoint was set
1165
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
1166
ha_info= ha_info_next)
1169
StorageEngine *engine= ha_info->engine();
1170
if ((err= engine->rollback(session, !(0))))
1172
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1175
status_var_increment(session->status_var.ha_rollback_count);
1176
ha_info_next= ha_info->next();
1177
ha_info->reset(); /* keep it conveniently zero-filled */
1179
trans->ha_list= sv->ha_list;
1185
according to the sql standard (ISO/IEC 9075-2:2003)
1186
section "4.33.4 SQL-statements and transaction states",
1187
SAVEPOINT is *not* transaction-initiating SQL-statement
1189
int ha_savepoint(Session *session, SAVEPOINT *sv)
1192
Session_TRANS *trans= &session->transaction.all;
1193
Ha_trx_info *ha_info= trans->ha_list;
1194
for (; ha_info; ha_info= ha_info->next())
1197
StorageEngine *engine= ha_info->engine();
1199
/* if (! engine->savepoint_set)
1201
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1205
if ((err= engine->savepoint_set(session, (void *)(sv+1))))
1207
my_error(ER_GET_ERRNO, MYF(0), err);
1210
status_var_increment(session->status_var.ha_savepoint_count);
1213
Remember the list of registered storage engines. All new
1214
engines are prepended to the beginning of the list.
1216
sv->ha_list= trans->ha_list;
1220
int ha_release_savepoint(Session *session, SAVEPOINT *sv)
1223
Ha_trx_info *ha_info= sv->ha_list;
1225
for (; ha_info; ha_info= ha_info->next())
1228
StorageEngine *engine= ha_info->engine();
1229
/* Savepoint life time is enclosed into transaction life time. */
1231
if ((err= engine->savepoint_release(session,
1234
my_error(ER_GET_ERRNO, MYF(0), err);
1242
static bool snapshot_storage_engine(Session *session, plugin_ref plugin, void *arg)
1244
StorageEngine *engine= plugin_data(plugin, StorageEngine *);
1245
if (engine->is_enabled())
1247
engine->start_consistent_snapshot(session);
1248
*((bool *)arg)= false;
1253
int ha_start_consistent_snapshot(Session *session)
1257
plugin_foreach(session, snapshot_storage_engine, DRIZZLE_STORAGE_ENGINE_PLUGIN, &warn);
1260
Same idea as when one wants to CREATE TABLE in one engine which does not
1264
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
1265
"This Drizzle server does not support any "
1266
"consistent-read capable storage engine");
1271
static bool flush_storage_engine(Session *,
1275
StorageEngine *engine= plugin_data(plugin, StorageEngine *);
1276
if (engine->is_enabled() &&
1277
engine->flush_logs())
1283
bool ha_flush_logs(StorageEngine *engine)
1287
if (plugin_foreach(NULL, flush_storage_engine,
1288
DRIZZLE_STORAGE_ENGINE_PLUGIN, 0))
1293
if ((!engine->is_enabled()) ||
1294
(engine->flush_logs()))
1300
static const char *check_lowercase_names(handler *file, const char *path,
1303
if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1306
/* Ensure that table handler get path in lower case */
1307
if (tmp_path != path)
1308
strcpy(tmp_path, path);
1311
we only should turn into lowercase database/table part
1312
so start the process after homedirectory
1314
my_casedn_str(files_charset_info, tmp_path + drizzle_data_home_len);
1320
An interceptor to hijack the text of the error message without
1321
setting an error in the thread. We need the text to present it
1322
in the form of a warning to the user.
1325
struct Ha_delete_table_error_handler: public Internal_error_handler
1328
Ha_delete_table_error_handler() : Internal_error_handler() {}
1329
virtual bool handle_error(uint32_t sql_errno,
1330
const char *message,
1331
DRIZZLE_ERROR::enum_warning_level level,
1333
char buff[DRIZZLE_ERRMSG_SIZE];
1338
Ha_delete_table_error_handler::
1339
handle_error(uint32_t ,
1340
const char *message,
1341
DRIZZLE_ERROR::enum_warning_level ,
1344
/* Grab the error message */
1345
strncpy(buff, message, sizeof(buff)-1);
1350
struct storage_engine_delete_table_args {
1357
static bool deletetable_storage_engine(Session *,
1361
struct storage_engine_delete_table_args *dtargs= (struct storage_engine_delete_table_args *) args;
1363
Session *session= dtargs->session;
1364
const char *path= dtargs->path;
1367
char tmp_path[FN_REFLEN];
1369
if(dtargs->error!=ENOENT) /* already deleted table */
1372
StorageEngine *engine= plugin_data(plugin, StorageEngine *);
1377
if (!engine->is_enabled())
1380
if ((file= engine->create(NULL, session->mem_root)))
1385
path= check_lowercase_names(file, path, tmp_path);
1386
int error= file->ha_delete_table(path);
1390
dtargs->error= error;
1392
delete dtargs->file;
1403
This should return ENOENT if the file doesn't exist.
1404
The .frm file will be deleted only if we return 0 or ENOENT
1406
int ha_delete_table(Session *session, const char *path,
1407
const char *db, const char *alias, bool generate_warning)
1409
TABLE_SHARE dummy_share;
1412
struct storage_engine_delete_table_args dtargs;
1413
dtargs.error= ENOENT;
1414
dtargs.session= session;
1418
plugin_foreach(NULL, deletetable_storage_engine, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1421
memset(&dummy_table, 0, sizeof(dummy_table));
1422
memset(&dummy_share, 0, sizeof(dummy_share));
1423
dummy_table.s= &dummy_share;
1425
if (dtargs.error && generate_warning)
1428
Because file->print_error() use my_error() to generate the error message
1429
we use an internal error handler to intercept it and store the text
1430
in a temporary buffer. Later the message will be presented to user
1433
Ha_delete_table_error_handler ha_delete_table_error_handler;
1435
/* Fill up structures that print_error may need */
1436
dummy_share.path.str= (char*) path;
1437
dummy_share.path.length= strlen(path);
1438
dummy_share.db.str= (char*) db;
1439
dummy_share.db.length= strlen(db);
1440
dummy_share.table_name.str= (char*) alias;
1441
dummy_share.table_name.length= strlen(alias);
1442
dummy_table.alias= alias;
1446
handler *file= dtargs.file;
1447
file->change_table_ptr(&dummy_table, &dummy_share);
1449
session->push_internal_handler(&ha_delete_table_error_handler);
1450
file->print_error(dtargs.error, 0);
1452
session->pop_internal_handler();
1455
dtargs.error= -1; /* General form of fail. maybe bad FRM */
1458
XXX: should we convert *all* errors to warnings here?
1459
What if the error is fatal?
1461
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_ERROR, dtargs.error,
1462
ha_delete_table_error_handler.buff);
1468
return dtargs.error;
54
1471
/****************************************************************************
55
** General Cursor functions
1472
** General handler functions
56
1473
****************************************************************************/
57
Cursor::Cursor(plugin::StorageEngine &engine_arg,
58
TableShare &share_arg)
59
: table_share(&share_arg), table(0),
60
estimation_rows_to_insert(0), engine(&engine_arg),
61
ref(0), in_range_check_pushed_down(false),
62
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
63
ref_length(sizeof(internal::my_off_t)),
65
locked(false), implicit_emptied(0),
66
next_insert_id(0), insert_id_for_cur_row(0)
71
assert(locked == false);
72
/* TODO: assert(inited == NONE); */
76
Cursor *Cursor::clone(memory::Root *mem_root)
78
Cursor *new_handler= table->s->db_type()->getCursor(*table->s, mem_root);
1474
handler *handler::clone(MEM_ROOT *mem_root)
1476
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
81
Allocate Cursor->ref here because otherwise ha_open will allocate it
1478
Allocate handler->ref here because otherwise ha_open will allocate it
82
1479
on this->table->mem_root and we will not be able to reclaim that memory
83
when the clone Cursor object is destroyed.
1480
when the clone handler object is destroyed.
85
1482
if (!(new_handler->ref= (unsigned char*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
1136
3349
while ((result == HA_ERR_END_OF_FILE) && !range_res);
1138
3351
*range_info= mrr_cur_range.ptr;
3356
/* **************************************************************************
3357
* DS-MRR implementation
3358
***************************************************************************/
3361
DS-MRR: Initialize and start MRR scan
3363
Initialize and start the MRR scan. Depending on the mode parameter, this
3364
may use default or DS-MRR implementation.
3366
@param h Table handler to be used
3367
@param key Index to be used
3368
@param seq_funcs Interval sequence enumeration functions
3369
@param seq_init_param Interval sequence enumeration parameter
3370
@param n_ranges Number of ranges in the sequence.
3371
@param mode HA_MRR_* modes to use
3372
@param buf INOUT Buffer to use
3374
@retval 0 Ok, Scan started.
3378
int DsMrr_impl::dsmrr_init(handler *h_in, KEY *key,
3379
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3380
uint32_t n_ranges, uint32_t mode, HANDLER_BUFFER *buf)
3384
Item *pushed_cond= NULL;
3386
keyno= h_in->active_index;
3388
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3390
use_default_impl= true;
3391
return(h_in->handler::multi_range_read_init(seq_funcs, seq_init_param,
3392
n_ranges, mode, buf));
3394
rowids_buf= buf->buffer;
3395
//psergey-todo: don't add key_length as it is not needed anymore
3396
rowids_buf += key->key_length + h_in->ref_length;
3398
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3399
rowids_buf_end= buf->buffer_end;
3401
elem_size= h_in->ref_length + (int)is_mrr_assoc * sizeof(void*);
3402
rowids_buf_last= rowids_buf +
3403
((rowids_buf_end - rowids_buf)/ elem_size)*
3405
rowids_buf_end= rowids_buf_last;
3407
/* Create a separate handler object to do rndpos() calls. */
3408
Session *session= current_session;
3409
if (!(new_h2= h_in->clone(session->mem_root)) ||
3410
new_h2->ha_external_lock(session, F_RDLCK))
3416
if (keyno == h_in->pushed_idx_cond_keyno)
3417
pushed_cond= h_in->pushed_idx_cond;
3418
if (h_in->ha_index_end())
3425
table->prepare_for_position();
3426
new_h2->extra(HA_EXTRA_KEYREAD);
3428
if (h2->ha_index_init(keyno, false) ||
3429
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3432
use_default_impl= false;
3435
h2->idx_cond_push(keyno, pushed_cond);
3436
if (dsmrr_fill_buffer(new_h2))
3440
If the above call has scanned through all intervals in *seq, then
3441
adjust *buf to indicate that the remaining buffer space will not be used.
3444
buf->end_of_used_area= rowids_buf_last;
3446
if (h_in->ha_rnd_init(false))
3451
h2->ha_index_or_rnd_end();
3452
h2->ha_external_lock(session, F_UNLCK);
3459
void DsMrr_impl::dsmrr_close()
3463
h2->ha_external_lock(current_session, F_UNLCK);
3468
use_default_impl= true;
3473
static int rowid_cmp(void *h, unsigned char *a, unsigned char *b)
3475
return ((handler*)h)->cmp_ref(a, b);
3480
DS-MRR: Fill the buffer with rowids and sort it by rowid
3482
{This is an internal function of DiskSweep MRR implementation}
3483
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
3484
buffer. When the buffer is full or scan is completed, sort the buffer by
3487
The function assumes that rowids buffer is empty when it is invoked.
3489
@param h Table handler
3491
@retval 0 OK, the next portion of rowids is in the buffer,
3496
int DsMrr_impl::dsmrr_fill_buffer(handler *)
3501
rowids_buf_cur= rowids_buf;
3502
while ((rowids_buf_cur < rowids_buf_end) &&
3503
!(res= h2->handler::multi_range_read_next(&range_info)))
3505
/* Put rowid, or {rowid, range_id} pair into the buffer */
3506
h2->position(table->record[0]);
3507
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
3508
rowids_buf_cur += h->ref_length;
3512
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
3513
rowids_buf_cur += sizeof(void*);
3517
if (res && res != HA_ERR_END_OF_FILE)
3519
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
3521
/* Sort the buffer contents by rowid */
3522
uint32_t elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3523
uint32_t n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
3525
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
3527
rowids_buf_last= rowids_buf_cur;
3528
rowids_buf_cur= rowids_buf;
3534
DS-MRR implementation: multi_range_read_next() function
3537
int DsMrr_impl::dsmrr_next(handler *h_in, char **range_info)
3541
if (use_default_impl)
3542
return h_in->handler::multi_range_read_next(range_info);
3544
if (rowids_buf_cur == rowids_buf_last)
3548
res= HA_ERR_END_OF_FILE;
3551
res= dsmrr_fill_buffer(h);
3556
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
3557
if (rowids_buf_cur == rowids_buf_last)
3559
res= HA_ERR_END_OF_FILE;
3563
res= h_in->rnd_pos(table->record[0], rowids_buf_cur);
3564
rowids_buf_cur += h_in->ref_length;
3567
memcpy(range_info, rowids_buf_cur, sizeof(void*));
3568
rowids_buf_cur += sizeof(void*);
3579
DS-MRR implementation: multi_range_read_info() function
3581
int DsMrr_impl::dsmrr_info(uint32_t keyno, uint32_t n_ranges, uint32_t rows, uint32_t *bufsz,
3582
uint32_t *flags, COST_VECT *cost)
3585
uint32_t def_flags= *flags;
3586
uint32_t def_bufsz= *bufsz;
3588
/* Get cost/flags/mem_usage of default MRR implementation */
3589
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
3593
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3594
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
3596
/* Default implementation is chosen */
3605
DS-MRR Implementation: multi_range_read_info_const() function
3608
ha_rows DsMrr_impl::dsmrr_info_const(uint32_t keyno, RANGE_SEQ_IF *seq,
3609
void *seq_init_param, uint32_t n_ranges,
3610
uint32_t *bufsz, uint32_t *flags, COST_VECT *cost)
3613
uint32_t def_flags= *flags;
3614
uint32_t def_bufsz= *bufsz;
3615
/* Get cost/flags/mem_usage of default MRR implementation */
3616
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
3617
n_ranges, &def_bufsz,
3619
if (rows == HA_POS_ERROR)
3621
/* Default implementation can't perform MRR scan => we can't either */
3626
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
3627
use the default MRR implementation (we need it for UPDATE/DELETE).
3628
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
3630
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3631
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
3638
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
3645
Check if key has partially-covered columns
3647
We can't use DS-MRR to perform range scans when the ranges are over
3648
partially-covered keys, because we'll not have full key part values
3649
(we'll have their prefixes from the index) and will not be able to check
3650
if we've reached the end of the range.
3652
@param keyno Key to check
3655
Allow use of DS-MRR in cases where the index has partially-covered
3656
components but they are not used for scanning.
3662
bool DsMrr_impl::key_uses_partial_cols(uint32_t keyno)
3664
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
3665
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
3666
for (; kp != kp_end; kp++)
3668
if (!kp->field->part_of_key.is_set(keyno))
3676
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
3678
Make the choice between using Default MRR implementation and DS-MRR.
3679
This function contains common functionality factored out of dsmrr_info()
3680
and dsmrr_info_const(). The function assumes that the default MRR
3681
implementation's applicability requirements are satisfied.
3683
@param keyno Index number
3684
@param rows E(full rows to be retrieved)
3685
@param flags IN MRR flags provided by the MRR user
3686
OUT If DS-MRR is chosen, flags of DS-MRR implementation
3687
else the value is not modified
3688
@param bufsz IN If DS-MRR is chosen, buffer use of DS-MRR implementation
3689
else the value is not modified
3690
@param cost IN Cost of default MRR implementation
3691
OUT If DS-MRR is chosen, cost of DS-MRR scan
3692
else the value is not modified
3694
@retval true Default MRR implementation should be used
3695
@retval false DS-MRR implementation should be used
3698
bool DsMrr_impl::choose_mrr_impl(uint32_t keyno, ha_rows rows, uint32_t *flags,
3699
uint32_t *bufsz, COST_VECT *cost)
3701
COST_VECT dsmrr_cost;
3703
Session *session= current_session;
3704
if ((session->variables.optimizer_use_mrr == 2) ||
3705
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
3706
(keyno == table->s->primary_key &&
3707
h->primary_key_is_clustered()) ||
3708
key_uses_partial_cols(keyno))
3710
/* Use the default implementation */
3711
*flags |= HA_MRR_USE_DEFAULT_IMPL;
3715
uint32_t add_len= table->key_info[keyno].key_length + h->ref_length;
3717
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
3723
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
3724
DS-MRR and Default implementations cost. This allows one to force use of
3725
DS-MRR whenever it is applicable without affecting other cost-based
3728
if ((force_dsmrr= (session->variables.optimizer_use_mrr == 1)) &&
3729
dsmrr_cost.total_cost() > cost->total_cost())
3732
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
3734
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
3735
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
3741
/* Use the default MRR implementation */
3748
static void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost);
3752
Get cost of DS-MRR scan
3754
@param keynr Index to be used
3755
@param rows E(Number of rows to be scanned)
3756
@param flags Scan parameters (HA_MRR_* flags)
3757
@param buffer_size INOUT Buffer size
3758
@param cost OUT The cost
3761
@retval true Error, DS-MRR cannot be used (the buffer is too small
3765
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint32_t keynr, ha_rows rows, uint32_t flags,
3766
uint32_t *buffer_size, COST_VECT *cost)
3768
uint32_t max_buff_entries, elem_size;
3769
ha_rows rows_in_full_step, rows_in_last_step;
3770
uint32_t n_full_steps;
3771
double index_read_cost;
3773
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
3774
max_buff_entries = *buffer_size / elem_size;
3776
if (!max_buff_entries)
3777
return true; /* Buffer has not enough space for even 1 rowid */
3779
/* Number of iterations we'll make with full buffer */
3780
n_full_steps= (uint32_t)floor(rows2double(rows) / max_buff_entries);
3783
Get numbers of rows we'll be processing in
3784
- non-last sweep, with full buffer
3785
- last iteration, with non-full buffer
3787
rows_in_full_step= max_buff_entries;
3788
rows_in_last_step= rows % max_buff_entries;
3790
/* Adjust buffer size if we expect to use only part of the buffer */
3793
get_sort_and_sweep_cost(table, rows, cost);
3794
cost->multiply(n_full_steps);
3799
*buffer_size= cmax((ulong)*buffer_size,
3800
(size_t)(1.2*rows_in_last_step) * elem_size +
3801
h->ref_length + table->key_info[keynr].key_length);
3804
COST_VECT last_step_cost;
3805
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
3806
cost->add(&last_step_cost);
3808
if (n_full_steps != 0)
3809
cost->mem_cost= *buffer_size;
3811
cost->mem_cost= (double)rows_in_last_step * elem_size;
3813
/* Total cost of all index accesses */
3814
index_read_cost= h->index_only_read_time(keynr, (double)rows);
3815
cost->add_io(index_read_cost, 1 /* Random seeks */);
3821
Get cost of one sort-and-sweep step
3824
get_sort_and_sweep_cost()
3825
table Table being accessed
3826
nrows Number of rows to be sorted and retrieved
3830
Get cost of these operations:
3831
- sort an array of #nrows ROWIDs using qsort
3832
- read #nrows records from table in a sweep.
3836
void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost)
3840
get_sweep_read_cost(table, nrows, false, cost);
3841
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
3842
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
3845
cost->cpu_cost += cmp_op * log2(cmp_op);
3853
Get cost of reading nrows table records in a "disk sweep"
3855
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made
3856
for an ordered sequence of rowids.
3858
We assume hard disk IO. The read is performed as follows:
3860
1. The disk head is moved to the needed cylinder
3861
2. The controller waits for the plate to rotate
3862
3. The data is transferred
3864
Time to do #3 is insignificant compared to #2+#1.
3866
Time to move the disk head is proportional to head travel distance.
3868
Time to wait for the plate to rotate depends on whether the disk head
3871
If disk head wasn't moved, the wait time is proportional to distance
3872
between the previous block and the block we're reading.
3874
If the head was moved, we don't know how much we'll need to wait for the
3875
plate to rotate. We assume the wait time to be a variate with a mean of
3876
0.5 of full rotation time.
3878
Our cost units are "random disk seeks". The cost of random disk seek is
3879
actually not a constant, it depends on the range of cylinders we're going
3880
to access. We make it constant by introducing a fuzzy concept of "typical
3881
datafile length" (it's fuzzy as it's hard to tell whether it should
3882
include index file, temp.tables etc). Then random seek cost is:
3884
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
3886
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
3888
@param table Table to be accessed
3889
@param nrows Number of rows to retrieve
3890
@param interrupted true <=> Assume that the disk sweep will be
3891
interrupted by other disk IO. false - otherwise.
3892
@param cost OUT The cost.
3895
void get_sweep_read_cost(Table *table, ha_rows nrows, bool interrupted,
3899
if (table->file->primary_key_is_clustered())
3901
cost->io_count= table->file->read_time(table->s->primary_key,
3902
(uint32_t) nrows, nrows);
3907
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
3909
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
3910
if (busy_blocks < 1.0)
3913
cost->io_count= busy_blocks;
3917
/* Assume reading is done in one 'sweep' */
3918
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
3919
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);