23
23
Handler-calling-functions
30
#include "drizzled/my_hash.h"
31
#include "drizzled/error.h"
32
#include "drizzled/gettext.h"
33
#include "drizzled/probes.h"
34
#include "drizzled/sql_parse.h"
35
#include "drizzled/cost_vect.h"
36
#include "drizzled/session.h"
37
#include "drizzled/sql_base.h"
38
#include "drizzled/replication_services.h"
39
#include "drizzled/lock.h"
40
#include "drizzled/item/int.h"
41
#include "drizzled/item/empty_string.h"
42
#include "drizzled/field/timestamp.h"
43
#include "drizzled/message/table.pb.h"
44
#include "drizzled/plugin/client.h"
45
#include "drizzled/internal/my_sys.h"
46
#include "drizzled/transaction_services.h"
26
#include <drizzled/server_includes.h>
27
#include <libdrizzle/libdrizzle.h>
28
#include <mysys/hash.h>
29
#include <drizzled/error.h>
30
#include <drizzled/gettext.h>
31
#include <drizzled/data_home.h>
32
#include <drizzled/probes.h>
33
#include <drizzled/sql_parse.h>
34
#include <drizzled/cost_vect.h>
36
#include <drizzled/session.h>
37
#include <drizzled/sql_base.h>
38
#include <drizzled/replicator.h>
39
#include <drizzled/lock.h>
40
#include <drizzled/item/int.h>
41
#include <drizzled/item/empty_string.h>
42
#include <drizzled/unireg.h> // for mysql_frm_type
44
#if defined(CMATH_NAMESPACE)
45
using namespace CMATH_NAMESPACE;
49
extern HASH open_cache;
51
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NULL,0}, {NULL,0} };
53
/* number of entries in handlertons[] */
55
/* number of storage engines (from handlertons[]) that support 2pc */
56
uint32_t total_ha_2pc= 0;
57
/* size of savepoint storage area (see ha_init) */
58
uint32_t savepoint_alloc_size= 0;
60
const char *ha_row_type[] = {
61
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
64
const char *tx_isolation_names[] =
65
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
68
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
69
tx_isolation_names, NULL};
71
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
72
uint32_t known_extensions_id= 0;
76
Register handler error messages for use with my_error().
84
int ha_init_errors(void)
86
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
89
/* Allocate a pointer array for the error message strings. */
90
/* Zerofill it to avoid uninitialized gaps. */
91
if (! (errmsgs= (const char**) malloc(HA_ERR_ERRORS * sizeof(char*))))
93
memset(errmsgs, 0, HA_ERR_ERRORS * sizeof(char *));
95
/* Set the dedicated error messages. */
96
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
97
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
98
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
99
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
100
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
101
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
102
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
103
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
104
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
105
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
106
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
107
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
108
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
109
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
110
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
111
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
112
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
113
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
114
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
115
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
116
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
117
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
118
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
119
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
120
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
121
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
122
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
123
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
124
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
125
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
126
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
127
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
128
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
129
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
130
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
131
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
132
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
133
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
134
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
135
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
136
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
138
/* Register the error messages for use with my_error(). */
139
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
144
Unregister handler error messages.
151
static int ha_finish_errors(void)
153
const char **errmsgs;
155
/* Unregister the error messages and get back the pointer array so it can be freed. */
156
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
158
free((unsigned char*) errmsgs);
166
assert(total_ha < MAX_HA);
168
Check if there is a transaction-capable storage engine besides the
169
binary log (which is considered a transaction-capable storage engine in
172
opt_using_transactions= total_ha>(uint32_t)opt_bin_log;
173
savepoint_alloc_size+= sizeof(SAVEPOINT);
182
This should eventually be based on the graceful shutdown flag.
183
So if flag is equal to HA_PANIC_CLOSE, the deallocate
186
if (ha_finish_errors())
192
static bool dropdb_handlerton(Session *unused1 __attribute__((unused)),
196
handlerton *hton= plugin_data(plugin, handlerton *);
197
if (hton->state == SHOW_OPTION_YES && hton->drop_database)
198
hton->drop_database(hton, (char *)path);
203
void ha_drop_database(char* path)
205
plugin_foreach(NULL, dropdb_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, path);
209
static bool closecon_handlerton(Session *session, plugin_ref plugin,
210
void *unused __attribute__((unused)))
212
handlerton *hton= plugin_data(plugin, handlerton *);
214
there's no need to rollback here as all transactions must
215
be rolled back already
217
if (hton->state == SHOW_OPTION_YES && hton->close_connection &&
218
session_get_ha_data(session, hton))
219
hton->close_connection(hton, session);
226
don't bother to rollback here, it's done already
228
void ha_close_connection(Session* session)
230
plugin_foreach(session, closecon_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, 0);
233
/* ========================================================================
234
======================= TRANSACTIONS ===================================*/
237
Transaction handling in the server
238
==================================
240
In each client connection, MySQL maintains two transactional
242
- a statement transaction,
243
- a standard, also called normal transaction.
247
"Statement transaction" is a non-standard term that comes
248
from the times when MySQL supported BerkeleyDB storage engine.
250
First of all, it should be said that in BerkeleyDB auto-commit
251
mode auto-commits operations that are atomic to the storage
252
engine itself, such as a write of a record, and are too
253
fine-grained to be atomic from the application perspective
254
(MySQL). One SQL statement could involve many BerkeleyDB
255
auto-committed operations and thus BerkeleyDB auto-commit was of
258
Secondly, instead of SQL standard savepoints, BerkeleyDB
259
provided the concept of "nested transactions". In a nutshell,
260
transactions could be arbitrarily nested, but when the parent
261
transaction was committed or aborted, all its child (nested)
262
transactions were committed or aborted as well.
263
Commit of a nested transaction, in turn, made its changes
264
visible, but not durable: it destroyed the nested transaction,
265
all its changes would become available to the parent and
266
currently active nested transactions of this parent.
268
So the mechanism of nested transactions was employed to
269
provide "all or nothing" guarantee of SQL statements
270
required by the standard.
271
A nested transaction would be created at start of each SQL
272
statement, and destroyed (committed or aborted) at statement
273
end. Such nested transaction was internally referred to as
274
a "statement transaction" and gave birth to the term.
276
<Historical note ends>
278
Since then a statement transaction is started for each statement
279
that accesses transactional tables or uses the binary log. If
280
the statement succeeds, the statement transaction is committed.
281
If the statement fails, the transaction is rolled back. Commits
282
of statement transactions are not durable -- each such
283
transaction is nested in the normal transaction, and if the
284
normal transaction is rolled back, the effects of all enclosed
285
statement transactions are undone as well. Technically,
286
a statement transaction can be viewed as a savepoint which is
287
maintained automatically in order to make effects of one
290
The normal transaction is started by the user and is ended
291
usually upon a user request as well. The normal transaction
292
encloses transactions of all statements issued between
293
its beginning and its end.
294
In autocommit mode, the normal transaction is equivalent
295
to the statement transaction.
297
Since MySQL supports PSEA (pluggable storage engine
298
architecture), more than one transactional engine can be
299
active at a time. Hence transactions, from the server
300
point of view, are always distributed. In particular,
301
transactional state is maintained independently for each
302
engine. In order to commit a transaction the two phase
303
commit protocol is employed.
305
Not all statements are executed in context of a transaction.
306
Administrative and status information statements do not modify
307
engine data, and thus do not start a statement transaction and
308
also have no effect on the normal transaction. Examples of such
309
statements are SHOW STATUS and RESET SLAVE.
311
Similarly DDL statements are not transactional,
312
and therefore a transaction is [almost] never started for a DDL
313
statement. The difference between a DDL statement and a purely
314
administrative statement though is that a DDL statement always
315
commits the current transaction before proceeding, if there is
318
At last, SQL statements that work with non-transactional
319
engines also have no effect on the transaction state of the
320
connection. Even though they are written to the binary log,
321
and the binary log is, overall, transactional, the writes
322
are done in "write-through" mode, directly to the binlog
323
file, followed by an OS cache sync, in other words,
324
bypassing the binlog undo log (translog).
325
They do not commit the current normal transaction.
326
A failure of a statement that uses non-transactional tables
327
would cause a rollback of the statement transaction, but
328
in case no transactional tables are used,
329
no statement transaction is started.
334
The server stores its transaction-related data in
335
session->transaction. This structure has two members of type
336
Session_TRANS. These members correspond to the statement and
337
normal transactions respectively:
339
- session->transaction.stmt contains a list of engines
340
that are participating in the given statement
341
- session->transaction.all contains a list of engines that
342
have participated in any of the statement transactions started
343
within the context of the normal transaction.
344
Each element of the list contains a pointer to the storage
345
engine, engine-specific transactional data, and engine-specific
348
In autocommit mode session->transaction.all is empty.
349
Instead, data of session->transaction.stmt is
350
used to commit/rollback the normal transaction.
352
The list of registered engines has a few important properties:
353
- no engine is registered in the list twice
354
- engines are present in the list in reverse temporal order --
355
new participants are always added to the beginning of the list.
357
Transaction life cycle
358
----------------------
360
When a new connection is established, session->transaction
361
members are initialized to an empty state.
362
If a statement uses any tables, all affected engines
363
are registered in the statement engine list. In
364
non-autocommit mode, the same engines are registered in
365
the normal transaction list.
366
At the end of the statement, the server issues a commit
367
or a roll back for all engines in the statement list.
368
At this point transaction flags of an engine, if any, are
369
propagated from the statement list to the list of the normal
371
When commit/rollback is finished, the statement list is
372
cleared. It will be filled in again by the next statement,
373
and emptied again at the next statement's end.
375
The normal transaction is committed in a similar way
376
(by going over all engines in session->transaction.all list)
377
but at different times:
378
- when a COMMIT SQL statement is issued by the user
379
- implicitly, by the server, at the beginning of a DDL statement
380
or SET AUTOCOMMIT={0|1} statement.
382
The normal transaction can be rolled back as well:
383
- if the user has requested so, by issuing ROLLBACK SQL
385
- if one of the storage engines requested a rollback
386
by setting session->transaction_rollback_request. This may
387
happen in case, e.g., when the transaction in the engine was
388
chosen as a victim of the internal deadlock resolution algorithm
389
and rolled back internally. When such a situation happens, there
390
is little the server can do and the only option is to rollback
391
transactions in all other participating engines. In this case
392
the rollback is accompanied by an error sent to the user.
394
As follows from the use cases above, the normal transaction
395
is never committed when there is an outstanding statement
396
transaction. In most cases there is no conflict, since
397
commits of the normal transaction are issued by a stand-alone
398
administrative or DDL statement, thus no outstanding statement
399
transaction of the previous statement exists. Besides,
400
all statements that manipulate the normal transaction
401
are prohibited in stored functions and triggers, therefore
402
no conflicting situation can occur in a sub-statement either.
403
The remaining rare cases when the server explicitly has
404
to commit the statement transaction prior to committing the normal
405
one cover error-handling scenarios (see for example
408
When committing a statement or a normal transaction, the server
409
either uses the two-phase commit protocol, or issues a commit
410
in each engine independently. The two-phase commit protocol
412
- all participating engines support two-phase commit (provide
413
handlerton::prepare PSEA API call) and
414
- transactions in at least two engines modify data (i.e. are
417
Note that the two phase commit is used for
418
statement transactions, even though they are not durable anyway.
419
This is done to ensure logical consistency of data in a multiple-
421
For example, imagine that some day MySQL supports unique
422
constraint checks deferred till the end of statement. In such
423
case a commit in one of the engines may yield ER_DUP_KEY,
424
and MySQL should be able to gracefully abort statement
425
transactions of other participants.
427
After the normal transaction has been committed,
428
session->transaction.all list is cleared.
430
When a connection is closed, the current normal transaction, if
433
Roles and responsibilities
434
--------------------------
436
The server has no way to know that an engine participates in
437
the statement and a transaction has been started
438
in it unless the engine says so. Thus, in order to be
439
a part of a transaction, the engine must "register" itself.
440
This is done by invoking trans_register_ha() server call.
441
Normally the engine registers itself whenever handler::external_lock()
442
is called. trans_register_ha() can be invoked many times: if
443
an engine is already registered, the call does nothing.
444
In case autocommit is not set, the engine must register itself
445
twice -- both in the statement list and in the normal transaction
447
In which list to register is a parameter of trans_register_ha().
449
Note, that although the registration interface in itself is
450
fairly clear, the current usage practice often leads to undesired
451
effects. E.g. since a call to trans_register_ha() in most engines
452
is embedded into implementation of handler::external_lock(), some
453
DDL statements start a transaction (at least from the server
454
point of view) even though they are not expected to. E.g.
455
CREATE TABLE does not start a transaction, since
456
handler::external_lock() is never called during CREATE TABLE. But
457
CREATE TABLE ... SELECT does, since handler::external_lock() is
458
called for the table that is being selected from. This has no
459
practical effects currently, but must be kept in mind
462
Once an engine is registered, the server will do the rest
465
During statement execution, whenever any of data-modifying
466
PSEA API methods is used, e.g. handler::write_row() or
467
handler::update_row(), the read-write flag is raised in the
468
statement transaction for the involved engine.
469
Currently all PSEA calls are "traced", and the data cannot be
470
changed in a way other than issuing a PSEA call. Important:
471
unless this invariant is preserved the server will not know that
472
a transaction in a given engine is read-write and will not
473
involve the two-phase commit protocol!
475
At the end of a statement, server call
476
ha_autocommit_or_rollback() is invoked. This call in turn
477
invokes handlerton::prepare() for every involved engine.
478
Prepare is followed by a call to handlerton::commit_one_phase()
479
If a one-phase commit will suffice, handlerton::prepare() is not
480
invoked and the server only calls handlerton::commit_one_phase().
481
At statement commit, the statement-related read-write engine
482
flag is propagated to the corresponding flag in the normal
483
transaction. When the commit is complete, the list of registered
486
Rollback is handled in a similar fashion.
488
Additional notes on DDL and the normal transaction.
489
---------------------------------------------------
491
DDLs and operations with non-transactional engines
492
do not "register" in session->transaction lists, and thus do not
493
modify the transaction state. Besides, each DDL in
494
MySQL is prefixed with an implicit normal transaction commit
495
(a call to end_active_trans()), and thus leaves nothing
497
However, as it has been pointed out with CREATE TABLE .. SELECT,
498
some DDL statements can start a *new* transaction.
500
Behaviour of the server in this case is currently badly
502
DDL statements use a form of "semantic" logging
503
to maintain atomicity: if CREATE TABLE .. SELECT failed,
504
the newly created table is deleted.
505
In addition, some DDL statements issue interim transaction
506
commits: e.g. ALTER Table issues a commit after data is copied
507
from the original table to the internal temporary table. Other
508
statements, e.g. CREATE TABLE ... SELECT do not always commit
510
And finally there is a group of DDL statements such as
511
RENAME/DROP Table that doesn't start a new transaction
514
This diversity makes it hard to say what will happen if
515
by chance a stored function is invoked during a DDL --
516
whether any modifications it makes will be committed or not
517
is not clear. Fortunately, SQL grammar of few DDLs allows
518
invocation of a stored function.
520
A consistent behaviour is perhaps to always commit the normal
521
transaction after all DDLs, just like the statement transaction
522
is always committed at the end of all statements.
526
Register a storage engine for a transaction.
528
Every storage engine MUST call this function when it starts
529
a transaction or a statement (that is it must be called both for the
530
"beginning of transaction" and "beginning of statement").
531
Only storage engines registered for the transaction/statement
532
will know when to commit/rollback it.
535
trans_register_ha is idempotent - storage engine may register many
536
times per transaction.
539
void trans_register_ha(Session *session, bool all, handlerton *ht_arg)
541
Session_TRANS *trans;
542
Ha_trx_info *ha_info;
546
trans= &session->transaction.all;
547
session->server_status|= SERVER_STATUS_IN_TRANS;
550
trans= &session->transaction.stmt;
552
ha_info= session->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
554
if (ha_info->is_started())
555
return; /* already registered, return */
557
ha_info->register_ha(trans, ht_arg);
559
trans->no_2pc|=(ht_arg->prepare==0);
560
if (session->transaction.xid_state.xid.is_null())
561
session->transaction.xid_state.xid.set(session->query_id);
570
1 error, transaction was rolled back
572
int ha_prepare(Session *session)
575
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
576
Ha_trx_info *ha_info= trans->ha_list;
579
for (; ha_info; ha_info= ha_info->next())
582
handlerton *ht= ha_info->ht();
583
status_var_increment(session->status_var.ha_prepare_count);
586
if ((err= ht->prepare(ht, session, all)))
588
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
589
ha_rollback_trans(session, all);
596
push_warning_printf(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
597
ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
598
ha_resolve_storage_engine_name(ht));
606
Check if we can skip the two-phase commit.
608
A helper function to evaluate if two-phase commit is mandatory.
609
As a side effect, propagates the read-only/read-write flags
610
of the statement transaction to its enclosing normal transaction.
612
@retval true we must run a two-phase commit. Returned
613
if we have at least two engines with read-write changes.
614
@retval false Don't need two-phase commit. Even if we have two
615
transactional engines, we can run two independent
616
commits if changes in one of the engines are read-only.
621
ha_check_and_coalesce_trx_read_only(Session *session, Ha_trx_info *ha_list,
624
/* The number of storage engines that have actual changes. */
625
unsigned rw_ha_count= 0;
626
Ha_trx_info *ha_info;
628
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
630
if (ha_info->is_trx_read_write())
635
Ha_trx_info *ha_info_all= &session->ha_data[ha_info->ht()->slot].ha_info[1];
636
assert(ha_info != ha_info_all);
638
Merge read-only/read-write information about statement
639
transaction to its enclosing normal transaction. Do this
640
only if in a real transaction -- that is, if we know
641
that ha_info_all is registered in session->transaction.all.
642
Since otherwise we only clutter the normal transaction flags.
644
if (ha_info_all->is_started()) /* false if autocommit. */
645
ha_info_all->coalesce_trx_with(ha_info);
647
else if (rw_ha_count > 1)
650
It is a normal transaction, so we don't need to merge read/write
651
information up, and the need for two-phase commit has been
652
already established. Break the loop prematurely.
657
return rw_ha_count > 1;
665
1 transaction was rolled back
667
2 error during commit, data may be inconsistent
670
Since we don't support nested statement transactions in 5.0,
671
we can't commit or rollback stmt transactions while we are inside
672
stored functions or triggers. So we simply do nothing now.
673
TODO: This should be fixed in later ( >= 5.1) releases.
675
int ha_commit_trans(Session *session, bool all)
677
int error= 0, cookie= 0;
679
'all' means that this is either an explicit commit issued by
680
user, or an implicit commit issued by a DDL.
682
Session_TRANS *trans= all ? &session->transaction.all : &session->transaction.stmt;
683
bool is_real_trans= all || session->transaction.all.ha_list == 0;
684
Ha_trx_info *ha_info= trans->ha_list;
685
my_xid xid= session->transaction.xid_state.xid.get_my_xid();
688
We must not commit the normal transaction if a statement
689
transaction is pending. Otherwise statement transaction
690
flags will not get propagated to its normal transaction's
693
assert(session->transaction.stmt.ha_list == NULL ||
694
trans == &session->transaction.stmt);
700
if (is_real_trans && wait_if_global_read_lock(session, 0, 0))
702
ha_rollback_trans(session, all);
706
must_2pc= ha_check_and_coalesce_trx_read_only(session, ha_info, all);
708
if (!trans->no_2pc && must_2pc)
710
for (; ha_info && !error; ha_info= ha_info->next())
713
handlerton *ht= ha_info->ht();
715
Do not call two-phase commit if this particular
716
transaction is read-only. This allows for simpler
717
implementation in engines that are always read-only.
719
if (! ha_info->is_trx_read_write())
722
Sic: we know that prepare() is not NULL since otherwise
723
trans->no_2pc would have been set.
725
if ((err= ht->prepare(ht, session, all)))
727
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
730
status_var_increment(session->status_var.ha_prepare_count);
732
if (error || (is_real_trans && xid &&
733
(error= !(cookie= tc_log->log_xid(session, xid)))))
735
ha_rollback_trans(session, all);
740
error=ha_commit_one_phase(session, all) ? (cookie ? 2 : 1) : 0;
742
tc_log->unlog(cookie, xid);
745
start_waiting_global_read_lock(session);
752
This function does not care about global read lock. A caller should.
754
int ha_commit_one_phase(Session *session, bool all)
757
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
758
bool is_real_trans=all || session->transaction.all.ha_list == 0;
759
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
762
for (; ha_info; ha_info= ha_info_next)
765
handlerton *ht= ha_info->ht();
766
if ((err= ht->commit(ht, session, all)))
768
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
771
status_var_increment(session->status_var.ha_commit_count);
772
ha_info_next= ha_info->next();
773
ha_info->reset(); /* keep it conveniently zero-filled */
778
session->transaction.xid_state.xid.null();
781
session->variables.tx_isolation=session->session_tx_isolation;
782
session->transaction.cleanup();
789
int ha_rollback_trans(Session *session, bool all)
792
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
793
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
794
bool is_real_trans=all || session->transaction.all.ha_list == 0;
797
We must not rollback the normal transaction if a statement
798
transaction is pending.
800
assert(session->transaction.stmt.ha_list == NULL ||
801
trans == &session->transaction.stmt);
805
for (; ha_info; ha_info= ha_info_next)
808
handlerton *ht= ha_info->ht();
809
if ((err= ht->rollback(ht, session, all)))
811
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
814
status_var_increment(session->status_var.ha_rollback_count);
815
ha_info_next= ha_info->next();
816
ha_info->reset(); /* keep it conveniently zero-filled */
821
session->transaction.xid_state.xid.null();
824
session->variables.tx_isolation=session->session_tx_isolation;
825
session->transaction.cleanup();
829
session->transaction_rollback_request= false;
832
If a non-transactional table was updated, warn; don't warn if this is a
833
slave thread (because when a slave thread executes a ROLLBACK, it has
834
been read from the binary log, so it's 100% sure and normal to produce
835
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
836
slave SQL thread, it would not stop the thread but just be printed in
837
the error log; but we don't want users to wonder why they have this
838
message in the error log, so we don't send it.
840
if (is_real_trans && session->transaction.all.modified_non_trans_table &&
841
!session->slave_thread && session->killed != Session::KILL_CONNECTION)
842
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
843
ER_WARNING_NOT_COMPLETE_ROLLBACK,
844
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
849
This is used to commit or rollback a single statement depending on
853
Note that if the autocommit is on, then the following call inside
854
InnoDB will commit or rollback the whole transaction (= the statement). The
855
autocommit mechanism built into InnoDB is based on counting locks, but if
856
the user has used LOCK TABLES then that mechanism does not know to do the
859
int ha_autocommit_or_rollback(Session *session, int error)
861
if (session->transaction.stmt.ha_list)
865
if (ha_commit_trans(session, 0))
870
(void) ha_rollback_trans(session, 0);
871
if (session->transaction_rollback_request)
872
(void) ha_rollback(session);
875
session->variables.tx_isolation=session->session_tx_isolation;
886
static bool xacommit_handlerton(Session *unused1 __attribute__((unused)),
890
handlerton *hton= plugin_data(plugin, handlerton *);
891
if (hton->state == SHOW_OPTION_YES && hton->recover)
893
hton->commit_by_xid(hton, ((struct xahton_st *)arg)->xid);
894
((struct xahton_st *)arg)->result= 0;
899
static bool xarollback_handlerton(Session *unused1 __attribute__((unused)),
903
handlerton *hton= plugin_data(plugin, handlerton *);
904
if (hton->state == SHOW_OPTION_YES && hton->recover)
906
hton->rollback_by_xid(hton, ((struct xahton_st *)arg)->xid);
907
((struct xahton_st *)arg)->result= 0;
913
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
915
struct xahton_st xaop;
919
plugin_foreach(NULL, commit ? xacommit_handlerton : xarollback_handlerton,
920
DRIZZLE_STORAGE_ENGINE_PLUGIN, &xaop);
926
recover() step of xa.
929
there are three modes of operation:
930
- automatic recover after a crash
931
in this case commit_list != 0, tc_heuristic_recover==0
932
all xids from commit_list are committed, others are rolled back
933
- manual (heuristic) recover
934
in this case commit_list==0, tc_heuristic_recover != 0
935
DBA has explicitly specified that all prepared transactions should
936
be committed (or rolled back).
937
- no recovery (MySQL did not detect a crash)
938
in this case commit_list==0, tc_heuristic_recover == 0
939
there should be no prepared transactions in this case.
943
int len, found_foreign_xids, found_my_xids;
949
static bool xarecover_handlerton(Session *unused __attribute__((unused)),
953
handlerton *hton= plugin_data(plugin, handlerton *);
954
struct xarecover_st *info= (struct xarecover_st *) arg;
957
if (hton->state == SHOW_OPTION_YES && hton->recover)
959
while ((got= hton->recover(hton, info->list, info->len)) > 0 )
961
errmsg_printf(ERRMSG_LVL_INFO, _("Found %d prepared transaction(s) in %s"),
962
got, ha_resolve_storage_engine_name(hton));
963
for (int i=0; i < got; i ++)
965
my_xid x=info->list[i].get_my_xid();
966
if (!x) // not "mine" - that is generated by external TM
968
xid_cache_insert(info->list+i, XA_PREPARED);
969
info->found_foreign_xids++;
974
info->found_my_xids++;
978
if (info->commit_list ?
979
hash_search(info->commit_list, (unsigned char *)&x, sizeof(x)) != 0 :
980
tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT)
982
hton->commit_by_xid(hton, info->list+i);
986
hton->rollback_by_xid(hton, info->list+i);
996
int ha_recover(HASH *commit_list)
998
struct xarecover_st info;
999
info.found_foreign_xids= info.found_my_xids= 0;
1000
info.commit_list= commit_list;
1001
info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
1004
/* commit_list and tc_heuristic_recover cannot be set both */
1005
assert(info.commit_list==0 || tc_heuristic_recover==0);
1006
/* if either is set, total_ha_2pc must be set too */
1007
assert(info.dry_run || total_ha_2pc>(uint32_t)opt_bin_log);
1009
if (total_ha_2pc <= (uint32_t)opt_bin_log)
1012
if (info.commit_list)
1013
errmsg_printf(ERRMSG_LVL_INFO, _("Starting crash recovery..."));
1016
#ifndef WILL_BE_DELETED_LATER
1019
for now, only InnoDB supports 2pc. It means we can always safely
1020
rollback all pending transactions, without risking inconsistent data
1023
assert(total_ha_2pc == (uint32_t) opt_bin_log+1); // only InnoDB and binlog
1024
tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
1029
for (info.len= MAX_XID_LIST_SIZE ;
1030
info.list==0 && info.len > MIN_XID_LIST_SIZE; info.len/=2)
1032
info.list=(XID *)malloc(info.len*sizeof(XID));
1036
errmsg_printf(ERRMSG_LVL_ERROR, ER(ER_OUTOFMEMORY), info.len*sizeof(XID));
1040
plugin_foreach(NULL, xarecover_handlerton,
1041
DRIZZLE_STORAGE_ENGINE_PLUGIN, &info);
1043
free((unsigned char*)info.list);
1044
if (info.found_foreign_xids)
1045
errmsg_printf(ERRMSG_LVL_WARN, _("Found %d prepared XA transactions"),
1046
info.found_foreign_xids);
1047
if (info.dry_run && info.found_my_xids)
1049
errmsg_printf(ERRMSG_LVL_ERROR,
1050
_("Found %d prepared transactions! It means that drizzled "
1051
"was not shut down properly last time and critical "
1052
"recovery information (last binlog or %s file) was "
1053
"manually deleted after a crash. You have to start "
1054
"drizzled with the --tc-heuristic-recover switch to "
1055
"commit or rollback pending transactions."),
1056
info.found_my_xids, opt_tc_log_file);
1059
if (info.commit_list)
1060
errmsg_printf(ERRMSG_LVL_INFO, _("Crash recovery finished."));
1065
return the list of XID's to a client, the same way SHOW commands do.
1068
I didn't find in XA specs that an RM cannot return the same XID twice,
1069
so mysql_xa_recover does not filter XID's to ensure uniqueness.
1070
It can be easily fixed later, if necessary.
1072
bool mysql_xa_recover(Session *session)
1074
List<Item> field_list;
1075
Protocol *protocol= session->protocol;
1079
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1080
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1081
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1082
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1084
if (protocol->send_fields(&field_list,
1085
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1088
pthread_mutex_lock(&LOCK_xid_cache);
1089
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1091
if (xs->xa_state==XA_PREPARED)
1093
protocol->prepare_for_resend();
1094
protocol->store_int64_t((int64_t)xs->xid.formatID, false);
1095
protocol->store_int64_t((int64_t)xs->xid.gtrid_length, false);
1096
protocol->store_int64_t((int64_t)xs->xid.bqual_length, false);
1097
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1099
if (protocol->write())
1101
pthread_mutex_unlock(&LOCK_xid_cache);
1107
pthread_mutex_unlock(&LOCK_xid_cache);
1114
This function should be called when MySQL sends rows of a SELECT result set
1115
or the EOF mark to the client. It releases a possible adaptive hash index
1116
S-latch held by session in InnoDB and also releases a possible InnoDB query
1117
FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a session to
1118
keep them over several calls of the InnoDB handler interface when a join
1119
is executed. But when we let control pass to the client they have
1120
to be released because if the application program uses mysql_use_result(),
1121
it may deadlock on the S-latch if the application on another connection
1122
performs another SQL query. In MySQL-4.1 this is even more important because
1123
there a connection can have several SELECT queries open at the same time.
1125
@param session the thread handle of the current connection
1130
/*
  plugin_foreach() callback used by ha_release_temporary_latches().
  For a storage engine that is enabled (state == SHOW_OPTION_YES) and
  provides a release_temporary_latches hook, invoke that hook for the
  given session. The third (void *) argument is unused.
*/
static bool release_temporary_latches(Session *session, plugin_ref plugin,
1131
void *unused __attribute__((unused)))
1133
handlerton *hton= plugin_data(plugin, handlerton *);
1135
if (hton->state == SHOW_OPTION_YES && hton->release_temporary_latches)
1136
hton->release_temporary_latches(hton, session);
1142
int ha_release_temporary_latches(Session *session)
1144
plugin_foreach(session, release_temporary_latches, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1150
int ha_rollback_to_savepoint(Session *session, SAVEPOINT *sv)
1153
Session_TRANS *trans= &session->transaction.all;
1154
Ha_trx_info *ha_info, *ha_info_next;
1158
rolling back to savepoint in all storage engines that were part of the
1159
transaction when the savepoint was set
1161
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
1164
handlerton *ht= ha_info->ht();
1166
assert(ht->savepoint_set != 0);
1167
if ((err= ht->savepoint_rollback(ht, session,
1168
(unsigned char *)(sv+1)+ht->savepoint_offset)))
1170
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1173
status_var_increment(session->status_var.ha_savepoint_rollback_count);
1174
trans->no_2pc|= ht->prepare == 0;
1177
rolling back the transaction in all storage engines that were not part of
1178
the transaction when the savepoint was set
1180
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
1181
ha_info= ha_info_next)
1184
handlerton *ht= ha_info->ht();
1185
if ((err= ht->rollback(ht, session, !(0))))
1187
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1190
status_var_increment(session->status_var.ha_rollback_count);
1191
ha_info_next= ha_info->next();
1192
ha_info->reset(); /* keep it conveniently zero-filled */
1194
trans->ha_list= sv->ha_list;
1200
according to the sql standard (ISO/IEC 9075-2:2003)
1201
section "4.33.4 SQL-statements and transaction states",
1202
SAVEPOINT is *not* transaction-initiating SQL-statement
1204
int ha_savepoint(Session *session, SAVEPOINT *sv)
1207
Session_TRANS *trans= &session->transaction.all;
1208
Ha_trx_info *ha_info= trans->ha_list;
1209
for (; ha_info; ha_info= ha_info->next())
1212
handlerton *ht= ha_info->ht();
1214
if (! ht->savepoint_set)
1216
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1220
if ((err= ht->savepoint_set(ht, session, (unsigned char *)(sv+1)+ht->savepoint_offset)))
1222
my_error(ER_GET_ERRNO, MYF(0), err);
1225
status_var_increment(session->status_var.ha_savepoint_count);
1228
Remember the list of registered storage engines. All new
1229
engines are prepended to the beginning of the list.
1231
sv->ha_list= trans->ha_list;
1235
int ha_release_savepoint(Session *session, SAVEPOINT *sv)
1238
Ha_trx_info *ha_info= sv->ha_list;
1240
for (; ha_info; ha_info= ha_info->next())
1243
handlerton *ht= ha_info->ht();
1244
/* Savepoint life time is enclosed into transaction life time. */
1246
if (!ht->savepoint_release)
1248
if ((err= ht->savepoint_release(ht, session,
1249
(unsigned char *)(sv+1) + ht->savepoint_offset)))
1251
my_error(ER_GET_ERRNO, MYF(0), err);
1259
/*
  plugin_foreach() callback used by ha_start_consistent_snapshot().
  For an enabled storage engine that provides a start_consistent_snapshot
  hook, call the hook for the session and store false through arg, which
  is treated as a bool* (the caller passes &warn).
*/
static bool snapshot_handlerton(Session *session, plugin_ref plugin, void *arg)
1261
handlerton *hton= plugin_data(plugin, handlerton *);
1262
if (hton->state == SHOW_OPTION_YES &&
1263
hton->start_consistent_snapshot)
1265
hton->start_consistent_snapshot(hton, session);
1266
*((bool *)arg)= false;
1271
int ha_start_consistent_snapshot(Session *session)
1275
plugin_foreach(session, snapshot_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, &warn);
1278
Same idea as when one wants to CREATE TABLE in one engine which does not
1282
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
1283
"This Drizzle server does not support any "
1284
"consistent-read capable storage engine");
1289
/*
  plugin_foreach() callback used by ha_flush_logs().
  Calls the flush_logs hook of an enabled storage engine (when the engine
  provides one) and tests whether it returned non-zero. The session and
  arg parameters are unused.
  NOTE(review): the return statements are not visible in this excerpt;
  presumably a non-zero flush_logs() result is reported as true - confirm.
*/
static bool flush_handlerton(Session *session __attribute__((unused)),
1291
void *arg __attribute__((unused)))
1293
handlerton *hton= plugin_data(plugin, handlerton *);
1294
if (hton->state == SHOW_OPTION_YES && hton->flush_logs &&
1295
hton->flush_logs(hton))
1301
/*
  Flush storage engine logs.

  @param db_type  Engine to flush, or NULL to run flush_handlerton over
                  every storage engine plugin via plugin_foreach().

  For a specific engine, the engine must be enabled
  (state == SHOW_OPTION_YES) and its flush_logs hook, if it has one, must
  return zero; otherwise the failure branch is taken.
  NOTE(review): the return statements are not visible in this excerpt;
  presumably true means failure - confirm.
*/
bool ha_flush_logs(handlerton *db_type)
1303
if (db_type == NULL)
1305
if (plugin_foreach(NULL, flush_handlerton,
1306
DRIZZLE_STORAGE_ENGINE_PLUGIN, 0))
1311
if (db_type->state != SHOW_OPTION_YES ||
1312
(db_type->flush_logs && db_type->flush_logs(db_type)))
1318
static const char *check_lowercase_names(handler *file, const char *path,
1321
if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1324
/* Ensure that the table handler gets the path in lower case */
1325
if (tmp_path != path)
1326
strcpy(tmp_path, path);
1329
we should only turn the database/table part into lowercase
1330
so start the process after the home directory
1332
my_casedn_str(files_charset_info, tmp_path + drizzle_data_home_len);
1338
An interceptor to hijack the text of the error message without
1339
setting an error in the thread. We need the text to present it
1340
in the form of a warning to the user.
1343
struct Ha_delete_table_error_handler: public Internal_error_handler
1346
virtual bool handle_error(uint32_t sql_errno,
1347
const char *message,
1348
DRIZZLE_ERROR::enum_warning_level level,
1350
char buff[DRIZZLE_ERRMSG_SIZE];
1355
/*
  Internal error handler callback: copies the text of the intercepted
  error message into buff so that it can later be presented to the user
  as a warning. The sql_errno, level and session arguments are ignored.

  NOTE(review): strncpy() does not write a terminating NUL when message is
  sizeof(buff)-1 bytes or longer. Confirm that buff is terminated
  elsewhere (not visible in this excerpt) before it is read as a C string.
*/
Ha_delete_table_error_handler::
1356
handle_error(uint32_t sql_errno __attribute__((unused)),
1357
const char *message,
1358
DRIZZLE_ERROR::enum_warning_level level __attribute__((unused)),
1359
Session *session __attribute__((unused)))
1361
/* Grab the error message */
1362
strncpy(buff, message, sizeof(buff)-1);
1367
struct handlerton_delete_table_args {
1374
static bool deletetable_handlerton(Session *unused1 __attribute__((unused)),
1378
struct handlerton_delete_table_args *dtargs= (struct handlerton_delete_table_args *) args;
1380
Session *session= dtargs->session;
1381
const char *path= dtargs->path;
1384
char tmp_path[FN_REFLEN];
1386
if(dtargs->error!=ENOENT) /* already deleted table */
1389
handlerton *table_type= plugin_data(plugin, handlerton *);
1394
if(!(table_type->state == SHOW_OPTION_YES && table_type->create))
1397
if ((file= table_type->create(table_type, NULL, session->mem_root)))
1402
path= check_lowercase_names(file, path, tmp_path);
1403
int error= file->ha_delete_table(path);
1407
dtargs->error= error;
1409
delete dtargs->file;
1418
This should return ENOENT if the file doesn't exist.
1419
The .frm file will be deleted only if we return 0 or ENOENT
1421
int ha_delete_table(Session *session, const char *path,
1422
const char *db, const char *alias, bool generate_warning)
1424
TABLE_SHARE dummy_share;
1427
struct handlerton_delete_table_args dtargs;
1428
dtargs.error= ENOENT;
1429
dtargs.session= session;
1433
plugin_foreach(NULL, deletetable_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1436
memset(&dummy_table, 0, sizeof(dummy_table));
1437
memset(&dummy_share, 0, sizeof(dummy_share));
1438
dummy_table.s= &dummy_share;
1440
if (dtargs.error && generate_warning)
1443
Because file->print_error() uses my_error() to generate the error message
1444
we use an internal error handler to intercept it and store the text
1445
in a temporary buffer. Later the message will be presented to the user
1448
Ha_delete_table_error_handler ha_delete_table_error_handler;
1450
/* Fill up structures that print_error may need */
1451
dummy_share.path.str= (char*) path;
1452
dummy_share.path.length= strlen(path);
1453
dummy_share.db.str= (char*) db;
1454
dummy_share.db.length= strlen(db);
1455
dummy_share.table_name.str= (char*) alias;
1456
dummy_share.table_name.length= strlen(alias);
1457
dummy_table.alias= alias;
1459
handler *file= dtargs.file;
1460
file->change_table_ptr(&dummy_table, &dummy_share);
1462
session->push_internal_handler(&ha_delete_table_error_handler);
1463
file->print_error(dtargs.error, 0);
1465
session->pop_internal_handler();
1468
XXX: should we convert *all* errors to warnings here?
1469
What if the error is fatal?
1471
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_ERROR, dtargs.error,
1472
ha_delete_table_error_handler.buff);
1478
return dtargs.error;
53
1481
/****************************************************************************
54
** General Cursor functions
1482
** General handler functions
55
1483
****************************************************************************/
56
Cursor::Cursor(plugin::StorageEngine &engine_arg,
57
TableShare &share_arg)
58
: table_share(&share_arg), table(0),
59
estimation_rows_to_insert(0), engine(&engine_arg),
60
ref(0), in_range_check_pushed_down(false),
61
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
62
ref_length(sizeof(internal::my_off_t)),
64
locked(false), implicit_emptied(0),
65
next_insert_id(0), insert_id_for_cur_row(0)
70
assert(locked == false);
71
/* TODO: assert(inited == NONE); */
75
Cursor *Cursor::clone(memory::Root *mem_root)
77
Cursor *new_handler= table->s->db_type()->getCursor(*table->s, mem_root);
1484
handler *handler::clone(MEM_ROOT *mem_root)
1486
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
80
Allocate Cursor->ref here because otherwise ha_open will allocate it
1488
Allocate handler->ref here because otherwise ha_open will allocate it
81
1489
on this->table->mem_root and we will not be able to reclaim that memory
82
when the clone Cursor object is destroyed.
1490
when the clone handler object is destroyed.
84
1492
if (!(new_handler->ref= (unsigned char*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
1185
3356
while ((result == HA_ERR_END_OF_FILE) && !range_res);
1187
3358
*range_info= mrr_cur_range.ptr;
3363
/* **************************************************************************
3364
* DS-MRR implementation
3365
***************************************************************************/
3368
DS-MRR: Initialize and start MRR scan
3370
Initialize and start the MRR scan. Depending on the mode parameter, this
3371
may use default or DS-MRR implementation.
3373
@param h Table handler to be used
3374
@param key Index to be used
3375
@param seq_funcs Interval sequence enumeration functions
3376
@param seq_init_param Interval sequence enumeration parameter
3377
@param n_ranges Number of ranges in the sequence.
3378
@param mode HA_MRR_* modes to use
3379
@param buf INOUT Buffer to use
3381
@retval 0 Ok, Scan started.
3385
int DsMrr_impl::dsmrr_init(handler *h, KEY *key,
3386
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3387
uint32_t n_ranges, uint32_t mode, HANDLER_BUFFER *buf)
3391
Item *pushed_cond= NULL;
3393
keyno= h->active_index;
3395
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3397
use_default_impl= true;
3398
return(h->handler::multi_range_read_init(seq_funcs, seq_init_param,
3399
n_ranges, mode, buf));
3401
rowids_buf= buf->buffer;
3402
//psergey-todo: don't add key_length as it is not needed anymore
3403
rowids_buf += key->key_length + h->ref_length;
3405
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3406
rowids_buf_end= buf->buffer_end;
3408
elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3409
rowids_buf_last= rowids_buf +
3410
((rowids_buf_end - rowids_buf)/ elem_size)*
3412
rowids_buf_end= rowids_buf_last;
3414
/* Create a separate handler object to do rndpos() calls. */
3415
Session *session= current_session;
3416
if (!(new_h2= h->clone(session->mem_root)) ||
3417
new_h2->ha_external_lock(session, F_RDLCK))
3423
if (keyno == h->pushed_idx_cond_keyno)
3424
pushed_cond= h->pushed_idx_cond;
3425
if (h->ha_index_end())
3432
table->prepare_for_position();
3433
new_h2->extra(HA_EXTRA_KEYREAD);
3435
if (h2->ha_index_init(keyno, false) ||
3436
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3439
use_default_impl= false;
3442
h2->idx_cond_push(keyno, pushed_cond);
3443
if (dsmrr_fill_buffer(new_h2))
3447
If the above call has scanned through all intervals in *seq, then
3448
adjust *buf to indicate that the remaining buffer space will not be used.
3451
buf->end_of_used_area= rowids_buf_last;
3453
if (h->ha_rnd_init(false))
3458
h2->ha_index_or_rnd_end();
3459
h2->ha_external_lock(session, F_UNLCK);
3466
void DsMrr_impl::dsmrr_close()
3470
h2->ha_external_lock(current_session, F_UNLCK);
3475
use_default_impl= true;
3480
/*
  Comparison callback handed to my_qsort2() when sorting the rowid buffer.
  h is the opaque callback argument and is cast to handler*; the two
  rowids a and b are compared with that handler's cmp_ref(), whose result
  is returned unchanged.
*/
static int rowid_cmp(void *h, unsigned char *a, unsigned char *b)
3482
return ((handler*)h)->cmp_ref(a, b);
3487
DS-MRR: Fill the buffer with rowids and sort it by rowid
3489
{This is an internal function of DiskSweep MRR implementation}
3490
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
3491
buffer. When the buffer is full or scan is completed, sort the buffer by
3494
The function assumes that rowids buffer is empty when it is invoked.
3496
@param h Table handler
3498
@retval 0 OK, the next portion of rowids is in the buffer,
3503
int DsMrr_impl::dsmrr_fill_buffer(handler *unused __attribute__((unused)))
3508
rowids_buf_cur= rowids_buf;
3509
while ((rowids_buf_cur < rowids_buf_end) &&
3510
!(res= h2->handler::multi_range_read_next(&range_info)))
3512
/* Put rowid, or {rowid, range_id} pair into the buffer */
3513
h2->position(table->record[0]);
3514
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
3515
rowids_buf_cur += h->ref_length;
3519
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
3520
rowids_buf_cur += sizeof(void*);
3524
if (res && res != HA_ERR_END_OF_FILE)
3526
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
3528
/* Sort the buffer contents by rowid */
3529
uint32_t elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3530
uint32_t n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
3532
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
3534
rowids_buf_last= rowids_buf_cur;
3535
rowids_buf_cur= rowids_buf;
3541
DS-MRR implementation: multi_range_read_next() function
3544
int DsMrr_impl::dsmrr_next(handler *h, char **range_info)
3548
if (use_default_impl)
3549
return h->handler::multi_range_read_next(range_info);
3551
if (rowids_buf_cur == rowids_buf_last)
3555
res= HA_ERR_END_OF_FILE;
3558
res= dsmrr_fill_buffer(h);
3563
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
3564
if (rowids_buf_cur == rowids_buf_last)
3566
res= HA_ERR_END_OF_FILE;
3570
res= h->rnd_pos(table->record[0], rowids_buf_cur);
3571
rowids_buf_cur += h->ref_length;
3574
memcpy(range_info, rowids_buf_cur, sizeof(void*));
3575
rowids_buf_cur += sizeof(void*);
3586
DS-MRR implementation: multi_range_read_info() function
3588
int DsMrr_impl::dsmrr_info(uint32_t keyno, uint32_t n_ranges, uint32_t rows, uint32_t *bufsz,
3589
uint32_t *flags, COST_VECT *cost)
3592
uint32_t def_flags= *flags;
3593
uint32_t def_bufsz= *bufsz;
3595
/* Get cost/flags/mem_usage of default MRR implementation */
3596
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
3600
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3601
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
3603
/* Default implementation is chosen */
3612
DS-MRR Implementation: multi_range_read_info_const() function
3615
ha_rows DsMrr_impl::dsmrr_info_const(uint32_t keyno, RANGE_SEQ_IF *seq,
3616
void *seq_init_param, uint32_t n_ranges,
3617
uint32_t *bufsz, uint32_t *flags, COST_VECT *cost)
3620
uint32_t def_flags= *flags;
3621
uint32_t def_bufsz= *bufsz;
3622
/* Get cost/flags/mem_usage of default MRR implementation */
3623
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
3624
n_ranges, &def_bufsz,
3626
if (rows == HA_POS_ERROR)
3628
/* Default implementation can't perform MRR scan => we can't either */
3633
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
3634
use the default MRR implementation (we need it for UPDATE/DELETE).
3635
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
3637
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3638
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
3645
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
3652
Check if key has partially-covered columns
3654
We can't use DS-MRR to perform range scans when the ranges are over
3655
partially-covered keys, because we'll not have full key part values
3656
(we'll have their prefixes from the index) and will not be able to check
3657
if we've reached the end of the range.
3659
@param keyno Key to check
3662
Allow use of DS-MRR in cases where the index has partially-covered
3663
components but they are not used for scanning.
3669
bool DsMrr_impl::key_uses_partial_cols(uint32_t keyno)
3671
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
3672
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
3673
for (; kp != kp_end; kp++)
3675
if (!kp->field->part_of_key.is_set(keyno))
3683
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
3685
Make the choice between using Default MRR implementation and DS-MRR.
3686
This function contains common functionality factored out of dsmrr_info()
3687
and dsmrr_info_const(). The function assumes that the default MRR
3688
implementation's applicability requirements are satisfied.
3690
@param keyno Index number
3691
@param rows E(full rows to be retrieved)
3692
@param flags IN MRR flags provided by the MRR user
3693
OUT If DS-MRR is chosen, flags of DS-MRR implementation
3694
else the value is not modified
3695
@param bufsz IN If DS-MRR is chosen, buffer use of DS-MRR implementation
3696
else the value is not modified
3697
@param cost IN Cost of default MRR implementation
3698
OUT If DS-MRR is chosen, cost of DS-MRR scan
3699
else the value is not modified
3701
@retval true Default MRR implementation should be used
3702
@retval false DS-MRR implementation should be used
3705
bool DsMrr_impl::choose_mrr_impl(uint32_t keyno, ha_rows rows, uint32_t *flags,
3706
uint32_t *bufsz, COST_VECT *cost)
3708
COST_VECT dsmrr_cost;
3710
Session *session= current_session;
3711
if ((session->variables.optimizer_use_mrr == 2) ||
3712
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
3713
(keyno == table->s->primary_key &&
3714
h->primary_key_is_clustered()) ||
3715
key_uses_partial_cols(keyno))
3717
/* Use the default implementation */
3718
*flags |= HA_MRR_USE_DEFAULT_IMPL;
3722
uint32_t add_len= table->key_info[keyno].key_length + h->ref_length;
3724
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
3730
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
3731
DS-MRR and Default implementations cost. This allows one to force use of
3732
DS-MRR whenever it is applicable without affecting other cost-based
3735
if ((force_dsmrr= (session->variables.optimizer_use_mrr == 1)) &&
3736
dsmrr_cost.total_cost() > cost->total_cost())
3739
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
3741
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
3742
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
3748
/* Use the default MRR implementation */
3755
static void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost);
3759
Get cost of DS-MRR scan
3761
@param keynr Index to be used
3762
@param rows E(Number of rows to be scanned)
3763
@param flags Scan parameters (HA_MRR_* flags)
3764
@param buffer_size INOUT Buffer size
3765
@param cost OUT The cost
3768
@retval true Error, DS-MRR cannot be used (the buffer is too small
3772
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint32_t keynr, ha_rows rows, uint32_t flags,
3773
uint32_t *buffer_size, COST_VECT *cost)
3775
uint32_t max_buff_entries, elem_size;
3776
ha_rows rows_in_full_step, rows_in_last_step;
3777
uint32_t n_full_steps;
3778
double index_read_cost;
3780
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
3781
max_buff_entries = *buffer_size / elem_size;
3783
if (!max_buff_entries)
3784
return true; /* Buffer has not enough space for even 1 rowid */
3786
/* Number of iterations we'll make with full buffer */
3787
n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
3790
Get numbers of rows we'll be processing in
3791
- non-last sweep, with full buffer
3792
- last iteration, with non-full buffer
3794
rows_in_full_step= max_buff_entries;
3795
rows_in_last_step= rows % max_buff_entries;
3797
/* Adjust buffer size if we expect to use only part of the buffer */
3800
get_sort_and_sweep_cost(table, rows, cost);
3801
cost->multiply(n_full_steps);
3806
*buffer_size= cmax((ulong)*buffer_size,
3807
(size_t)(1.2*rows_in_last_step) * elem_size +
3808
h->ref_length + table->key_info[keynr].key_length);
3811
COST_VECT last_step_cost;
3812
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
3813
cost->add(&last_step_cost);
3815
if (n_full_steps != 0)
3816
cost->mem_cost= *buffer_size;
3818
cost->mem_cost= (double)rows_in_last_step * elem_size;
3820
/* Total cost of all index accesses */
3821
index_read_cost= h->index_only_read_time(keynr, (double)rows);
3822
cost->add_io(index_read_cost, 1 /* Random seeks */);
3828
Get cost of one sort-and-sweep step
3831
get_sort_and_sweep_cost()
3832
table Table being accessed
3833
nrows Number of rows to be sorted and retrieved
3837
Get cost of these operations:
3838
- sort an array of #nrows ROWIDs using qsort
3839
- read #nrows records from table in a sweep.
3843
void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost)
3847
get_sweep_read_cost(table, nrows, false, cost);
3848
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
3849
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
3852
cost->cpu_cost += cmp_op * log2(cmp_op);
3860
Get cost of reading nrows table records in a "disk sweep"
3862
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls made
3863
for an ordered sequence of rowids.
3865
We assume hard disk IO. The read is performed as follows:
3867
1. The disk head is moved to the needed cylinder
3868
2. The controller waits for the plate to rotate
3869
3. The data is transferred
3871
Time to do #3 is insignificant compared to #2+#1.
3873
Time to move the disk head is proportional to head travel distance.
3875
Time to wait for the plate to rotate depends on whether the disk head
3878
If disk head wasn't moved, the wait time is proportional to distance
3879
between the previous block and the block we're reading.
3881
If the head was moved, we don't know how much we'll need to wait for the
3882
plate to rotate. We assume the wait time to be a variate with a mean of
3883
0.5 of full rotation time.
3885
Our cost units are "random disk seeks". The cost of random disk seek is
3886
actually not a constant; it depends on the range of cylinders we're going
3887
to access. We make it constant by introducing a fuzzy concept of "typical
3888
datafile length" (it's fuzzy as it's hard to tell whether it should
3889
include index file, temp.tables etc). Then random seek cost is:
3891
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
3893
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
3895
@param table Table to be accessed
3896
@param nrows Number of rows to retrieve
3897
@param interrupted true <=> Assume that the disk sweep will be
3898
interrupted by other disk IO. false - otherwise.
3899
@param cost OUT The cost.
3902
void get_sweep_read_cost(Table *table, ha_rows nrows, bool interrupted,
3906
if (table->file->primary_key_is_clustered())
3908
cost->io_count= table->file->read_time(table->s->primary_key,
3909
(uint) nrows, nrows);
3914
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
3916
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
3917
if (busy_blocks < 1.0)
3920
cost->io_count= busy_blocks;
3924
/* Assume reading is done in one 'sweep' */
3925
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
3926
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);