23
23
Handler-calling-functions
30
#include "drizzled/my_hash.h"
26
#include "drizzled/server_includes.h"
27
#include "libdrizzleclient/libdrizzle.h"
28
#include "mysys/hash.h"
31
29
#include "drizzled/error.h"
32
30
#include "drizzled/gettext.h"
31
#include "drizzled/data_home.h"
33
32
#include "drizzled/probes.h"
34
33
#include "drizzled/sql_parse.h"
35
#include "drizzled/optimizer/cost_vector.h"
34
#include "drizzled/cost_vect.h"
36
36
#include "drizzled/session.h"
37
37
#include "drizzled/sql_base.h"
38
#include "drizzled/transaction_services.h"
38
#include "drizzled/replicator.h"
39
39
#include "drizzled/lock.h"
40
40
#include "drizzled/item/int.h"
41
41
#include "drizzled/item/empty_string.h"
42
#include "drizzled/unireg.h" // for mysql_frm_type
42
43
#include "drizzled/field/timestamp.h"
43
#include "drizzled/message/table.pb.h"
44
#include "drizzled/plugin/client.h"
45
#include "drizzled/internal/my_sys.h"
46
#include "drizzled/plugin/event_observer.h"
44
#include "drizzled/serialize/table.pb.h"
46
#if defined(CMATH_NAMESPACE)
47
using namespace CMATH_NAMESPACE;
51
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NULL,0}, {NULL,0} };
53
/* number of entries in handlertons[] */
55
/* number of storage engines (from handlertons[]) that support 2pc */
56
uint32_t total_ha_2pc= 0;
57
/* size of savepoint storage area (see ha_init) */
58
uint32_t savepoint_alloc_size= 0;
60
const char *ha_row_type[] = {
61
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
64
const char *tx_isolation_names[] =
65
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
68
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
69
tx_isolation_names, NULL};
71
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
72
uint32_t known_extensions_id= 0;
76
Register handler error messages for use with my_error().
84
int ha_init_errors(void)
86
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
89
/* Allocate a pointer array for the error message strings. */
90
/* Zerofill it to avoid uninitialized gaps. */
91
if (! (errmsgs= (const char**) malloc(HA_ERR_ERRORS * sizeof(char*))))
93
memset(errmsgs, 0, HA_ERR_ERRORS * sizeof(char *));
95
/* Set the dedicated error messages. */
96
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
97
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
98
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
99
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
100
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
101
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
102
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
103
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
104
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
105
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
106
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
107
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
108
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
109
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
110
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
111
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
112
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
113
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
114
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
115
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
116
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
117
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
118
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
119
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
120
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
121
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
122
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
123
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
124
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
125
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
126
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
127
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
128
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
129
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
130
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
131
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
132
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
133
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
134
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
135
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
136
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
138
/* Register the error messages for use with my_error(). */
139
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
144
Unregister handler error messages.
151
static int ha_finish_errors(void)
153
const char **errmsgs;
155
/* Unregister the error messages and get back the pointer array so it can be freed. */
156
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
158
free((unsigned char*) errmsgs);
166
assert(total_ha < MAX_HA);
168
Check if there is a transaction-capable storage engine besides the
169
binary log (which is considered a transaction-capable storage engine in
172
opt_using_transactions= total_ha>(uint32_t)opt_bin_log;
173
savepoint_alloc_size+= sizeof(SAVEPOINT);
182
This should eventually be based on the graceful shutdown flag.
183
So if flag is equal to HA_PANIC_CLOSE, the deallocate
186
if (ha_finish_errors())
192
static bool dropdb_handlerton(Session *,
196
handlerton *hton= plugin_data(plugin, handlerton *);
197
if (hton->state == SHOW_OPTION_YES && hton->drop_database)
198
hton->drop_database(hton, (char *)path);
203
void ha_drop_database(char* path)
205
plugin_foreach(NULL, dropdb_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, path);
209
static bool closecon_handlerton(Session *session, plugin_ref plugin,
212
handlerton *hton= plugin_data(plugin, handlerton *);
214
there's no need to rollback here as all transactions must
215
be rolled back already
217
if (hton->state == SHOW_OPTION_YES && hton->close_connection &&
218
session_get_ha_data(session, hton))
219
hton->close_connection(hton, session);
226
don't bother to rollback here, it's done already
228
void ha_close_connection(Session* session)
230
plugin_foreach(session, closecon_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, 0);
233
/* ========================================================================
234
======================= TRANSACTIONS ===================================*/
237
Transaction handling in the server
238
==================================
240
In each client connection, MySQL maintains two transactional
242
- a statement transaction,
243
- a standard, also called normal transaction.
247
"Statement transaction" is a non-standard term that comes
248
from the times when MySQL supported BerkeleyDB storage engine.
250
First of all, it should be said that in BerkeleyDB auto-commit
251
mode auto-commits operations that are atomic to the storage
252
engine itself, such as a write of a record, and are too
253
high-granular to be atomic from the application perspective
254
(MySQL). One SQL statement could involve many BerkeleyDB
255
auto-committed operations and thus BerkeleyDB auto-commit was of
258
Secondly, instead of SQL standard savepoints, BerkeleyDB
259
provided the concept of "nested transactions". In a nutshell,
260
transactions could be arbitrarily nested, but when the parent
261
transaction was committed or aborted, all its child (nested)
262
transactions were committed or aborted as well.
263
Commit of a nested transaction, in turn, made its changes
264
visible, but not durable: it destroyed the nested transaction,
265
all its changes would become available to the parent and
266
currently active nested transactions of this parent.
268
So the mechanism of nested transactions was employed to
269
provide "all or nothing" guarantee of SQL statements
270
required by the standard.
271
A nested transaction would be created at start of each SQL
272
statement, and destroyed (committed or aborted) at statement
273
end. Such nested transaction was internally referred to as
274
a "statement transaction" and gave birth to the term.
276
<Historical note ends>
278
Since then a statement transaction is started for each statement
279
that accesses transactional tables or uses the binary log. If
280
the statement succeeds, the statement transaction is committed.
281
If the statement fails, the transaction is rolled back. Commits
282
of statement transactions are not durable -- each such
283
transaction is nested in the normal transaction, and if the
284
normal transaction is rolled back, the effects of all enclosed
285
statement transactions are undone as well. Technically,
286
a statement transaction can be viewed as a savepoint which is
287
maintained automatically in order to make effects of one
290
The normal transaction is started by the user and is ended
291
usually upon a user request as well. The normal transaction
292
encloses transactions of all statements issued between
293
its beginning and its end.
294
In autocommit mode, the normal transaction is equivalent
295
to the statement transaction.
297
Since MySQL supports PSEA (pluggable storage engine
298
architecture), more than one transactional engine can be
299
active at a time. Hence transactions, from the server
300
point of view, are always distributed. In particular,
301
transactional state is maintained independently for each
302
engine. In order to commit a transaction the two phase
303
commit protocol is employed.
305
Not all statements are executed in context of a transaction.
306
Administrative and status information statements do not modify
307
engine data, and thus do not start a statement transaction and
308
also have no effect on the normal transaction. Examples of such
309
statements are SHOW STATUS and RESET SLAVE.
311
Similarly DDL statements are not transactional,
312
and therefore a transaction is [almost] never started for a DDL
313
statement. The difference between a DDL statement and a purely
314
administrative statement though is that a DDL statement always
315
commits the current transaction before proceeding, if there is
318
At last, SQL statements that work with non-transactional
319
engines also have no effect on the transaction state of the
320
connection. Even though they are written to the binary log,
321
and the binary log is, overall, transactional, the writes
322
are done in "write-through" mode, directly to the binlog
323
file, followed by an OS cache sync, in other words,
324
bypassing the binlog undo log (translog).
325
They do not commit the current normal transaction.
326
A failure of a statement that uses non-transactional tables
327
would cause a rollback of the statement transaction, but
328
in case no transactional tables are used,
329
no statement transaction is started.
334
The server stores its transaction-related data in
335
session->transaction. This structure has two members of type
336
Session_TRANS. These members correspond to the statement and
337
normal transactions respectively:
339
- session->transaction.stmt contains a list of engines
340
that are participating in the given statement
341
- session->transaction.all contains a list of engines that
342
have participated in any of the statement transactions started
343
within the context of the normal transaction.
344
Each element of the list contains a pointer to the storage
345
engine, engine-specific transactional data, and engine-specific
348
In autocommit mode session->transaction.all is empty.
349
Instead, data of session->transaction.stmt is
350
used to commit/rollback the normal transaction.
352
The list of registered engines has a few important properties:
353
- no engine is registered in the list twice
354
- engines are present in the list in a reverse temporal order --
355
new participants are always added to the beginning of the list.
357
Transaction life cycle
358
----------------------
360
When a new connection is established, session->transaction
361
members are initialized to an empty state.
362
If a statement uses any tables, all affected engines
363
are registered in the statement engine list. In
364
non-autocommit mode, the same engines are registered in
365
the normal transaction list.
366
At the end of the statement, the server issues a commit
367
or a roll back for all engines in the statement list.
368
At this point transaction flags of an engine, if any, are
369
propagated from the statement list to the list of the normal
371
When commit/rollback is finished, the statement list is
372
cleared. It will be filled in again by the next statement,
373
and emptied again at the next statement's end.
375
The normal transaction is committed in a similar way
376
(by going over all engines in session->transaction.all list)
377
but at different times:
378
- when a COMMIT SQL statement is issued by the user
379
- implicitly, by the server, at the beginning of a DDL statement
380
or SET AUTOCOMMIT={0|1} statement.
382
The normal transaction can be rolled back as well:
383
- if the user has requested so, by issuing ROLLBACK SQL
385
- if one of the storage engines requested a rollback
386
by setting session->transaction_rollback_request. This may
387
happen in case, e.g., when the transaction in the engine was
388
chosen as a victim of the internal deadlock resolution algorithm
389
and rolled back internally. When such a situation happens, there
390
is little the server can do and the only option is to rollback
391
transactions in all other participating engines. In this case
392
the rollback is accompanied by an error sent to the user.
394
As follows from the use cases above, the normal transaction
395
is never committed when there is an outstanding statement
396
transaction. In most cases there is no conflict, since
397
commits of the normal transaction are issued by a stand-alone
398
administrative or DDL statement, thus no outstanding statement
399
transaction of the previous statement exists. Besides,
400
all statements that manipulate the normal transaction
401
are prohibited in stored functions and triggers, therefore
402
no conflicting situation can occur in a sub-statement either.
403
The remaining rare cases when the server explicitly has
404
to commit the statement transaction prior to committing the normal
405
one cover error-handling scenarios (see for example
408
When committing a statement or a normal transaction, the server
409
either uses the two-phase commit protocol, or issues a commit
410
in each engine independently. The two-phase commit protocol
412
- all participating engines support two-phase commit (provide
413
handlerton::prepare PSEA API call) and
414
- transactions in at least two engines modify data (i.e. are
417
Note that the two phase commit is used for
418
statement transactions, even though they are not durable anyway.
419
This is done to ensure logical consistency of data in a multiple-
421
For example, imagine that some day MySQL supports unique
422
constraint checks deferred till the end of statement. In such
423
case a commit in one of the engines may yield ER_DUP_KEY,
424
and MySQL should be able to gracefully abort statement
425
transactions of other participants.
427
After the normal transaction has been committed,
428
session->transaction.all list is cleared.
430
When a connection is closed, the current normal transaction, if
433
Roles and responsibilities
434
--------------------------
436
The server has no way to know that an engine participates in
437
the statement and a transaction has been started
438
in it unless the engine says so. Thus, in order to be
439
a part of a transaction, the engine must "register" itself.
440
This is done by invoking trans_register_ha() server call.
441
Normally the engine registers itself whenever handler::external_lock()
442
is called. trans_register_ha() can be invoked many times: if
443
an engine is already registered, the call does nothing.
444
In case autocommit is not set, the engine must register itself
445
twice -- both in the statement list and in the normal transaction
447
In which list to register is a parameter of trans_register_ha().
449
Note, that although the registration interface in itself is
450
fairly clear, the current usage practice often leads to undesired
451
effects. E.g. since a call to trans_register_ha() in most engines
452
is embedded into implementation of handler::external_lock(), some
453
DDL statements start a transaction (at least from the server
454
point of view) even though they are not expected to. E.g.
455
CREATE TABLE does not start a transaction, since
456
handler::external_lock() is never called during CREATE TABLE. But
457
CREATE TABLE ... SELECT does, since handler::external_lock() is
458
called for the table that is being selected from. This has no
459
practical effects currently, but must be kept in mind
462
Once an engine is registered, the server will do the rest
465
During statement execution, whenever any of data-modifying
466
PSEA API methods is used, e.g. handler::write_row() or
467
handler::update_row(), the read-write flag is raised in the
468
statement transaction for the involved engine.
469
Currently all PSEA calls are "traced", and the data cannot be
470
changed in a way other than issuing a PSEA call. Important:
471
unless this invariant is preserved the server will not know that
472
a transaction in a given engine is read-write and will not
473
involve the two-phase commit protocol!
475
At the end of a statement, server call
476
ha_autocommit_or_rollback() is invoked. This call in turn
477
invokes handlerton::prepare() for every involved engine.
478
Prepare is followed by a call to handlerton::commit_one_phase()
479
If a one-phase commit will suffice, handlerton::prepare() is not
480
invoked and the server only calls handlerton::commit_one_phase().
481
At statement commit, the statement-related read-write engine
482
flag is propagated to the corresponding flag in the normal
483
transaction. When the commit is complete, the list of registered
486
Rollback is handled in a similar fashion.
488
Additional notes on DDL and the normal transaction.
489
---------------------------------------------------
491
DDLs and operations with non-transactional engines
492
do not "register" in session->transaction lists, and thus do not
493
modify the transaction state. Besides, each DDL in
494
MySQL is prefixed with an implicit normal transaction commit
495
(a call to end_active_trans()), and thus leaves nothing
497
However, as it has been pointed out with CREATE TABLE .. SELECT,
498
some DDL statements can start a *new* transaction.
500
Behaviour of the server in this case is currently badly
502
DDL statements use a form of "semantic" logging
503
to maintain atomicity: if CREATE TABLE .. SELECT failed,
504
the newly created table is deleted.
505
In addition, some DDL statements issue interim transaction
506
commits: e.g. ALTER TABLE issues a commit after data is copied
507
from the original table to the internal temporary table. Other
508
statements, e.g. CREATE TABLE ... SELECT do not always commit
510
And finally there is a group of DDL statements such as
511
RENAME/DROP TABLE that doesn't start a new transaction
514
This diversity makes it hard to say what will happen if
515
by chance a stored function is invoked during a DDL --
516
whether any modifications it makes will be committed or not
517
is not clear. Fortunately, SQL grammar of few DDLs allows
518
invocation of a stored function.
520
A consistent behaviour is perhaps to always commit the normal
521
transaction after all DDLs, just like the statement transaction
522
is always committed at the end of all statements.
526
Register a storage engine for a transaction.
528
Every storage engine MUST call this function when it starts
529
a transaction or a statement (that is it must be called both for the
530
"beginning of transaction" and "beginning of statement").
531
Only storage engines registered for the transaction/statement
532
will know when to commit/rollback it.
535
trans_register_ha is idempotent - storage engine may register many
536
times per transaction.
539
void trans_register_ha(Session *session, bool all, handlerton *ht_arg)
541
Session_TRANS *trans;
542
Ha_trx_info *ha_info;
546
trans= &session->transaction.all;
547
session->server_status|= SERVER_STATUS_IN_TRANS;
550
trans= &session->transaction.stmt;
552
ha_info= session->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
554
if (ha_info->is_started())
555
return; /* already registered, return */
557
ha_info->register_ha(trans, ht_arg);
559
trans->no_2pc|=(ht_arg->prepare==0);
560
if (session->transaction.xid_state.xid.is_null())
561
session->transaction.xid_state.xid.set(session->query_id);
570
1 error, transaction was rolled back
572
int ha_prepare(Session *session)
575
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
576
Ha_trx_info *ha_info= trans->ha_list;
579
for (; ha_info; ha_info= ha_info->next())
582
handlerton *ht= ha_info->ht();
583
status_var_increment(session->status_var.ha_prepare_count);
586
if ((err= ht->prepare(ht, session, all)))
588
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
589
ha_rollback_trans(session, all);
596
push_warning_printf(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
597
ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
598
ha_resolve_storage_engine_name(ht));
606
Check if we can skip the two-phase commit.
608
A helper function to evaluate if two-phase commit is mandatory.
609
As a side effect, propagates the read-only/read-write flags
610
of the statement transaction to its enclosing normal transaction.
612
@retval true we must run a two-phase commit. Returned
613
if we have at least two engines with read-write changes.
614
@retval false Don't need two-phase commit. Even if we have two
615
transactional engines, we can run two independent
616
commits if changes in one of the engines are read-only.
621
ha_check_and_coalesce_trx_read_only(Session *session, Ha_trx_info *ha_list,
624
/* The number of storage engines that have actual changes. */
625
unsigned rw_ha_count= 0;
626
Ha_trx_info *ha_info;
628
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
630
if (ha_info->is_trx_read_write())
635
Ha_trx_info *ha_info_all= &session->ha_data[ha_info->ht()->slot].ha_info[1];
636
assert(ha_info != ha_info_all);
638
Merge read-only/read-write information about statement
639
transaction to its enclosing normal transaction. Do this
640
only if in a real transaction -- that is, if we know
641
that ha_info_all is registered in session->transaction.all.
642
Since otherwise we only clutter the normal transaction flags.
644
if (ha_info_all->is_started()) /* false if autocommit. */
645
ha_info_all->coalesce_trx_with(ha_info);
647
else if (rw_ha_count > 1)
650
It is a normal transaction, so we don't need to merge read/write
651
information up, and the need for two-phase commit has been
652
already established. Break the loop prematurely.
657
return rw_ha_count > 1;
665
1 transaction was rolled back
667
2 error during commit, data may be inconsistent
670
Since we don't support nested statement transactions in 5.0,
671
we can't commit or rollback stmt transactions while we are inside
672
stored functions or triggers. So we simply do nothing now.
673
TODO: This should be fixed in later ( >= 5.1) releases.
675
int ha_commit_trans(Session *session, bool all)
677
int error= 0, cookie= 0;
679
'all' means that this is either an explicit commit issued by
680
user, or an implicit commit issued by a DDL.
682
Session_TRANS *trans= all ? &session->transaction.all : &session->transaction.stmt;
683
bool is_real_trans= all || session->transaction.all.ha_list == 0;
684
Ha_trx_info *ha_info= trans->ha_list;
687
We must not commit the normal transaction if a statement
688
transaction is pending. Otherwise statement transaction
689
flags will not get propagated to its normal transaction's
692
assert(session->transaction.stmt.ha_list == NULL ||
693
trans == &session->transaction.stmt);
699
if (is_real_trans && wait_if_global_read_lock(session, 0, 0))
701
ha_rollback_trans(session, all);
705
must_2pc= ha_check_and_coalesce_trx_read_only(session, ha_info, all);
707
if (!trans->no_2pc && must_2pc)
709
for (; ha_info && !error; ha_info= ha_info->next())
712
handlerton *ht= ha_info->ht();
714
Do not call two-phase commit if this particular
715
transaction is read-only. This allows for simpler
716
implementation in engines that are always read-only.
718
if (! ha_info->is_trx_read_write())
721
Sic: we know that prepare() is not NULL since otherwise
722
trans->no_2pc would have been set.
724
if ((err= ht->prepare(ht, session, all)))
726
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
729
status_var_increment(session->status_var.ha_prepare_count);
733
ha_rollback_trans(session, all);
738
error=ha_commit_one_phase(session, all) ? (cookie ? 2 : 1) : 0;
741
start_waiting_global_read_lock(session);
748
This function does not care about global read lock. A caller should.
750
int ha_commit_one_phase(Session *session, bool all)
753
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
754
bool is_real_trans=all || session->transaction.all.ha_list == 0;
755
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
758
for (; ha_info; ha_info= ha_info_next)
761
handlerton *ht= ha_info->ht();
762
if ((err= ht->commit(ht, session, all)))
764
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
767
status_var_increment(session->status_var.ha_commit_count);
768
ha_info_next= ha_info->next();
769
ha_info->reset(); /* keep it conveniently zero-filled */
774
session->transaction.xid_state.xid.null();
777
session->variables.tx_isolation=session->session_tx_isolation;
778
session->transaction.cleanup();
785
int ha_rollback_trans(Session *session, bool all)
788
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
789
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
790
bool is_real_trans=all || session->transaction.all.ha_list == 0;
793
We must not rollback the normal transaction if a statement
794
transaction is pending.
796
assert(session->transaction.stmt.ha_list == NULL ||
797
trans == &session->transaction.stmt);
801
for (; ha_info; ha_info= ha_info_next)
804
handlerton *ht= ha_info->ht();
805
if ((err= ht->rollback(ht, session, all)))
807
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
810
status_var_increment(session->status_var.ha_rollback_count);
811
ha_info_next= ha_info->next();
812
ha_info->reset(); /* keep it conveniently zero-filled */
817
session->transaction.xid_state.xid.null();
820
session->variables.tx_isolation=session->session_tx_isolation;
821
session->transaction.cleanup();
825
session->transaction_rollback_request= false;
828
If a non-transactional table was updated, warn; don't warn if this is a
829
slave thread (because when a slave thread executes a ROLLBACK, it has
830
been read from the binary log, so it's 100% sure and normal to produce
831
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
832
slave SQL thread, it would not stop the thread but just be printed in
833
the error log; but we don't want users to wonder why they have this
834
message in the error log, so we don't send it.
836
if (is_real_trans && session->transaction.all.modified_non_trans_table && session->killed != Session::KILL_CONNECTION)
837
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
838
ER_WARNING_NOT_COMPLETE_ROLLBACK,
839
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
844
This is used to commit or rollback a single statement depending on
848
Note that if the autocommit is on, then the following call inside
849
InnoDB will commit or rollback the whole transaction (= the statement). The
850
autocommit mechanism built into InnoDB is based on counting locks, but if
851
the user has used LOCK TABLES then that mechanism does not know to do the
854
int ha_autocommit_or_rollback(Session *session, int error)
856
if (session->transaction.stmt.ha_list)
860
if (ha_commit_trans(session, 0))
865
(void) ha_rollback_trans(session, 0);
866
if (session->transaction_rollback_request)
867
(void) ha_rollback(session);
870
session->variables.tx_isolation=session->session_tx_isolation;
881
static bool xacommit_handlerton(Session *,
885
handlerton *hton= plugin_data(plugin, handlerton *);
886
if (hton->state == SHOW_OPTION_YES && hton->recover)
888
hton->commit_by_xid(hton, ((struct xahton_st *)arg)->xid);
889
((struct xahton_st *)arg)->result= 0;
894
static bool xarollback_handlerton(Session *,
898
handlerton *hton= plugin_data(plugin, handlerton *);
899
if (hton->state == SHOW_OPTION_YES && hton->recover)
901
hton->rollback_by_xid(hton, ((struct xahton_st *)arg)->xid);
902
((struct xahton_st *)arg)->result= 0;
908
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
910
struct xahton_st xaop;
914
plugin_foreach(NULL, commit ? xacommit_handlerton : xarollback_handlerton,
915
DRIZZLE_STORAGE_ENGINE_PLUGIN, &xaop);
921
recover() step of xa.
924
there are three modes of operation:
925
- automatic recover after a crash
926
in this case commit_list != 0, tc_heuristic_recover==0
927
all xids from commit_list are committed, others are rolled back
928
- manual (heuristic) recover
929
in this case commit_list==0, tc_heuristic_recover != 0
930
DBA has explicitly specified that all prepared transactions should
931
be committed (or rolled back).
932
- no recovery (MySQL did not detect a crash)
933
in this case commit_list==0, tc_heuristic_recover == 0
934
there should be no prepared transactions in this case.
938
int len, found_foreign_xids, found_my_xids;
944
static bool xarecover_handlerton(Session *,
948
handlerton *hton= plugin_data(plugin, handlerton *);
949
struct xarecover_st *info= (struct xarecover_st *) arg;
952
if (hton->state == SHOW_OPTION_YES && hton->recover)
954
while ((got= hton->recover(hton, info->list, info->len)) > 0 )
956
errmsg_printf(ERRMSG_LVL_INFO, _("Found %d prepared transaction(s) in %s"),
957
got, ha_resolve_storage_engine_name(hton));
958
for (int i=0; i < got; i ++)
960
my_xid x=info->list[i].get_my_xid();
961
if (!x) // not "mine" - that is generated by external TM
963
xid_cache_insert(info->list+i, XA_PREPARED);
964
info->found_foreign_xids++;
969
info->found_my_xids++;
973
if (info->commit_list ?
974
hash_search(info->commit_list, (unsigned char *)&x, sizeof(x)) != 0 :
975
tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT)
977
hton->commit_by_xid(hton, info->list+i);
981
hton->rollback_by_xid(hton, info->list+i);
991
int ha_recover(HASH *commit_list)
993
struct xarecover_st info;
994
info.found_foreign_xids= info.found_my_xids= 0;
995
info.commit_list= commit_list;
996
info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
999
/* commit_list and tc_heuristic_recover cannot be set both */
1000
assert(info.commit_list==0 || tc_heuristic_recover==0);
1001
/* if either is set, total_ha_2pc must be set too */
1002
assert(info.dry_run || total_ha_2pc>(uint32_t)opt_bin_log);
1004
if (total_ha_2pc <= (uint32_t)opt_bin_log)
1007
if (info.commit_list)
1008
errmsg_printf(ERRMSG_LVL_INFO, _("Starting crash recovery..."));
1011
#ifndef WILL_BE_DELETED_LATER
1014
for now, only InnoDB supports 2pc. It means we can always safely
1015
rollback all pending transactions, without risking inconsistent data
1018
assert(total_ha_2pc == (uint32_t) opt_bin_log+1); // only InnoDB and binlog
1019
tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
1024
for (info.len= MAX_XID_LIST_SIZE ;
1025
info.list==0 && info.len > MIN_XID_LIST_SIZE; info.len/=2)
1027
info.list=(XID *)malloc(info.len*sizeof(XID));
1031
errmsg_printf(ERRMSG_LVL_ERROR, ER(ER_OUTOFMEMORY), info.len*sizeof(XID));
1035
plugin_foreach(NULL, xarecover_handlerton,
1036
DRIZZLE_STORAGE_ENGINE_PLUGIN, &info);
1038
free((unsigned char*)info.list);
1039
if (info.found_foreign_xids)
1040
errmsg_printf(ERRMSG_LVL_WARN, _("Found %d prepared XA transactions"),
1041
info.found_foreign_xids);
1042
if (info.dry_run && info.found_my_xids)
1044
errmsg_printf(ERRMSG_LVL_ERROR,
1045
_("Found %d prepared transactions! It means that drizzled "
1046
"was not shut down properly last time and critical "
1047
"recovery information (last binlog or %s file) was "
1048
"manually deleted after a crash. You have to start "
1049
"drizzled with the --tc-heuristic-recover switch to "
1050
"commit or rollback pending transactions."),
1051
info.found_my_xids, opt_tc_log_file);
1054
if (info.commit_list)
1055
errmsg_printf(ERRMSG_LVL_INFO, _("Crash recovery finished."));
1060
return the list of XID's to a client, the same way SHOW commands do.
1063
I didn't find in XA specs that an RM cannot return the same XID twice,
1064
so mysql_xa_recover does not filter XID's to ensure uniqueness.
1065
It can be easily fixed later, if necessary.
1067
bool mysql_xa_recover(Session *session)
1069
List<Item> field_list;
1070
Protocol *protocol= session->protocol;
1074
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1075
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1076
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1077
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1079
if (protocol->send_fields(&field_list,
1080
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1083
pthread_mutex_lock(&LOCK_xid_cache);
1084
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1086
if (xs->xa_state==XA_PREPARED)
1088
protocol->prepare_for_resend();
1089
protocol->store_int64_t((int64_t)xs->xid.formatID, false);
1090
protocol->store_int64_t((int64_t)xs->xid.gtrid_length, false);
1091
protocol->store_int64_t((int64_t)xs->xid.bqual_length, false);
1092
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1094
if (protocol->write())
1096
pthread_mutex_unlock(&LOCK_xid_cache);
1102
pthread_mutex_unlock(&LOCK_xid_cache);
1109
This function should be called when MySQL sends rows of a SELECT result set
1110
or the EOF mark to the client. It releases a possible adaptive hash index
1111
S-latch held by session in InnoDB and also releases a possible InnoDB query
1112
FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a session to
1113
keep them over several calls of the InnoDB handler interface when a join
1114
is executed. But when we let control pass to the client they have
1115
to be released because if the application program uses mysql_use_result(),
1116
it may deadlock on the S-latch if the application on another connection
1117
performs another SQL query. In MySQL-4.1 this is even more important because
1118
there a connection can have several SELECT queries open at the same time.
1120
@param session the thread handle of the current connection
1125
static bool release_temporary_latches(Session *session, plugin_ref plugin,
1128
handlerton *hton= plugin_data(plugin, handlerton *);
1130
if (hton->state == SHOW_OPTION_YES && hton->release_temporary_latches)
1131
hton->release_temporary_latches(hton, session);
1137
int ha_release_temporary_latches(Session *session)
1139
plugin_foreach(session, release_temporary_latches, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1145
int ha_rollback_to_savepoint(Session *session, SAVEPOINT *sv)
1148
Session_TRANS *trans= &session->transaction.all;
1149
Ha_trx_info *ha_info, *ha_info_next;
1153
rolling back to savepoint in all storage engines that were part of the
1154
transaction when the savepoint was set
1156
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
1159
handlerton *ht= ha_info->ht();
1161
assert(ht->savepoint_set != 0);
1162
if ((err= ht->savepoint_rollback(ht, session,
1163
(unsigned char *)(sv+1)+ht->savepoint_offset)))
1165
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1168
status_var_increment(session->status_var.ha_savepoint_rollback_count);
1169
trans->no_2pc|= ht->prepare == 0;
1172
rolling back the transaction in all storage engines that were not part of
1173
the transaction when the savepoint was set
1175
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
1176
ha_info= ha_info_next)
1179
handlerton *ht= ha_info->ht();
1180
if ((err= ht->rollback(ht, session, !(0))))
1182
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1185
status_var_increment(session->status_var.ha_rollback_count);
1186
ha_info_next= ha_info->next();
1187
ha_info->reset(); /* keep it conveniently zero-filled */
1189
trans->ha_list= sv->ha_list;
1195
according to the sql standard (ISO/IEC 9075-2:2003)
1196
section "4.33.4 SQL-statements and transaction states",
1197
SAVEPOINT is *not* transaction-initiating SQL-statement
1199
int ha_savepoint(Session *session, SAVEPOINT *sv)
1202
Session_TRANS *trans= &session->transaction.all;
1203
Ha_trx_info *ha_info= trans->ha_list;
1204
for (; ha_info; ha_info= ha_info->next())
1207
handlerton *ht= ha_info->ht();
1209
if (! ht->savepoint_set)
1211
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1215
if ((err= ht->savepoint_set(ht, session, (unsigned char *)(sv+1)+ht->savepoint_offset)))
1217
my_error(ER_GET_ERRNO, MYF(0), err);
1220
status_var_increment(session->status_var.ha_savepoint_count);
1223
Remember the list of registered storage engines. All new
1224
engines are prepended to the beginning of the list.
1226
sv->ha_list= trans->ha_list;
1230
int ha_release_savepoint(Session *session, SAVEPOINT *sv)
1233
Ha_trx_info *ha_info= sv->ha_list;
1235
for (; ha_info; ha_info= ha_info->next())
1238
handlerton *ht= ha_info->ht();
1239
/* Savepoint life time is enclosed into transaction life time. */
1241
if (!ht->savepoint_release)
1243
if ((err= ht->savepoint_release(ht, session,
1244
(unsigned char *)(sv+1) + ht->savepoint_offset)))
1246
my_error(ER_GET_ERRNO, MYF(0), err);
1254
static bool snapshot_handlerton(Session *session, plugin_ref plugin, void *arg)
1256
handlerton *hton= plugin_data(plugin, handlerton *);
1257
if (hton->state == SHOW_OPTION_YES &&
1258
hton->start_consistent_snapshot)
1260
hton->start_consistent_snapshot(hton, session);
1261
*((bool *)arg)= false;
1266
int ha_start_consistent_snapshot(Session *session)
1270
plugin_foreach(session, snapshot_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, &warn);
1273
Same idea as when one wants to CREATE TABLE in one engine which does not
1277
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
1278
"This Drizzle server does not support any "
1279
"consistent-read capable storage engine");
1284
static bool flush_handlerton(Session *,
1288
handlerton *hton= plugin_data(plugin, handlerton *);
1289
if (hton->state == SHOW_OPTION_YES && hton->flush_logs &&
1290
hton->flush_logs(hton))
1296
bool ha_flush_logs(handlerton *db_type)
1298
if (db_type == NULL)
1300
if (plugin_foreach(NULL, flush_handlerton,
1301
DRIZZLE_STORAGE_ENGINE_PLUGIN, 0))
1306
if (db_type->state != SHOW_OPTION_YES ||
1307
(db_type->flush_logs && db_type->flush_logs(db_type)))
1313
static const char *check_lowercase_names(handler *file, const char *path,
1316
if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1319
/* Ensure that table handler get path in lower case */
1320
if (tmp_path != path)
1321
strcpy(tmp_path, path);
1324
we only should turn into lowercase database/table part
1325
so start the process after homedirectory
1327
my_casedn_str(files_charset_info, tmp_path + drizzle_data_home_len);
1333
An interceptor to hijack the text of the error message without
1334
setting an error in the thread. We need the text to present it
1335
in the form of a warning to the user.
1338
struct Ha_delete_table_error_handler: public Internal_error_handler
1341
Ha_delete_table_error_handler() : Internal_error_handler() {}
1342
virtual bool handle_error(uint32_t sql_errno,
1343
const char *message,
1344
DRIZZLE_ERROR::enum_warning_level level,
1346
char buff[DRIZZLE_ERRMSG_SIZE];
1351
Ha_delete_table_error_handler::
1352
handle_error(uint32_t ,
1353
const char *message,
1354
DRIZZLE_ERROR::enum_warning_level ,
1357
/* Grab the error message */
1358
strncpy(buff, message, sizeof(buff)-1);
1363
struct handlerton_delete_table_args {
1370
static bool deletetable_handlerton(Session *,
1374
struct handlerton_delete_table_args *dtargs= (struct handlerton_delete_table_args *) args;
1376
Session *session= dtargs->session;
1377
const char *path= dtargs->path;
1380
char tmp_path[FN_REFLEN];
1382
if(dtargs->error!=ENOENT) /* already deleted table */
1385
handlerton *table_type= plugin_data(plugin, handlerton *);
1390
if(!(table_type->state == SHOW_OPTION_YES && table_type->create))
1393
if ((file= table_type->create(table_type, NULL, session->mem_root)))
1398
path= check_lowercase_names(file, path, tmp_path);
1399
int error= file->ha_delete_table(path);
1403
dtargs->error= error;
1405
delete dtargs->file;
1414
This should return ENOENT if the file doesn't exist.
1415
The .frm file will be deleted only if we return 0 or ENOENT
1417
int ha_delete_table(Session *session, const char *path,
1418
const char *db, const char *alias, bool generate_warning)
1420
TABLE_SHARE dummy_share;
1423
struct handlerton_delete_table_args dtargs;
1424
dtargs.error= ENOENT;
1425
dtargs.session= session;
1429
plugin_foreach(NULL, deletetable_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1432
memset(&dummy_table, 0, sizeof(dummy_table));
1433
memset(&dummy_share, 0, sizeof(dummy_share));
1434
dummy_table.s= &dummy_share;
1436
if (dtargs.error && generate_warning)
1439
Because file->print_error() uses my_error() to generate the error message
1440
we use an internal error handler to intercept it and store the text
1441
in a temporary buffer. Later the message will be presented to user
1444
Ha_delete_table_error_handler ha_delete_table_error_handler;
1446
/* Fill up structures that print_error may need */
1447
dummy_share.path.str= (char*) path;
1448
dummy_share.path.length= strlen(path);
1449
dummy_share.db.str= (char*) db;
1450
dummy_share.db.length= strlen(db);
1451
dummy_share.table_name.str= (char*) alias;
1452
dummy_share.table_name.length= strlen(alias);
1453
dummy_table.alias= alias;
1457
handler *file= dtargs.file;
1458
file->change_table_ptr(&dummy_table, &dummy_share);
1460
session->push_internal_handler(&ha_delete_table_error_handler);
1461
file->print_error(dtargs.error, 0);
1463
session->pop_internal_handler();
1466
dtargs.error= -1; /* General form of fail. maybe bad FRM */
1469
XXX: should we convert *all* errors to warnings here?
1470
What if the error is fatal?
1472
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_ERROR, dtargs.error,
1473
ha_delete_table_error_handler.buff);
1479
return dtargs.error;
53
1482
/****************************************************************************
54
** General Cursor functions
1483
** General handler functions
55
1484
****************************************************************************/
56
Cursor::Cursor(plugin::StorageEngine &engine_arg,
60
estimation_rows_to_insert(0),
62
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
63
ref_length(sizeof(internal::my_off_t)),
66
next_insert_id(0), insert_id_for_cur_row(0)
71
assert(locked == false);
72
/* TODO: assert(inited == NONE); */
77
* @note this is only used in
78
* optimizer::QuickRangeSelect::init_ror_merged_scan(bool reuse_handler) as
79
* of the writing of this comment. -Brian
81
Cursor *Cursor::clone(memory::Root *mem_root)
83
Cursor *new_handler= getTable()->getMutableShare()->db_type()->getCursor(*getTable());
1485
handler *handler::clone(MEM_ROOT *mem_root)
1487
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
86
Allocate Cursor->ref here because otherwise ha_open will allocate it
1489
Allocate handler->ref here because otherwise ha_open will allocate it
87
1490
on this->table->mem_root and we will not be able to reclaim that memory
88
when the clone Cursor object is destroyed.
1491
when the clone handler object is destroyed.
90
if (!(new_handler->ref= (unsigned char*) mem_root->alloc_root(ALIGN_SIZE(ref_length)*2)))
1493
if (!(new_handler->ref= (unsigned char*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
93
TableIdentifier identifier(getTable()->getShare()->getSchemaName(),
94
getTable()->getShare()->getTableName(),
95
getTable()->getShare()->getType());
97
if (new_handler && !new_handler->ha_open(identifier,
98
getTable()->getDBStat(),
1495
if (new_handler && !new_handler->ha_open(table,
1496
table->s->normalized_path.str,
99
1498
HA_OPEN_IGNORE_IF_LOCKED))
100
1499
return new_handler;
106
given a buffer with a key value, and a map of keyparts
107
that are present in this value, returns the length of the value
109
uint32_t Cursor::calculate_key_len(uint32_t key_position, key_part_map keypart_map_arg)
111
/* works only with key prefixes */
112
assert(((keypart_map_arg + 1) & keypart_map_arg) == 0);
114
const KeyPartInfo *key_part_found= getTable()->getShare()->getKeyInfo(key_position).key_part;
115
const KeyPartInfo *end_key_part_found= key_part_found + getTable()->getShare()->getKeyInfo(key_position).key_parts;
118
while (key_part_found < end_key_part_found && keypart_map_arg)
120
length+= key_part_found->store_length;
121
keypart_map_arg >>= 1;
127
int Cursor::startIndexScan(uint32_t idx, bool sorted)
1503
int handler::ha_index_init(uint32_t idx, bool sorted)
130
assert(inited == NONE);
131
if (!(result= doStartIndexScan(idx, sorted)))
1506
assert(inited==NONE);
1507
if (!(result= index_init(idx, sorted)))
133
1509
end_range= NULL;
137
int Cursor::endIndexScan()
1513
int handler::ha_index_end()
139
1515
assert(inited==INDEX);
141
1517
end_range= NULL;
142
return(doEndIndexScan());
1518
return(index_end());
145
int Cursor::startTableScan(bool scan)
1521
int handler::ha_rnd_init(bool scan)
148
1524
assert(inited==NONE || (inited==RND && scan));
149
inited= (result= doStartTableScan(scan)) ? NONE: RND;
1525
inited= (result= rnd_init(scan)) ? NONE: RND;
154
int Cursor::endTableScan()
1529
int handler::ha_rnd_end()
156
1531
assert(inited==RND);
158
return(doEndTableScan());
161
int Cursor::ha_index_or_rnd_end()
163
return inited == INDEX ? endIndexScan() : inited == RND ? endTableScan() : 0;
166
void Cursor::ha_start_bulk_insert(ha_rows rows)
1536
int handler::ha_index_or_rnd_end()
1538
return inited == INDEX ? ha_index_end() : inited == RND ? ha_rnd_end() : 0;
1541
handler::Table_flags handler::ha_table_flags() const
1543
return cached_table_flags;
1546
void handler::ha_start_bulk_insert(ha_rows rows)
168
1548
estimation_rows_to_insert= rows;
169
1549
start_bulk_insert(rows);
172
int Cursor::ha_end_bulk_insert()
1552
int handler::ha_end_bulk_insert()
174
1554
estimation_rows_to_insert= 0;
175
1555
return end_bulk_insert();
178
const key_map *Cursor::keys_to_use_for_scanning()
1558
void handler::change_table_ptr(Table *table_arg, TABLE_SHARE *share)
1564
const key_map *handler::keys_to_use_for_scanning()
180
1566
return &key_map_empty;
183
bool Cursor::has_transactions()
185
return (getTable()->getShare()->db_type()->check_flag(HTON_BIT_DOES_TRANSACTIONS));
188
void Cursor::ha_statistic_increment(uint64_t system_status_var::*offset) const
190
(getTable()->in_use->status_var.*offset)++;
193
void **Cursor::ha_data(Session *session) const
195
return session->getEngineData(getEngine());
198
bool Cursor::is_fatal_error(int error, uint32_t flags)
1569
bool handler::has_transactions()
1571
return (ha_table_flags() & HA_NO_TRANSACTIONS) == 0;
1574
void handler::ha_statistic_increment(ulong SSV::*offset) const
1576
status_var_increment(table->in_use->status_var.*offset);
1579
void **handler::ha_data(Session *session) const
1581
return session_ha_data(session, ht);
1584
Session *handler::ha_session(void) const
1586
assert(!table || !table->in_use || table->in_use == current_session);
1587
return (table && table->in_use) ? table->in_use : current_session;
1591
bool handler::is_fatal_error(int error, uint32_t flags)
201
1594
((flags & HA_CHECK_DUP_KEY) &&
1142
3353
while ((result == HA_ERR_END_OF_FILE) && !range_res);
1144
3355
*range_info= mrr_cur_range.ptr;
3360
/* **************************************************************************
3361
* DS-MRR implementation
3362
***************************************************************************/
3365
DS-MRR: Initialize and start MRR scan
3367
Initialize and start the MRR scan. Depending on the mode parameter, this
3368
may use default or DS-MRR implementation.
3370
@param h Table handler to be used
3371
@param key Index to be used
3372
@param seq_funcs Interval sequence enumeration functions
3373
@param seq_init_param Interval sequence enumeration parameter
3374
@param n_ranges Number of ranges in the sequence.
3375
@param mode HA_MRR_* modes to use
3376
@param buf INOUT Buffer to use
3378
@retval 0 Ok, Scan started.
3382
int DsMrr_impl::dsmrr_init(handler *h_in, KEY *key,
3383
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3384
uint32_t n_ranges, uint32_t mode, HANDLER_BUFFER *buf)
3388
Item *pushed_cond= NULL;
3390
keyno= h_in->active_index;
3392
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3394
use_default_impl= true;
3395
return(h_in->handler::multi_range_read_init(seq_funcs, seq_init_param,
3396
n_ranges, mode, buf));
3398
rowids_buf= buf->buffer;
3399
//psergey-todo: don't add key_length as it is not needed anymore
3400
rowids_buf += key->key_length + h_in->ref_length;
3402
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3403
rowids_buf_end= buf->buffer_end;
3405
elem_size= h_in->ref_length + (int)is_mrr_assoc * sizeof(void*);
3406
rowids_buf_last= rowids_buf +
3407
((rowids_buf_end - rowids_buf)/ elem_size)*
3409
rowids_buf_end= rowids_buf_last;
3411
/* Create a separate handler object to do rndpos() calls. */
3412
Session *session= current_session;
3413
if (!(new_h2= h_in->clone(session->mem_root)) ||
3414
new_h2->ha_external_lock(session, F_RDLCK))
3420
if (keyno == h_in->pushed_idx_cond_keyno)
3421
pushed_cond= h_in->pushed_idx_cond;
3422
if (h_in->ha_index_end())
3429
table->prepare_for_position();
3430
new_h2->extra(HA_EXTRA_KEYREAD);
3432
if (h2->ha_index_init(keyno, false) ||
3433
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3436
use_default_impl= false;
3439
h2->idx_cond_push(keyno, pushed_cond);
3440
if (dsmrr_fill_buffer(new_h2))
3444
If the above call has scanned through all intervals in *seq, then
3445
adjust *buf to indicate that the remaining buffer space will not be used.
3448
buf->end_of_used_area= rowids_buf_last;
3450
if (h_in->ha_rnd_init(false))
3455
h2->ha_index_or_rnd_end();
3456
h2->ha_external_lock(session, F_UNLCK);
3463
void DsMrr_impl::dsmrr_close()
3467
h2->ha_external_lock(current_session, F_UNLCK);
3472
use_default_impl= true;
3477
static int rowid_cmp(void *h, unsigned char *a, unsigned char *b)
3479
return ((handler*)h)->cmp_ref(a, b);
3484
DS-MRR: Fill the buffer with rowids and sort it by rowid
3486
{This is an internal function of DiskSweep MRR implementation}
3487
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
3488
buffer. When the buffer is full or scan is completed, sort the buffer by
3491
The function assumes that rowids buffer is empty when it is invoked.
3493
@param h Table handler
3495
@retval 0 OK, the next portion of rowids is in the buffer,
3500
int DsMrr_impl::dsmrr_fill_buffer(handler *)
3505
rowids_buf_cur= rowids_buf;
3506
while ((rowids_buf_cur < rowids_buf_end) &&
3507
!(res= h2->handler::multi_range_read_next(&range_info)))
3509
/* Put rowid, or {rowid, range_id} pair into the buffer */
3510
h2->position(table->record[0]);
3511
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
3512
rowids_buf_cur += h->ref_length;
3516
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
3517
rowids_buf_cur += sizeof(void*);
3521
if (res && res != HA_ERR_END_OF_FILE)
3523
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
3525
/* Sort the buffer contents by rowid */
3526
uint32_t elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3527
uint32_t n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
3529
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
3531
rowids_buf_last= rowids_buf_cur;
3532
rowids_buf_cur= rowids_buf;
3538
DS-MRR implementation: multi_range_read_next() function
3541
int DsMrr_impl::dsmrr_next(handler *h_in, char **range_info)
3545
if (use_default_impl)
3546
return h_in->handler::multi_range_read_next(range_info);
3548
if (rowids_buf_cur == rowids_buf_last)
3552
res= HA_ERR_END_OF_FILE;
3555
res= dsmrr_fill_buffer(h);
3560
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
3561
if (rowids_buf_cur == rowids_buf_last)
3563
res= HA_ERR_END_OF_FILE;
3567
res= h_in->rnd_pos(table->record[0], rowids_buf_cur);
3568
rowids_buf_cur += h_in->ref_length;
3571
memcpy(range_info, rowids_buf_cur, sizeof(void*));
3572
rowids_buf_cur += sizeof(void*);
3583
DS-MRR implementation: multi_range_read_info() function
3585
int DsMrr_impl::dsmrr_info(uint32_t keyno, uint32_t n_ranges, uint32_t rows, uint32_t *bufsz,
3586
uint32_t *flags, COST_VECT *cost)
3589
uint32_t def_flags= *flags;
3590
uint32_t def_bufsz= *bufsz;
3592
/* Get cost/flags/mem_usage of default MRR implementation */
3593
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
3597
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3598
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
3600
/* Default implementation is chosen */
3609
DS-MRR Implementation: multi_range_read_info_const() function
3612
ha_rows DsMrr_impl::dsmrr_info_const(uint32_t keyno, RANGE_SEQ_IF *seq,
3613
void *seq_init_param, uint32_t n_ranges,
3614
uint32_t *bufsz, uint32_t *flags, COST_VECT *cost)
3617
uint32_t def_flags= *flags;
3618
uint32_t def_bufsz= *bufsz;
3619
/* Get cost/flags/mem_usage of default MRR implementation */
3620
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
3621
n_ranges, &def_bufsz,
3623
if (rows == HA_POS_ERROR)
3625
/* Default implementation can't perform MRR scan => we can't either */
3630
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
3631
use the default MRR implementation (we need it for UPDATE/DELETE).
3632
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
3634
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3635
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
3642
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
3649
Check if key has partially-covered columns
3651
We can't use DS-MRR to perform range scans when the ranges are over
3652
partially-covered keys, because we'll not have full key part values
3653
(we'll have their prefixes from the index) and will not be able to check
3654
if we've reached the end of the range.
3656
@param keyno Key to check
3659
Allow use of DS-MRR in cases where the index has partially-covered
3660
components but they are not used for scanning.
3666
bool DsMrr_impl::key_uses_partial_cols(uint32_t keyno)
3668
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
3669
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
3670
for (; kp != kp_end; kp++)
3672
if (!kp->field->part_of_key.is_set(keyno))
3680
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
3682
Make the choice between using Default MRR implementation and DS-MRR.
3683
This function contains common functionality factored out of dsmrr_info()
3684
and dsmrr_info_const(). The function assumes that the default MRR
3685
implementation's applicability requirements are satisfied.
3687
@param keyno Index number
3688
@param rows E(full rows to be retrieved)
3689
@param flags IN MRR flags provided by the MRR user
3690
OUT If DS-MRR is chosen, flags of DS-MRR implementation
3691
else the value is not modified
3692
@param bufsz IN If DS-MRR is chosen, buffer use of DS-MRR implementation
3693
else the value is not modified
3694
@param cost IN Cost of default MRR implementation
3695
OUT If DS-MRR is chosen, cost of DS-MRR scan
3696
else the value is not modified
3698
@retval true Default MRR implementation should be used
3699
@retval false DS-MRR implementation should be used
3702
bool DsMrr_impl::choose_mrr_impl(uint32_t keyno, ha_rows rows, uint32_t *flags,
3703
uint32_t *bufsz, COST_VECT *cost)
3705
COST_VECT dsmrr_cost;
3707
Session *session= current_session;
3708
if ((session->variables.optimizer_use_mrr == 2) ||
3709
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
3710
(keyno == table->s->primary_key &&
3711
h->primary_key_is_clustered()) ||
3712
key_uses_partial_cols(keyno))
3714
/* Use the default implementation */
3715
*flags |= HA_MRR_USE_DEFAULT_IMPL;
3719
uint32_t add_len= table->key_info[keyno].key_length + h->ref_length;
3721
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
3727
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
3728
DS-MRR and Default implementations cost. This allows one to force use of
3729
DS-MRR whenever it is applicable without affecting other cost-based
3732
if ((force_dsmrr= (session->variables.optimizer_use_mrr == 1)) &&
3733
dsmrr_cost.total_cost() > cost->total_cost())
3736
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
3738
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
3739
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
3745
/* Use the default MRR implementation */
3752
static void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost);
3756
Get cost of DS-MRR scan
3758
@param keynr Index to be used
3759
@param rows E(Number of rows to be scanned)
3760
@param flags Scan parameters (HA_MRR_* flags)
3761
@param buffer_size INOUT Buffer size
3762
@param cost OUT The cost
3765
@retval true Error, DS-MRR cannot be used (the buffer is too small
3769
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint32_t keynr, ha_rows rows, uint32_t flags,
3770
uint32_t *buffer_size, COST_VECT *cost)
3772
uint32_t max_buff_entries, elem_size;
3773
ha_rows rows_in_full_step, rows_in_last_step;
3774
uint32_t n_full_steps;
3775
double index_read_cost;
3777
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
3778
max_buff_entries = *buffer_size / elem_size;
3780
if (!max_buff_entries)
3781
return true; /* Buffer has not enough space for even 1 rowid */
3783
/* Number of iterations we'll make with full buffer */
3784
n_full_steps= (uint32_t)floor(rows2double(rows) / max_buff_entries);
3787
Get numbers of rows we'll be processing in
3788
- non-last sweep, with full buffer
3789
- last iteration, with non-full buffer
3791
rows_in_full_step= max_buff_entries;
3792
rows_in_last_step= rows % max_buff_entries;
3794
/* Adjust buffer size if we expect to use only part of the buffer */
3797
get_sort_and_sweep_cost(table, rows, cost);
3798
cost->multiply(n_full_steps);
3803
*buffer_size= cmax((ulong)*buffer_size,
3804
(size_t)(1.2*rows_in_last_step) * elem_size +
3805
h->ref_length + table->key_info[keynr].key_length);
3808
COST_VECT last_step_cost;
3809
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
3810
cost->add(&last_step_cost);
3812
if (n_full_steps != 0)
3813
cost->mem_cost= *buffer_size;
3815
cost->mem_cost= (double)rows_in_last_step * elem_size;
3817
/* Total cost of all index accesses */
3818
index_read_cost= h->index_only_read_time(keynr, (double)rows);
3819
cost->add_io(index_read_cost, 1 /* Random seeks */);
3825
Get cost of one sort-and-sweep step
3828
get_sort_and_sweep_cost()
3829
table Table being accessed
3830
nrows Number of rows to be sorted and retrieved
3834
Get cost of these operations:
3835
- sort an array of #nrows ROWIDs using qsort
3836
- read #nrows records from table in a sweep.
3840
void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost)
3844
get_sweep_read_cost(table, nrows, false, cost);
3845
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
3846
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
3849
cost->cpu_cost += cmp_op * log2(cmp_op);
3857
Get cost of reading nrows table records in a "disk sweep"
3859
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made
3860
for an ordered sequence of rowids.
3862
We assume hard disk IO. The read is performed as follows:
3864
1. The disk head is moved to the needed cylinder
3865
2. The controller waits for the plate to rotate
3866
3. The data is transferred
3868
Time to do #3 is insignificant compared to #2+#1.
3870
Time to move the disk head is proportional to head travel distance.
3872
Time to wait for the plate to rotate depends on whether the disk head
3875
If disk head wasn't moved, the wait time is proportional to distance
3876
between the previous block and the block we're reading.
3878
If the head was moved, we don't know how much we'll need to wait for the
3879
plate to rotate. We assume the wait time to be a variate with a mean of
3880
0.5 of full rotation time.
3882
Our cost units are "random disk seeks". The cost of random disk seek is
3883
actually not a constant, it depends on the range of cylinders we're going
3884
to access. We make it constant by introducing a fuzzy concept of "typical
3885
datafile length" (it's fuzzy as it's hard to tell whether it should
3886
include index file, temp.tables etc). Then random seek cost is:
3888
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
3890
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
3892
@param table Table to be accessed
3893
@param nrows Number of rows to retrieve
3894
@param interrupted true <=> Assume that the disk sweep will be
3895
interrupted by other disk IO. false - otherwise.
3896
@param cost OUT The cost.
3899
void get_sweep_read_cost(Table *table, ha_rows nrows, bool interrupted,
3903
if (table->file->primary_key_is_clustered())
3905
cost->io_count= table->file->read_time(table->s->primary_key,
3906
(uint32_t) nrows, nrows);
3911
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
3913
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
3914
if (busy_blocks < 1.0)
3917
cost->io_count= busy_blocks;
3921
/* Assume reading is done in one 'sweep' */
3922
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
3923
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);