23
23
Handler-calling-functions
26
#include "drizzled/server_includes.h"
27
#include "libdrizzleclient/libdrizzle.h"
28
#include "mysys/hash.h"
30
29
#include "drizzled/error.h"
31
#include "drizzled/field/epoch.h"
32
30
#include "drizzled/gettext.h"
33
#include "drizzled/internal/my_sys.h"
34
#include "drizzled/item/empty_string.h"
35
#include "drizzled/item/int.h"
36
#include "drizzled/lock.h"
37
#include "drizzled/message/table.h"
38
#include "drizzled/my_hash.h"
39
#include "drizzled/optimizer/cost_vector.h"
40
#include "drizzled/plugin/client.h"
41
#include "drizzled/plugin/event_observer.h"
42
#include "drizzled/plugin/storage_engine.h"
31
#include "drizzled/data_home.h"
43
32
#include "drizzled/probes.h"
33
#include "drizzled/sql_parse.h"
34
#include "drizzled/cost_vect.h"
44
36
#include "drizzled/session.h"
45
37
#include "drizzled/sql_base.h"
46
#include "drizzled/sql_parse.h"
47
#include "drizzled/transaction_services.h"
38
#include "drizzled/replicator.h"
39
#include "drizzled/lock.h"
40
#include "drizzled/item/int.h"
41
#include "drizzled/item/empty_string.h"
42
#include "drizzled/unireg.h" // for mysql_frm_type
43
#include "drizzled/field/timestamp.h"
44
#include "drizzled/serialize/table.pb.h"
46
#if defined(CMATH_NAMESPACE)
47
using namespace CMATH_NAMESPACE;
51
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NULL,0}, {NULL,0} };
53
/* number of entries in handlertons[] */
55
/* number of storage engines (from handlertons[]) that support 2pc */
56
uint32_t total_ha_2pc= 0;
57
/* size of savepoint storage area (see ha_init) */
58
uint32_t savepoint_alloc_size= 0;
60
const char *ha_row_type[] = {
61
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
64
const char *tx_isolation_names[] =
65
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
68
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
69
tx_isolation_names, NULL};
71
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
72
uint32_t known_extensions_id= 0;
76
Register handler error messages for use with my_error().
84
int ha_init_errors(void)
86
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
89
/* Allocate a pointer array for the error message strings. */
90
/* Zerofill it to avoid uninitialized gaps. */
91
if (! (errmsgs= (const char**) malloc(HA_ERR_ERRORS * sizeof(char*))))
93
memset(errmsgs, 0, HA_ERR_ERRORS * sizeof(char *));
95
/* Set the dedicated error messages. */
96
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
97
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
98
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
99
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
100
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
101
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
102
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
103
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
104
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
105
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
106
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
107
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
108
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
109
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
110
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
111
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
112
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
113
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
114
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
115
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
116
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
117
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
118
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
119
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
120
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
121
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
122
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
123
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
124
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
125
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
126
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
127
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
128
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
129
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
130
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
131
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
132
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
133
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
134
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
135
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
136
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
138
/* Register the error messages for use with my_error(). */
139
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
144
Unregister handler error messages.
151
static int ha_finish_errors(void)
153
const char **errmsgs;
155
/* Unregister the error messages and retrieve the pointer array so it can be freed. */
156
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
158
free((unsigned char*) errmsgs);
166
assert(total_ha < MAX_HA);
168
Check if there is a transaction-capable storage engine besides the
169
binary log (which is considered a transaction-capable storage engine in
172
opt_using_transactions= total_ha>(uint32_t)opt_bin_log;
173
savepoint_alloc_size+= sizeof(SAVEPOINT);
182
This should eventually be based on the graceful shutdown flag.
183
So if flag is equal to HA_PANIC_CLOSE, the deallocate
186
if (ha_finish_errors())
192
static bool dropdb_handlerton(Session *,
196
handlerton *hton= plugin_data(plugin, handlerton *);
197
if (hton->state == SHOW_OPTION_YES && hton->drop_database)
198
hton->drop_database(hton, (char *)path);
203
void ha_drop_database(char* path)
205
plugin_foreach(NULL, dropdb_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, path);
209
static bool closecon_handlerton(Session *session, plugin_ref plugin,
212
handlerton *hton= plugin_data(plugin, handlerton *);
214
there's no need to rollback here as all transactions must
215
be rolled back already
217
if (hton->state == SHOW_OPTION_YES && hton->close_connection &&
218
session_get_ha_data(session, hton))
219
hton->close_connection(hton, session);
226
don't bother to rollback here, it's done already
228
void ha_close_connection(Session* session)
230
plugin_foreach(session, closecon_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, 0);
233
/* ========================================================================
234
======================= TRANSACTIONS ===================================*/
237
Transaction handling in the server
238
==================================
240
In each client connection, MySQL maintains two transactional
242
- a statement transaction,
243
- a standard, also called normal transaction.
247
"Statement transaction" is a non-standard term that comes
248
from the times when MySQL supported BerkeleyDB storage engine.
250
First of all, it should be said that in BerkeleyDB auto-commit
251
mode auto-commits operations that are atomic to the storage
252
engine itself, such as a write of a record, and are too
253
high-granular to be atomic from the application perspective
254
(MySQL). One SQL statement could involve many BerkeleyDB
255
auto-committed operations and thus BerkeleyDB auto-commit was of
258
Secondly, instead of SQL standard savepoints, BerkeleyDB
259
provided the concept of "nested transactions". In a nutshell,
260
transactions could be arbitrarily nested, but when the parent
261
transaction was committed or aborted, all its child (nested)
262
transactions were committed or aborted as well.
263
Commit of a nested transaction, in turn, made its changes
264
visible, but not durable: it destroyed the nested transaction,
265
all its changes would become available to the parent and
266
currently active nested transactions of this parent.
268
So the mechanism of nested transactions was employed to
269
provide "all or nothing" guarantee of SQL statements
270
required by the standard.
271
A nested transaction would be created at start of each SQL
272
statement, and destroyed (committed or aborted) at statement
273
end. Such nested transaction was internally referred to as
274
a "statement transaction" and gave birth to the term.
276
<Historical note ends>
278
Since then a statement transaction is started for each statement
279
that accesses transactional tables or uses the binary log. If
280
the statement succeeds, the statement transaction is committed.
281
If the statement fails, the transaction is rolled back. Commits
282
of statement transactions are not durable -- each such
283
transaction is nested in the normal transaction, and if the
284
normal transaction is rolled back, the effects of all enclosed
285
statement transactions are undone as well. Technically,
286
a statement transaction can be viewed as a savepoint which is
287
maintained automatically in order to make effects of one
290
The normal transaction is started by the user and is ended
291
usually upon a user request as well. The normal transaction
292
encloses transactions of all statements issued between
293
its beginning and its end.
294
In autocommit mode, the normal transaction is equivalent
295
to the statement transaction.
297
Since MySQL supports PSEA (pluggable storage engine
298
architecture), more than one transactional engine can be
299
active at a time. Hence transactions, from the server
300
point of view, are always distributed. In particular,
301
transactional state is maintained independently for each
302
engine. In order to commit a transaction the two phase
303
commit protocol is employed.
305
Not all statements are executed in context of a transaction.
306
Administrative and status information statements do not modify
307
engine data, and thus do not start a statement transaction and
308
also have no effect on the normal transaction. Examples of such
309
statements are SHOW STATUS and RESET SLAVE.
311
Similarly DDL statements are not transactional,
312
and therefore a transaction is [almost] never started for a DDL
313
statement. The difference between a DDL statement and a purely
314
administrative statement though is that a DDL statement always
315
commits the current transaction before proceeding, if there is
318
At last, SQL statements that work with non-transactional
319
engines also have no effect on the transaction state of the
320
connection. Even though they are written to the binary log,
321
and the binary log is, overall, transactional, the writes
322
are done in "write-through" mode, directly to the binlog
323
file, followed by an OS cache sync, in other words,
324
bypassing the binlog undo log (translog).
325
They do not commit the current normal transaction.
326
A failure of a statement that uses non-transactional tables
327
would cause a rollback of the statement transaction, but
328
in case there no non-transactional tables are used,
329
no statement transaction is started.
334
The server stores its transaction-related data in
335
session->transaction. This structure has two members of type
336
Session_TRANS. These members correspond to the statement and
337
normal transactions respectively:
339
- session->transaction.stmt contains a list of engines
340
that are participating in the given statement
341
- session->transaction.all contains a list of engines that
342
have participated in any of the statement transactions started
343
within the context of the normal transaction.
344
Each element of the list contains a pointer to the storage
345
engine, engine-specific transactional data, and engine-specific
348
In autocommit mode session->transaction.all is empty.
349
Instead, data of session->transaction.stmt is
350
used to commit/rollback the normal transaction.
352
The list of registered engines has a few important properties:
353
- no engine is registered in the list twice
354
- engines are present in the list in a reverse temporal order --
355
new participants are always added to the beginning of the list.
357
Transaction life cycle
358
----------------------
360
When a new connection is established, session->transaction
361
members are initialized to an empty state.
362
If a statement uses any tables, all affected engines
363
are registered in the statement engine list. In
364
non-autocommit mode, the same engines are registered in
365
the normal transaction list.
366
At the end of the statement, the server issues a commit
367
or a roll back for all engines in the statement list.
368
At this point transaction flags of an engine, if any, are
369
propagated from the statement list to the list of the normal
371
When commit/rollback is finished, the statement list is
372
cleared. It will be filled in again by the next statement,
373
and emptied again at the next statement's end.
375
The normal transaction is committed in a similar way
376
(by going over all engines in session->transaction.all list)
377
but at different times:
378
- when a COMMIT SQL statement is issued by the user
379
- implicitly, by the server, at the beginning of a DDL statement
380
or SET AUTOCOMMIT={0|1} statement.
382
The normal transaction can be rolled back as well:
383
- if the user has requested so, by issuing ROLLBACK SQL
385
- if one of the storage engines requested a rollback
386
by setting session->transaction_rollback_request. This may
387
happen in case, e.g., when the transaction in the engine was
388
chosen a victim of the internal deadlock resolution algorithm
389
and rolled back internally. When such a situation happens, there
390
is little the server can do and the only option is to rollback
391
transactions in all other participating engines. In this case
392
the rollback is accompanied by an error sent to the user.
394
As follows from the use cases above, the normal transaction
395
is never committed when there is an outstanding statement
396
transaction. In most cases there is no conflict, since
397
commits of the normal transaction are issued by a stand-alone
398
administrative or DDL statement, thus no outstanding statement
399
transaction of the previous statement exists. Besides,
400
all statements that manipulate the normal transaction
401
are prohibited in stored functions and triggers, therefore
402
no conflicting situation can occur in a sub-statement either.
403
The remaining rare cases when the server explicitly has
404
to commit the statement transaction prior to committing the normal
405
one cover error-handling scenarios (see for example
408
When committing a statement or a normal transaction, the server
409
either uses the two-phase commit protocol, or issues a commit
410
in each engine independently. The two-phase commit protocol
412
- all participating engines support two-phase commit (provide
413
handlerton::prepare PSEA API call) and
414
- transactions in at least two engines modify data (i.e. are
417
Note that the two phase commit is used for
418
statement transactions, even though they are not durable anyway.
419
This is done to ensure logical consistency of data in a multiple-
421
For example, imagine that some day MySQL supports unique
422
constraint checks deferred till the end of statement. In such
423
case a commit in one of the engines may yield ER_DUP_KEY,
424
and MySQL should be able to gracefully abort statement
425
transactions of other participants.
427
After the normal transaction has been committed,
428
session->transaction.all list is cleared.
430
When a connection is closed, the current normal transaction, if
433
Roles and responsibilities
434
--------------------------
436
The server has no way to know that an engine participates in
437
the statement and a transaction has been started
438
in it unless the engine says so. Thus, in order to be
439
a part of a transaction, the engine must "register" itself.
440
This is done by invoking trans_register_ha() server call.
441
Normally the engine registers itself whenever handler::external_lock()
442
is called. trans_register_ha() can be invoked many times: if
443
an engine is already registered, the call does nothing.
444
In case autocommit is not set, the engine must register itself
445
twice -- both in the statement list and in the normal transaction
447
In which list to register is a parameter of trans_register_ha().
449
Note, that although the registration interface in itself is
450
fairly clear, the current usage practice often leads to undesired
451
effects. E.g. since a call to trans_register_ha() in most engines
452
is embedded into implementation of handler::external_lock(), some
453
DDL statements start a transaction (at least from the server
454
point of view) even though they are not expected to. E.g.
455
CREATE TABLE does not start a transaction, since
456
handler::external_lock() is never called during CREATE TABLE. But
457
CREATE TABLE ... SELECT does, since handler::external_lock() is
458
called for the table that is being selected from. This has no
459
practical effects currently, but must be kept in mind
462
Once an engine is registered, the server will do the rest
465
During statement execution, whenever any of data-modifying
466
PSEA API methods is used, e.g. handler::write_row() or
467
handler::update_row(), the read-write flag is raised in the
468
statement transaction for the involved engine.
469
Currently all PSEA calls are "traced", and the data cannot be
470
changed in a way other than issuing a PSEA call. Important:
471
unless this invariant is preserved the server will not know that
472
a transaction in a given engine is read-write and will not
473
involve the two-phase commit protocol!
475
At the end of a statement, server call
476
ha_autocommit_or_rollback() is invoked. This call in turn
477
invokes handlerton::prepare() for every involved engine.
478
Prepare is followed by a call to handlerton::commit_one_phase()
479
If a one-phase commit will suffice, handlerton::prepare() is not
480
invoked and the server only calls handlerton::commit_one_phase().
481
At statement commit, the statement-related read-write engine
482
flag is propagated to the corresponding flag in the normal
483
transaction. When the commit is complete, the list of registered
486
Rollback is handled in a similar fashion.
488
Additional notes on DDL and the normal transaction.
489
---------------------------------------------------
491
DDLs and operations with non-transactional engines
492
do not "register" in session->transaction lists, and thus do not
493
modify the transaction state. Besides, each DDL in
494
MySQL is prefixed with an implicit normal transaction commit
495
(a call to end_active_trans()), and thus leaves nothing
497
However, as it has been pointed out with CREATE TABLE .. SELECT,
498
some DDL statements can start a *new* transaction.
500
Behaviour of the server in this case is currently badly
502
DDL statements use a form of "semantic" logging
503
to maintain atomicity: if CREATE TABLE .. SELECT failed,
504
the newly created table is deleted.
505
In addition, some DDL statements issue interim transaction
506
commits: e.g. ALTER TABLE issues a commit after data is copied
507
from the original table to the internal temporary table. Other
508
statements, e.g. CREATE TABLE ... SELECT do not always commit
510
And finally there is a group of DDL statements such as
511
RENAME/DROP TABLE that doesn't start a new transaction
514
This diversity makes it hard to say what will happen if
515
by chance a stored function is invoked during a DDL --
516
whether any modifications it makes will be committed or not
517
is not clear. Fortunately, SQL grammar of few DDLs allows
518
invocation of a stored function.
520
A consistent behaviour is perhaps to always commit the normal
521
transaction after all DDLs, just like the statement transaction
522
is always committed at the end of all statements.
526
Register a storage engine for a transaction.
528
Every storage engine MUST call this function when it starts
529
a transaction or a statement (that is it must be called both for the
530
"beginning of transaction" and "beginning of statement").
531
Only storage engines registered for the transaction/statement
532
will know when to commit/rollback it.
535
trans_register_ha is idempotent - storage engine may register many
536
times per transaction.
539
void trans_register_ha(Session *session, bool all, handlerton *ht_arg)
541
Session_TRANS *trans;
542
Ha_trx_info *ha_info;
546
trans= &session->transaction.all;
547
session->server_status|= SERVER_STATUS_IN_TRANS;
550
trans= &session->transaction.stmt;
552
ha_info= session->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
554
if (ha_info->is_started())
555
return; /* already registered, return */
557
ha_info->register_ha(trans, ht_arg);
559
trans->no_2pc|=(ht_arg->prepare==0);
560
if (session->transaction.xid_state.xid.is_null())
561
session->transaction.xid_state.xid.set(session->query_id);
570
1 error, transaction was rolled back
572
int ha_prepare(Session *session)
575
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
576
Ha_trx_info *ha_info= trans->ha_list;
579
for (; ha_info; ha_info= ha_info->next())
582
handlerton *ht= ha_info->ht();
583
status_var_increment(session->status_var.ha_prepare_count);
586
if ((err= ht->prepare(ht, session, all)))
588
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
589
ha_rollback_trans(session, all);
596
push_warning_printf(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
597
ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
598
ha_resolve_storage_engine_name(ht));
606
Check if we can skip the two-phase commit.
608
A helper function to evaluate if two-phase commit is mandatory.
609
As a side effect, propagates the read-only/read-write flags
610
of the statement transaction to its enclosing normal transaction.
612
@retval true we must run a two-phase commit. Returned
613
if we have at least two engines with read-write changes.
614
@retval false Don't need two-phase commit. Even if we have two
615
transactional engines, we can run two independent
616
commits if changes in one of the engines are read-only.
621
ha_check_and_coalesce_trx_read_only(Session *session, Ha_trx_info *ha_list,
624
/* The number of storage engines that have actual changes. */
625
unsigned rw_ha_count= 0;
626
Ha_trx_info *ha_info;
628
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
630
if (ha_info->is_trx_read_write())
635
Ha_trx_info *ha_info_all= &session->ha_data[ha_info->ht()->slot].ha_info[1];
636
assert(ha_info != ha_info_all);
638
Merge read-only/read-write information about statement
639
transaction to its enclosing normal transaction. Do this
640
only if in a real transaction -- that is, if we know
641
that ha_info_all is registered in session->transaction.all.
642
Since otherwise we only clutter the normal transaction flags.
644
if (ha_info_all->is_started()) /* false if autocommit. */
645
ha_info_all->coalesce_trx_with(ha_info);
647
else if (rw_ha_count > 1)
650
It is a normal transaction, so we don't need to merge read/write
651
information up, and the need for two-phase commit has been
652
already established. Break the loop prematurely.
657
return rw_ha_count > 1;
665
1 transaction was rolled back
667
2 error during commit, data may be inconsistent
670
Since we don't support nested statement transactions in 5.0,
671
we can't commit or rollback stmt transactions while we are inside
672
stored functions or triggers. So we simply do nothing now.
673
TODO: This should be fixed in later ( >= 5.1) releases.
675
int ha_commit_trans(Session *session, bool all)
677
int error= 0, cookie= 0;
679
'all' means that this is either an explicit commit issued by
680
user, or an implicit commit issued by a DDL.
682
Session_TRANS *trans= all ? &session->transaction.all : &session->transaction.stmt;
683
bool is_real_trans= all || session->transaction.all.ha_list == 0;
684
Ha_trx_info *ha_info= trans->ha_list;
687
We must not commit the normal transaction if a statement
688
transaction is pending. Otherwise statement transaction
689
flags will not get propagated to its normal transaction's
692
assert(session->transaction.stmt.ha_list == NULL ||
693
trans == &session->transaction.stmt);
699
if (is_real_trans && wait_if_global_read_lock(session, 0, 0))
701
ha_rollback_trans(session, all);
705
must_2pc= ha_check_and_coalesce_trx_read_only(session, ha_info, all);
707
if (!trans->no_2pc && must_2pc)
709
for (; ha_info && !error; ha_info= ha_info->next())
712
handlerton *ht= ha_info->ht();
714
Do not call two-phase commit if this particular
715
transaction is read-only. This allows for simpler
716
implementation in engines that are always read-only.
718
if (! ha_info->is_trx_read_write())
721
Sic: we know that prepare() is not NULL since otherwise
722
trans->no_2pc would have been set.
724
if ((err= ht->prepare(ht, session, all)))
726
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
729
status_var_increment(session->status_var.ha_prepare_count);
733
ha_rollback_trans(session, all);
738
error=ha_commit_one_phase(session, all) ? (cookie ? 2 : 1) : 0;
741
start_waiting_global_read_lock(session);
748
This function does not care about global read lock. A caller should.
750
int ha_commit_one_phase(Session *session, bool all)
753
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
754
bool is_real_trans=all || session->transaction.all.ha_list == 0;
755
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
758
for (; ha_info; ha_info= ha_info_next)
761
handlerton *ht= ha_info->ht();
762
if ((err= ht->commit(ht, session, all)))
764
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
767
status_var_increment(session->status_var.ha_commit_count);
768
ha_info_next= ha_info->next();
769
ha_info->reset(); /* keep it conveniently zero-filled */
774
session->transaction.xid_state.xid.null();
777
session->variables.tx_isolation=session->session_tx_isolation;
778
session->transaction.cleanup();
785
int ha_rollback_trans(Session *session, bool all)
788
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
789
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
790
bool is_real_trans=all || session->transaction.all.ha_list == 0;
793
We must not rollback the normal transaction if a statement
794
transaction is pending.
796
assert(session->transaction.stmt.ha_list == NULL ||
797
trans == &session->transaction.stmt);
801
for (; ha_info; ha_info= ha_info_next)
804
handlerton *ht= ha_info->ht();
805
if ((err= ht->rollback(ht, session, all)))
807
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
810
status_var_increment(session->status_var.ha_rollback_count);
811
ha_info_next= ha_info->next();
812
ha_info->reset(); /* keep it conveniently zero-filled */
817
session->transaction.xid_state.xid.null();
820
session->variables.tx_isolation=session->session_tx_isolation;
821
session->transaction.cleanup();
825
session->transaction_rollback_request= false;
828
If a non-transactional table was updated, warn; don't warn if this is a
829
slave thread (because when a slave thread executes a ROLLBACK, it has
830
been read from the binary log, so it's 100% sure and normal to produce
831
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
832
slave SQL thread, it would not stop the thread but just be printed in
833
the error log; but we don't want users to wonder why they have this
834
message in the error log, so we don't send it.
836
if (is_real_trans && session->transaction.all.modified_non_trans_table && session->killed != Session::KILL_CONNECTION)
837
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
838
ER_WARNING_NOT_COMPLETE_ROLLBACK,
839
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
844
This is used to commit or rollback a single statement depending on
848
Note that if the autocommit is on, then the following call inside
849
InnoDB will commit or rollback the whole transaction (= the statement). The
850
autocommit mechanism built into InnoDB is based on counting locks, but if
851
the user has used LOCK TABLES then that mechanism does not know to do the
854
int ha_autocommit_or_rollback(Session *session, int error)
856
if (session->transaction.stmt.ha_list)
860
if (ha_commit_trans(session, 0))
865
(void) ha_rollback_trans(session, 0);
866
if (session->transaction_rollback_request)
867
(void) ha_rollback(session);
870
session->variables.tx_isolation=session->session_tx_isolation;
881
static bool xacommit_handlerton(Session *,
885
handlerton *hton= plugin_data(plugin, handlerton *);
886
if (hton->state == SHOW_OPTION_YES && hton->recover)
888
hton->commit_by_xid(hton, ((struct xahton_st *)arg)->xid);
889
((struct xahton_st *)arg)->result= 0;
894
static bool xarollback_handlerton(Session *,
898
handlerton *hton= plugin_data(plugin, handlerton *);
899
if (hton->state == SHOW_OPTION_YES && hton->recover)
901
hton->rollback_by_xid(hton, ((struct xahton_st *)arg)->xid);
902
((struct xahton_st *)arg)->result= 0;
908
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
910
struct xahton_st xaop;
914
plugin_foreach(NULL, commit ? xacommit_handlerton : xarollback_handlerton,
915
DRIZZLE_STORAGE_ENGINE_PLUGIN, &xaop);
921
recover() step of xa.
924
there are three modes of operation:
925
- automatic recover after a crash
926
in this case commit_list != 0, tc_heuristic_recover==0
927
all xids from commit_list are committed, others are rolled back
928
- manual (heuristic) recover
929
in this case commit_list==0, tc_heuristic_recover != 0
930
DBA has explicitly specified that all prepared transactions should
931
be committed (or rolled back).
932
- no recovery (MySQL did not detect a crash)
933
in this case commit_list==0, tc_heuristic_recover == 0
934
there should be no prepared transactions in this case.
938
int len, found_foreign_xids, found_my_xids;
944
static bool xarecover_handlerton(Session *,
948
handlerton *hton= plugin_data(plugin, handlerton *);
949
struct xarecover_st *info= (struct xarecover_st *) arg;
952
if (hton->state == SHOW_OPTION_YES && hton->recover)
954
while ((got= hton->recover(hton, info->list, info->len)) > 0 )
956
errmsg_printf(ERRMSG_LVL_INFO, _("Found %d prepared transaction(s) in %s"),
957
got, ha_resolve_storage_engine_name(hton));
958
for (int i=0; i < got; i ++)
960
my_xid x=info->list[i].get_my_xid();
961
if (!x) // not "mine" - that is generated by external TM
963
xid_cache_insert(info->list+i, XA_PREPARED);
964
info->found_foreign_xids++;
969
info->found_my_xids++;
973
if (info->commit_list ?
974
hash_search(info->commit_list, (unsigned char *)&x, sizeof(x)) != 0 :
975
tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT)
977
hton->commit_by_xid(hton, info->list+i);
981
hton->rollback_by_xid(hton, info->list+i);
991
int ha_recover(HASH *commit_list)
993
struct xarecover_st info;
994
info.found_foreign_xids= info.found_my_xids= 0;
995
info.commit_list= commit_list;
996
info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
999
/* commit_list and tc_heuristic_recover cannot be set both */
1000
assert(info.commit_list==0 || tc_heuristic_recover==0);
1001
/* if either is set, total_ha_2pc must be set too */
1002
assert(info.dry_run || total_ha_2pc>(uint32_t)opt_bin_log);
1004
if (total_ha_2pc <= (uint32_t)opt_bin_log)
1007
if (info.commit_list)
1008
errmsg_printf(ERRMSG_LVL_INFO, _("Starting crash recovery..."));
1011
#ifndef WILL_BE_DELETED_LATER
1014
for now, only InnoDB supports 2pc. It means we can always safely
1015
rollback all pending transactions, without risking inconsistent data
1018
assert(total_ha_2pc == (uint32_t) opt_bin_log+1); // only InnoDB and binlog
1019
tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
1024
for (info.len= MAX_XID_LIST_SIZE ;
1025
info.list==0 && info.len > MIN_XID_LIST_SIZE; info.len/=2)
1027
info.list=(XID *)malloc(info.len*sizeof(XID));
1031
errmsg_printf(ERRMSG_LVL_ERROR, ER(ER_OUTOFMEMORY), info.len*sizeof(XID));
1035
plugin_foreach(NULL, xarecover_handlerton,
1036
DRIZZLE_STORAGE_ENGINE_PLUGIN, &info);
1038
free((unsigned char*)info.list);
1039
if (info.found_foreign_xids)
1040
errmsg_printf(ERRMSG_LVL_WARN, _("Found %d prepared XA transactions"),
1041
info.found_foreign_xids);
1042
if (info.dry_run && info.found_my_xids)
1044
errmsg_printf(ERRMSG_LVL_ERROR,
1045
_("Found %d prepared transactions! It means that drizzled "
1046
"was not shut down properly last time and critical "
1047
"recovery information (last binlog or %s file) was "
1048
"manually deleted after a crash. You have to start "
1049
"drizzled with the --tc-heuristic-recover switch to "
1050
"commit or rollback pending transactions."),
1051
info.found_my_xids, opt_tc_log_file);
1054
if (info.commit_list)
1055
errmsg_printf(ERRMSG_LVL_INFO, _("Crash recovery finished."));
1060
return the list of XID's to a client, the same way SHOW commands do.
1063
I didn't find in XA specs that an RM cannot return the same XID twice,
1064
so mysql_xa_recover does not filter XID's to ensure uniqueness.
1065
It can be easily fixed later, if necessary.
1067
bool mysql_xa_recover(Session *session)
1069
List<Item> field_list;
1070
Protocol *protocol= session->protocol;
1074
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1075
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1076
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1077
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1079
if (protocol->send_fields(&field_list,
1080
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1083
pthread_mutex_lock(&LOCK_xid_cache);
1084
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1086
if (xs->xa_state==XA_PREPARED)
1088
protocol->prepare_for_resend();
1089
protocol->store_int64_t((int64_t)xs->xid.formatID, false);
1090
protocol->store_int64_t((int64_t)xs->xid.gtrid_length, false);
1091
protocol->store_int64_t((int64_t)xs->xid.bqual_length, false);
1092
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1094
if (protocol->write())
1096
pthread_mutex_unlock(&LOCK_xid_cache);
1102
pthread_mutex_unlock(&LOCK_xid_cache);
1109
This function should be called when MySQL sends rows of a SELECT result set
1110
or the EOF mark to the client. It releases a possible adaptive hash index
1111
S-latch held by session in InnoDB and also releases a possible InnoDB query
1112
FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a session to
1113
keep them over several calls of the InnoDB handler interface when a join
1114
is executed. But when we let control pass to the client they have
1115
to be released because if the application program uses mysql_use_result(),
1116
it may deadlock on the S-latch if the application on another connection
1117
performs another SQL query. In MySQL-4.1 this is even more important because
1118
there a connection can have several SELECT queries open at the same time.
1120
@param session the thread handle of the current connection
1125
static bool release_temporary_latches(Session *session, plugin_ref plugin,
1128
handlerton *hton= plugin_data(plugin, handlerton *);
1130
if (hton->state == SHOW_OPTION_YES && hton->release_temporary_latches)
1131
hton->release_temporary_latches(hton, session);
1137
int ha_release_temporary_latches(Session *session)
1139
plugin_foreach(session, release_temporary_latches, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1145
int ha_rollback_to_savepoint(Session *session, SAVEPOINT *sv)
1148
Session_TRANS *trans= &session->transaction.all;
1149
Ha_trx_info *ha_info, *ha_info_next;
1153
rolling back to savepoint in all storage engines that were part of the
1154
transaction when the savepoint was set
1156
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
1159
handlerton *ht= ha_info->ht();
1161
assert(ht->savepoint_set != 0);
1162
if ((err= ht->savepoint_rollback(ht, session,
1163
(unsigned char *)(sv+1)+ht->savepoint_offset)))
1165
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1168
status_var_increment(session->status_var.ha_savepoint_rollback_count);
1169
trans->no_2pc|= ht->prepare == 0;
1172
rolling back the transaction in all storage engines that were not part of
1173
the transaction when the savepoint was set
1175
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
1176
ha_info= ha_info_next)
1179
handlerton *ht= ha_info->ht();
1180
if ((err= ht->rollback(ht, session, !(0))))
1182
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1185
status_var_increment(session->status_var.ha_rollback_count);
1186
ha_info_next= ha_info->next();
1187
ha_info->reset(); /* keep it conveniently zero-filled */
1189
trans->ha_list= sv->ha_list;
1195
according to the sql standard (ISO/IEC 9075-2:2003)
1196
section "4.33.4 SQL-statements and transaction states",
1197
SAVEPOINT is *not* transaction-initiating SQL-statement
1199
int ha_savepoint(Session *session, SAVEPOINT *sv)
1202
Session_TRANS *trans= &session->transaction.all;
1203
Ha_trx_info *ha_info= trans->ha_list;
1204
for (; ha_info; ha_info= ha_info->next())
1207
handlerton *ht= ha_info->ht();
1209
if (! ht->savepoint_set)
1211
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1215
if ((err= ht->savepoint_set(ht, session, (unsigned char *)(sv+1)+ht->savepoint_offset)))
1217
my_error(ER_GET_ERRNO, MYF(0), err);
1220
status_var_increment(session->status_var.ha_savepoint_count);
1223
Remember the list of registered storage engines. All new
1224
engines are prepended to the beginning of the list.
1226
sv->ha_list= trans->ha_list;
1230
int ha_release_savepoint(Session *session, SAVEPOINT *sv)
1233
Ha_trx_info *ha_info= sv->ha_list;
1235
for (; ha_info; ha_info= ha_info->next())
1238
handlerton *ht= ha_info->ht();
1239
/* Savepoint life time is enclosed into transaction life time. */
1241
if (!ht->savepoint_release)
1243
if ((err= ht->savepoint_release(ht, session,
1244
(unsigned char *)(sv+1) + ht->savepoint_offset)))
1246
my_error(ER_GET_ERRNO, MYF(0), err);
1254
static bool snapshot_handlerton(Session *session, plugin_ref plugin, void *arg)
1256
handlerton *hton= plugin_data(plugin, handlerton *);
1257
if (hton->state == SHOW_OPTION_YES &&
1258
hton->start_consistent_snapshot)
1260
hton->start_consistent_snapshot(hton, session);
1261
*((bool *)arg)= false;
1266
int ha_start_consistent_snapshot(Session *session)
1270
plugin_foreach(session, snapshot_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, &warn);
1273
Same idea as when one wants to CREATE TABLE in one engine which does not
1277
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
1278
"This Drizzle server does not support any "
1279
"consistent-read capable storage engine");
1284
static bool flush_handlerton(Session *,
1288
handlerton *hton= plugin_data(plugin, handlerton *);
1289
if (hton->state == SHOW_OPTION_YES && hton->flush_logs &&
1290
hton->flush_logs(hton))
1296
bool ha_flush_logs(handlerton *db_type)
1298
if (db_type == NULL)
1300
if (plugin_foreach(NULL, flush_handlerton,
1301
DRIZZLE_STORAGE_ENGINE_PLUGIN, 0))
1306
if (db_type->state != SHOW_OPTION_YES ||
1307
(db_type->flush_logs && db_type->flush_logs(db_type)))
1313
static const char *check_lowercase_names(handler *file, const char *path,
1316
if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1319
/* Ensure that table handler get path in lower case */
1320
if (tmp_path != path)
1321
strcpy(tmp_path, path);
1324
we only should turn into lowercase database/table part
1325
so start the process after homedirectory
1327
my_casedn_str(files_charset_info, tmp_path + drizzle_data_home_len);
1333
An interceptor to hijack the text of the error message without
1334
setting an error in the thread. We need the text to present it
1335
in the form of a warning to the user.
1338
struct Ha_delete_table_error_handler: public Internal_error_handler
1341
virtual bool handle_error(uint32_t sql_errno,
1342
const char *message,
1343
DRIZZLE_ERROR::enum_warning_level level,
1345
char buff[DRIZZLE_ERRMSG_SIZE];
1350
Ha_delete_table_error_handler::
1351
handle_error(uint32_t ,
1352
const char *message,
1353
DRIZZLE_ERROR::enum_warning_level ,
1356
/* Grab the error message */
1357
strncpy(buff, message, sizeof(buff)-1);
1362
struct handlerton_delete_table_args {
1369
static bool deletetable_handlerton(Session *,
1373
struct handlerton_delete_table_args *dtargs= (struct handlerton_delete_table_args *) args;
1375
Session *session= dtargs->session;
1376
const char *path= dtargs->path;
1379
char tmp_path[FN_REFLEN];
1381
if(dtargs->error!=ENOENT) /* already deleted table */
1384
handlerton *table_type= plugin_data(plugin, handlerton *);
1389
if(!(table_type->state == SHOW_OPTION_YES && table_type->create))
1392
if ((file= table_type->create(table_type, NULL, session->mem_root)))
1397
path= check_lowercase_names(file, path, tmp_path);
1398
int error= file->ha_delete_table(path);
1402
dtargs->error= error;
1404
delete dtargs->file;
1413
This should return ENOENT if the file doesn't exist.
1414
The .frm file will be deleted only if we return 0 or ENOENT
1416
int ha_delete_table(Session *session, const char *path,
1417
const char *db, const char *alias, bool generate_warning)
1419
TABLE_SHARE dummy_share;
1422
struct handlerton_delete_table_args dtargs;
1423
dtargs.error= ENOENT;
1424
dtargs.session= session;
1428
plugin_foreach(NULL, deletetable_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1431
memset(&dummy_table, 0, sizeof(dummy_table));
1432
memset(&dummy_share, 0, sizeof(dummy_share));
1433
dummy_table.s= &dummy_share;
1435
if (dtargs.error && generate_warning)
1438
Because file->print_error() use my_error() to generate the error message
1439
we use an internal error handler to intercept it and store the text
1440
in a temporary buffer. Later the message will be presented to user
1443
Ha_delete_table_error_handler ha_delete_table_error_handler;
1445
/* Fill up structures that print_error may need */
1446
dummy_share.path.str= (char*) path;
1447
dummy_share.path.length= strlen(path);
1448
dummy_share.db.str= (char*) db;
1449
dummy_share.db.length= strlen(db);
1450
dummy_share.table_name.str= (char*) alias;
1451
dummy_share.table_name.length= strlen(alias);
1452
dummy_table.alias= alias;
1456
handler *file= dtargs.file;
1457
file->change_table_ptr(&dummy_table, &dummy_share);
1459
session->push_internal_handler(&ha_delete_table_error_handler);
1460
file->print_error(dtargs.error, 0);
1462
session->pop_internal_handler();
1465
dtargs.error= -1; /* General form of fail. maybe bad FRM */
1468
XXX: should we convert *all* errors to warnings here?
1469
What if the error is fatal?
1471
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_ERROR, dtargs.error,
1472
ha_delete_table_error_handler.buff);
1478
return dtargs.error;
54
1481
/****************************************************************************
55
** General Cursor functions
1482
** General handler functions
56
1483
****************************************************************************/
57
Cursor::Cursor(plugin::StorageEngine &engine_arg,
61
estimation_rows_to_insert(0),
63
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
64
ref_length(sizeof(internal::my_off_t)),
67
next_insert_id(0), insert_id_for_cur_row(0)
72
assert(locked == false);
73
/* TODO: assert(inited == NONE); */
78
* @note this is only used in
79
* optimizer::QuickRangeSelect::init_ror_merged_scan(bool reuse_handler) as
80
* of the writing of this comment. -Brian
82
Cursor *Cursor::clone(memory::Root *mem_root)
84
Cursor *new_handler= getTable()->getMutableShare()->db_type()->getCursor(*getTable());
1484
handler *handler::clone(MEM_ROOT *mem_root)
1486
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
87
Allocate Cursor->ref here because otherwise ha_open will allocate it
1488
Allocate handler->ref here because otherwise ha_open will allocate it
88
1489
on this->table->mem_root and we will not be able to reclaim that memory
89
when the clone Cursor object is destroyed.
1490
when the clone handler object is destroyed.
91
if (!(new_handler->ref= (unsigned char*) mem_root->alloc_root(ALIGN_SIZE(ref_length)*2)))
1492
if (!(new_handler->ref= (unsigned char*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
94
identifier::Table identifier(getTable()->getShare()->getSchemaName(),
95
getTable()->getShare()->getTableName(),
96
getTable()->getShare()->getType());
98
if (new_handler && !new_handler->ha_open(identifier,
99
getTable()->getDBStat(),
1494
if (new_handler && !new_handler->ha_open(table,
1495
table->s->normalized_path.str,
100
1497
HA_OPEN_IGNORE_IF_LOCKED))
101
1498
return new_handler;
107
given a buffer with a key value, and a map of keyparts
108
that are present in this value, returns the length of the value
110
uint32_t Cursor::calculate_key_len(uint32_t key_position, key_part_map keypart_map_arg)
112
/* works only with key prefixes */
113
assert(((keypart_map_arg + 1) & keypart_map_arg) == 0);
115
const KeyPartInfo *key_part_found= getTable()->getShare()->getKeyInfo(key_position).key_part;
116
const KeyPartInfo *end_key_part_found= key_part_found + getTable()->getShare()->getKeyInfo(key_position).key_parts;
119
while (key_part_found < end_key_part_found && keypart_map_arg)
121
length+= key_part_found->store_length;
122
keypart_map_arg >>= 1;
128
int Cursor::startIndexScan(uint32_t idx, bool sorted)
1502
int handler::ha_index_init(uint32_t idx, bool sorted)
131
assert(inited == NONE);
132
if (!(result= doStartIndexScan(idx, sorted)))
1505
assert(inited==NONE);
1506
if (!(result= index_init(idx, sorted)))
134
1508
end_range= NULL;
138
int Cursor::endIndexScan()
1512
int handler::ha_index_end()
140
1514
assert(inited==INDEX);
142
1516
end_range= NULL;
143
return(doEndIndexScan());
1517
return(index_end());
146
int Cursor::startTableScan(bool scan)
1520
int handler::ha_rnd_init(bool scan)
149
1523
assert(inited==NONE || (inited==RND && scan));
150
inited= (result= doStartTableScan(scan)) ? NONE: RND;
1524
inited= (result= rnd_init(scan)) ? NONE: RND;
155
int Cursor::endTableScan()
1528
int handler::ha_rnd_end()
157
1530
assert(inited==RND);
159
return(doEndTableScan());
162
int Cursor::ha_index_or_rnd_end()
164
return inited == INDEX ? endIndexScan() : inited == RND ? endTableScan() : 0;
167
void Cursor::ha_start_bulk_insert(ha_rows rows)
1535
int handler::ha_index_or_rnd_end()
1537
return inited == INDEX ? ha_index_end() : inited == RND ? ha_rnd_end() : 0;
1540
handler::Table_flags handler::ha_table_flags() const
1542
return cached_table_flags;
1545
void handler::ha_start_bulk_insert(ha_rows rows)
169
1547
estimation_rows_to_insert= rows;
170
1548
start_bulk_insert(rows);
173
int Cursor::ha_end_bulk_insert()
1551
int handler::ha_end_bulk_insert()
175
1553
estimation_rows_to_insert= 0;
176
1554
return end_bulk_insert();
179
const key_map *Cursor::keys_to_use_for_scanning()
1557
void handler::change_table_ptr(Table *table_arg, TABLE_SHARE *share)
1563
const key_map *handler::keys_to_use_for_scanning()
181
1565
return &key_map_empty;
184
bool Cursor::has_transactions()
186
return (getTable()->getShare()->db_type()->check_flag(HTON_BIT_DOES_TRANSACTIONS));
189
void Cursor::ha_statistic_increment(uint64_t system_status_var::*offset) const
191
(getTable()->in_use->status_var.*offset)++;
194
void **Cursor::ha_data(Session *session) const
196
return session->getEngineData(getEngine());
199
bool Cursor::is_fatal_error(int error, uint32_t flags)
1568
bool handler::has_transactions()
1570
return (ha_table_flags() & HA_NO_TRANSACTIONS) == 0;
1573
void handler::ha_statistic_increment(ulong SSV::*offset) const
1575
status_var_increment(table->in_use->status_var.*offset);
1578
void **handler::ha_data(Session *session) const
1580
return session_ha_data(session, ht);
1583
Session *handler::ha_session(void) const
1585
assert(!table || !table->in_use || table->in_use == current_session);
1586
return (table && table->in_use) ? table->in_use : current_session;
1590
bool handler::is_fatal_error(int error, uint32_t flags)
202
1593
((flags & HA_CHECK_DUP_KEY) &&
1149
3352
while ((result == HA_ERR_END_OF_FILE) && !range_res);
1151
3354
*range_info= mrr_cur_range.ptr;
3359
/* **************************************************************************
3360
* DS-MRR implementation
3361
***************************************************************************/
3364
DS-MRR: Initialize and start MRR scan
3366
Initialize and start the MRR scan. Depending on the mode parameter, this
3367
may use default or DS-MRR implementation.
3369
@param h Table handler to be used
3370
@param key Index to be used
3371
@param seq_funcs Interval sequence enumeration functions
3372
@param seq_init_param Interval sequence enumeration parameter
3373
@param n_ranges Number of ranges in the sequence.
3374
@param mode HA_MRR_* modes to use
3375
@param buf INOUT Buffer to use
3377
@retval 0 Ok, Scan started.
3381
int DsMrr_impl::dsmrr_init(handler *h_in, KEY *key,
3382
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3383
uint32_t n_ranges, uint32_t mode, HANDLER_BUFFER *buf)
3387
Item *pushed_cond= NULL;
3389
keyno= h_in->active_index;
3391
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3393
use_default_impl= true;
3394
return(h_in->handler::multi_range_read_init(seq_funcs, seq_init_param,
3395
n_ranges, mode, buf));
3397
rowids_buf= buf->buffer;
3398
//psergey-todo: don't add key_length as it is not needed anymore
3399
rowids_buf += key->key_length + h_in->ref_length;
3401
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3402
rowids_buf_end= buf->buffer_end;
3404
elem_size= h_in->ref_length + (int)is_mrr_assoc * sizeof(void*);
3405
rowids_buf_last= rowids_buf +
3406
((rowids_buf_end - rowids_buf)/ elem_size)*
3408
rowids_buf_end= rowids_buf_last;
3410
/* Create a separate handler object to do rndpos() calls. */
3411
Session *session= current_session;
3412
if (!(new_h2= h_in->clone(session->mem_root)) ||
3413
new_h2->ha_external_lock(session, F_RDLCK))
3419
if (keyno == h_in->pushed_idx_cond_keyno)
3420
pushed_cond= h_in->pushed_idx_cond;
3421
if (h_in->ha_index_end())
3428
table->prepare_for_position();
3429
new_h2->extra(HA_EXTRA_KEYREAD);
3431
if (h2->ha_index_init(keyno, false) ||
3432
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3435
use_default_impl= false;
3438
h2->idx_cond_push(keyno, pushed_cond);
3439
if (dsmrr_fill_buffer(new_h2))
3443
If the above call has scanned through all intervals in *seq, then
3444
adjust *buf to indicate that the remaining buffer space will not be used.
3447
buf->end_of_used_area= rowids_buf_last;
3449
if (h_in->ha_rnd_init(false))
3454
h2->ha_index_or_rnd_end();
3455
h2->ha_external_lock(session, F_UNLCK);
3462
void DsMrr_impl::dsmrr_close()
3466
h2->ha_external_lock(current_session, F_UNLCK);
3471
use_default_impl= true;
3476
static int rowid_cmp(void *h, unsigned char *a, unsigned char *b)
3478
return ((handler*)h)->cmp_ref(a, b);
3483
DS-MRR: Fill the buffer with rowids and sort it by rowid
3485
{This is an internal function of DiskSweep MRR implementation}
3486
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
3487
buffer. When the buffer is full or scan is completed, sort the buffer by
3490
The function assumes that rowids buffer is empty when it is invoked.
3492
@param h Table handler
3494
@retval 0 OK, the next portion of rowids is in the buffer,
3499
int DsMrr_impl::dsmrr_fill_buffer(handler *)
3504
rowids_buf_cur= rowids_buf;
3505
while ((rowids_buf_cur < rowids_buf_end) &&
3506
!(res= h2->handler::multi_range_read_next(&range_info)))
3508
/* Put rowid, or {rowid, range_id} pair into the buffer */
3509
h2->position(table->record[0]);
3510
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
3511
rowids_buf_cur += h->ref_length;
3515
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
3516
rowids_buf_cur += sizeof(void*);
3520
if (res && res != HA_ERR_END_OF_FILE)
3522
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
3524
/* Sort the buffer contents by rowid */
3525
uint32_t elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3526
uint32_t n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
3528
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
3530
rowids_buf_last= rowids_buf_cur;
3531
rowids_buf_cur= rowids_buf;
3537
DS-MRR implementation: multi_range_read_next() function
3540
int DsMrr_impl::dsmrr_next(handler *h_in, char **range_info)
3544
if (use_default_impl)
3545
return h_in->handler::multi_range_read_next(range_info);
3547
if (rowids_buf_cur == rowids_buf_last)
3551
res= HA_ERR_END_OF_FILE;
3554
res= dsmrr_fill_buffer(h);
3559
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
3560
if (rowids_buf_cur == rowids_buf_last)
3562
res= HA_ERR_END_OF_FILE;
3566
res= h_in->rnd_pos(table->record[0], rowids_buf_cur);
3567
rowids_buf_cur += h_in->ref_length;
3570
memcpy(range_info, rowids_buf_cur, sizeof(void*));
3571
rowids_buf_cur += sizeof(void*);
3582
DS-MRR implementation: multi_range_read_info() function
3584
int DsMrr_impl::dsmrr_info(uint32_t keyno, uint32_t n_ranges, uint32_t rows, uint32_t *bufsz,
3585
uint32_t *flags, COST_VECT *cost)
3588
uint32_t def_flags= *flags;
3589
uint32_t def_bufsz= *bufsz;
3591
/* Get cost/flags/mem_usage of default MRR implementation */
3592
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
3596
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3597
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
3599
/* Default implementation is chosen */
3608
DS-MRR Implementation: multi_range_read_info_const() function
3611
ha_rows DsMrr_impl::dsmrr_info_const(uint32_t keyno, RANGE_SEQ_IF *seq,
3612
void *seq_init_param, uint32_t n_ranges,
3613
uint32_t *bufsz, uint32_t *flags, COST_VECT *cost)
3616
uint32_t def_flags= *flags;
3617
uint32_t def_bufsz= *bufsz;
3618
/* Get cost/flags/mem_usage of default MRR implementation */
3619
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
3620
n_ranges, &def_bufsz,
3622
if (rows == HA_POS_ERROR)
3624
/* Default implementation can't perform MRR scan => we can't either */
3629
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
3630
use the default MRR implementation (we need it for UPDATE/DELETE).
3631
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
3633
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3634
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
3641
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
3648
Check if key has partially-covered columns
3650
We can't use DS-MRR to perform range scans when the ranges are over
3651
partially-covered keys, because we'll not have full key part values
3652
(we'll have their prefixes from the index) and will not be able to check
3653
if we've reached the end of the range.
3655
@param keyno Key to check
3658
Allow use of DS-MRR in cases where the index has partially-covered
3659
components but they are not used for scanning.
3665
bool DsMrr_impl::key_uses_partial_cols(uint32_t keyno)
3667
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
3668
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
3669
for (; kp != kp_end; kp++)
3671
if (!kp->field->part_of_key.is_set(keyno))
3679
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
3681
Make the choice between using Default MRR implementation and DS-MRR.
3682
This function contains common functionality factored out of dsmrr_info()
3683
and dsmrr_info_const(). The function assumes that the default MRR
3684
implementation's applicability requirements are satisfied.
3686
@param keyno Index number
3687
@param rows E(full rows to be retrieved)
3688
@param flags IN MRR flags provided by the MRR user
3689
OUT If DS-MRR is chosen, flags of DS-MRR implementation
3690
else the value is not modified
3691
@param bufsz IN If DS-MRR is chosen, buffer use of DS-MRR implementation
3692
else the value is not modified
3693
@param cost IN Cost of default MRR implementation
3694
OUT If DS-MRR is chosen, cost of DS-MRR scan
3695
else the value is not modified
3697
@retval true Default MRR implementation should be used
3698
@retval false DS-MRR implementation should be used
3701
bool DsMrr_impl::choose_mrr_impl(uint32_t keyno, ha_rows rows, uint32_t *flags,
3702
uint32_t *bufsz, COST_VECT *cost)
3704
COST_VECT dsmrr_cost;
3706
Session *session= current_session;
3707
if ((session->variables.optimizer_use_mrr == 2) ||
3708
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
3709
(keyno == table->s->primary_key &&
3710
h->primary_key_is_clustered()) ||
3711
key_uses_partial_cols(keyno))
3713
/* Use the default implementation */
3714
*flags |= HA_MRR_USE_DEFAULT_IMPL;
3718
uint32_t add_len= table->key_info[keyno].key_length + h->ref_length;
3720
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
3726
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
3727
DS-MRR and Default implementations cost. This allows one to force use of
3728
DS-MRR whenever it is applicable without affecting other cost-based
3731
if ((force_dsmrr= (session->variables.optimizer_use_mrr == 1)) &&
3732
dsmrr_cost.total_cost() > cost->total_cost())
3735
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
3737
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
3738
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
3744
/* Use the default MRR implementation */
3751
static void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost);
3755
Get cost of DS-MRR scan
3757
@param keynr Index to be used
3758
@param rows E(Number of rows to be scanned)
3759
@param flags Scan parameters (HA_MRR_* flags)
3760
@param buffer_size INOUT Buffer size
3761
@param cost OUT The cost
3764
@retval true Error, DS-MRR cannot be used (the buffer is too small
3768
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint32_t keynr, ha_rows rows, uint32_t flags,
3769
uint32_t *buffer_size, COST_VECT *cost)
3771
uint32_t max_buff_entries, elem_size;
3772
ha_rows rows_in_full_step, rows_in_last_step;
3773
uint32_t n_full_steps;
3774
double index_read_cost;
3776
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
3777
max_buff_entries = *buffer_size / elem_size;
3779
if (!max_buff_entries)
3780
return true; /* Buffer has not enough space for even 1 rowid */
3782
/* Number of iterations we'll make with full buffer */
3783
n_full_steps= (uint32_t)floor(rows2double(rows) / max_buff_entries);
3786
Get numbers of rows we'll be processing in
3787
- non-last sweep, with full buffer
3788
- last iteration, with non-full buffer
3790
rows_in_full_step= max_buff_entries;
3791
rows_in_last_step= rows % max_buff_entries;
3793
/* Adjust buffer size if we expect to use only part of the buffer */
3796
get_sort_and_sweep_cost(table, rows, cost);
3797
cost->multiply(n_full_steps);
3802
*buffer_size= cmax((ulong)*buffer_size,
3803
(size_t)(1.2*rows_in_last_step) * elem_size +
3804
h->ref_length + table->key_info[keynr].key_length);
3807
COST_VECT last_step_cost;
3808
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
3809
cost->add(&last_step_cost);
3811
if (n_full_steps != 0)
3812
cost->mem_cost= *buffer_size;
3814
cost->mem_cost= (double)rows_in_last_step * elem_size;
3816
/* Total cost of all index accesses */
3817
index_read_cost= h->index_only_read_time(keynr, (double)rows);
3818
cost->add_io(index_read_cost, 1 /* Random seeks */);
3824
Get cost of one sort-and-sweep step
3827
get_sort_and_sweep_cost()
3828
table Table being accessed
3829
nrows Number of rows to be sorted and retrieved
3833
Get cost of these operations:
3834
- sort an array of #nrows ROWIDs using qsort
3835
- read #nrows records from table in a sweep.
3839
void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost)
3843
get_sweep_read_cost(table, nrows, false, cost);
3844
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
3845
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
3848
cost->cpu_cost += cmp_op * log2(cmp_op);
3856
Get cost of reading nrows table records in a "disk sweep"
3858
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made
3859
for an ordered sequence of rowids.
3861
We assume hard disk IO. The read is performed as follows:
3863
1. The disk head is moved to the needed cylinder
3864
2. The controller waits for the plate to rotate
3865
3. The data is transferred
3867
Time to do #3 is insignificant compared to #2+#1.
3869
Time to move the disk head is proportional to head travel distance.
3871
Time to wait for the plate to rotate depends on whether the disk head
3874
If disk head wasn't moved, the wait time is proportional to distance
3875
between the previous block and the block we're reading.
3877
If the head was moved, we don't know how much we'll need to wait for the
3878
plate to rotate. We assume the wait time to be a variate with a mean of
3879
0.5 of full rotation time.
3881
Our cost units are "random disk seeks". The cost of random disk seek is
3882
actually not a constant, it depends on the range of cylinders we're going
3883
to access. We make it constant by introducing a fuzzy concept of "typical
3884
datafile length" (it's fuzzy as it's hard to tell whether it should
3885
include index file, temp.tables etc). Then random seek cost is:
3887
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
3889
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
3891
@param table Table to be accessed
3892
@param nrows Number of rows to retrieve
3893
@param interrupted true <=> Assume that the disk sweep will be
3894
interrupted by other disk IO. false - otherwise.
3895
@param cost OUT The cost.
3898
void get_sweep_read_cost(Table *table, ha_rows nrows, bool interrupted,
3902
if (table->file->primary_key_is_clustered())
3904
cost->io_count= table->file->read_time(table->s->primary_key,
3905
(uint32_t) nrows, nrows);
3910
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
3912
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
3913
if (busy_blocks < 1.0)
3916
cost->io_count= busy_blocks;
3920
/* Assume reading is done in one 'sweep' */
3921
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
3922
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);