23
23
Handler-calling-functions
30
#include "drizzled/error.h"
31
#include "drizzled/field/epoch.h"
32
#include "drizzled/gettext.h"
33
#include "drizzled/internal/my_sys.h"
34
#include "drizzled/item/empty_string.h"
35
#include "drizzled/item/int.h"
36
#include "drizzled/lock.h"
37
#include "drizzled/message/table.h"
38
#include "drizzled/my_hash.h"
39
#include "drizzled/optimizer/cost_vector.h"
40
#include "drizzled/plugin/client.h"
41
#include "drizzled/plugin/event_observer.h"
42
#include "drizzled/plugin/storage_engine.h"
43
#include "drizzled/probes.h"
44
#include "drizzled/session.h"
45
#include "drizzled/sql_base.h"
46
#include "drizzled/sql_parse.h"
47
#include "drizzled/transaction_services.h"
26
#include <drizzled/server_includes.h>
27
#include <libdrizzleclient/libdrizzle.h>
28
#include <mysys/hash.h>
29
#include <drizzled/error.h>
30
#include <drizzled/gettext.h>
31
#include <drizzled/data_home.h>
32
#include <drizzled/probes.h>
33
#include <drizzled/sql_parse.h>
34
#include <drizzled/cost_vect.h>
36
#include <drizzled/session.h>
37
#include <drizzled/sql_base.h>
38
#include <drizzled/replicator.h>
39
#include <drizzled/lock.h>
40
#include <drizzled/item/int.h>
41
#include <drizzled/item/empty_string.h>
42
#include <drizzled/unireg.h> // for mysql_frm_type
43
#include <drizzled/serialize/table.pb.h>
45
#if defined(CMATH_NAMESPACE)
46
using namespace CMATH_NAMESPACE;
50
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NULL,0}, {NULL,0} };
52
/* number of entries in handlertons[] */
54
/* number of storage engines (from handlertons[]) that support 2pc */
55
uint32_t total_ha_2pc= 0;
56
/* size of savepoint storage area (see ha_init) */
57
uint32_t savepoint_alloc_size= 0;
59
const char *ha_row_type[] = {
60
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
63
const char *tx_isolation_names[] =
64
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
67
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
68
tx_isolation_names, NULL};
70
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
71
uint32_t known_extensions_id= 0;
75
Register handler error messages for use with my_error().
83
int ha_init_errors(void)
85
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
88
/* Allocate a pointer array for the error message strings. */
89
/* Zerofill it to avoid uninitialized gaps. */
90
if (! (errmsgs= (const char**) malloc(HA_ERR_ERRORS * sizeof(char*))))
92
memset(errmsgs, 0, HA_ERR_ERRORS * sizeof(char *));
94
/* Set the dedicated error messages. */
95
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
96
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
97
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
98
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
99
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
100
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
101
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
102
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
103
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
104
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
105
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
106
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
107
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
108
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
109
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
110
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
111
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
112
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
113
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
114
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
115
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
116
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
117
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
118
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
119
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
120
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
121
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
122
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
123
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
124
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
125
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
126
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
127
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
128
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
129
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
130
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
131
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
132
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
133
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
134
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
135
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
137
/* Register the error messages for use with my_error(). */
138
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
143
Unregister handler error messages.
150
static int ha_finish_errors(void)
152
const char **errmsgs;
154
/* Unregister the error messages and get back the pointer array so it can be freed. */
155
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
157
free((unsigned char*) errmsgs);
165
assert(total_ha < MAX_HA);
167
Check if there is a transaction-capable storage engine besides the
168
binary log (which is considered a transaction-capable storage engine in
171
opt_using_transactions= total_ha>(uint32_t)opt_bin_log;
172
savepoint_alloc_size+= sizeof(SAVEPOINT);
181
This should eventually be based on the graceful shutdown flag.
182
So if flag is equal to HA_PANIC_CLOSE, the deallocate
185
if (ha_finish_errors())
191
static bool dropdb_handlerton(Session *,
195
handlerton *hton= plugin_data(plugin, handlerton *);
196
if (hton->state == SHOW_OPTION_YES && hton->drop_database)
197
hton->drop_database(hton, (char *)path);
202
void ha_drop_database(char* path)
204
plugin_foreach(NULL, dropdb_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, path);
208
static bool closecon_handlerton(Session *session, plugin_ref plugin,
211
handlerton *hton= plugin_data(plugin, handlerton *);
213
there's no need to rollback here as all transactions must
214
be rolled back already
216
if (hton->state == SHOW_OPTION_YES && hton->close_connection &&
217
session_get_ha_data(session, hton))
218
hton->close_connection(hton, session);
225
don't bother to rollback here, it's done already
227
void ha_close_connection(Session* session)
229
plugin_foreach(session, closecon_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, 0);
232
/* ========================================================================
233
======================= TRANSACTIONS ===================================*/
236
Transaction handling in the server
237
==================================
239
In each client connection, MySQL maintains two transactional
241
- a statement transaction,
242
- a standard, also called normal transaction.
246
"Statement transaction" is a non-standard term that comes
247
from the times when MySQL supported BerkeleyDB storage engine.
249
First of all, it should be said that in BerkeleyDB auto-commit
250
mode auto-commits operations that are atomic to the storage
251
engine itself, such as a write of a record, and are too
252
high-granular to be atomic from the application perspective
253
(MySQL). One SQL statement could involve many BerkeleyDB
254
auto-committed operations and thus BerkeleyDB auto-commit was of
257
Secondly, instead of SQL standard savepoints, BerkeleyDB
258
provided the concept of "nested transactions". In a nutshell,
259
transactions could be arbitrarily nested, but when the parent
260
transaction was committed or aborted, all its child (nested)
261
transactions were committed or aborted as well.
262
Commit of a nested transaction, in turn, made its changes
263
visible, but not durable: it destroyed the nested transaction,
264
all its changes would become available to the parent and
265
currently active nested transactions of this parent.
267
So the mechanism of nested transactions was employed to
268
provide "all or nothing" guarantee of SQL statements
269
required by the standard.
270
A nested transaction would be created at start of each SQL
271
statement, and destroyed (committed or aborted) at statement
272
end. Such nested transaction was internally referred to as
273
a "statement transaction" and gave birth to the term.
275
<Historical note ends>
277
Since then a statement transaction is started for each statement
278
that accesses transactional tables or uses the binary log. If
279
the statement succeeds, the statement transaction is committed.
280
If the statement fails, the transaction is rolled back. Commits
281
of statement transactions are not durable -- each such
282
transaction is nested in the normal transaction, and if the
283
normal transaction is rolled back, the effects of all enclosed
284
statement transactions are undone as well. Technically,
285
a statement transaction can be viewed as a savepoint which is
286
maintained automatically in order to make effects of one
289
The normal transaction is started by the user and is ended
290
usually upon a user request as well. The normal transaction
291
encloses transactions of all statements issued between
292
its beginning and its end.
293
In autocommit mode, the normal transaction is equivalent
294
to the statement transaction.
296
Since MySQL supports PSEA (pluggable storage engine
297
architecture), more than one transactional engine can be
298
active at a time. Hence transactions, from the server
299
point of view, are always distributed. In particular,
300
transactional state is maintained independently for each
301
engine. In order to commit a transaction the two phase
302
commit protocol is employed.
304
Not all statements are executed in context of a transaction.
305
Administrative and status information statements do not modify
306
engine data, and thus do not start a statement transaction and
307
also have no effect on the normal transaction. Examples of such
308
statements are SHOW STATUS and RESET SLAVE.
310
Similarly DDL statements are not transactional,
311
and therefore a transaction is [almost] never started for a DDL
312
statement. The difference between a DDL statement and a purely
313
administrative statement though is that a DDL statement always
314
commits the current transaction before proceeding, if there is
317
At last, SQL statements that work with non-transactional
318
engines also have no effect on the transaction state of the
319
connection. Even though they are written to the binary log,
320
and the binary log is, overall, transactional, the writes
321
are done in "write-through" mode, directly to the binlog
322
file, followed by an OS cache sync, in other words,
323
bypassing the binlog undo log (translog).
324
They do not commit the current normal transaction.
325
A failure of a statement that uses non-transactional tables
326
would cause a rollback of the statement transaction, but
327
in case no transactional tables are used,
328
no statement transaction is started.
333
The server stores its transaction-related data in
334
session->transaction. This structure has two members of type
335
Session_TRANS. These members correspond to the statement and
336
normal transactions respectively:
338
- session->transaction.stmt contains a list of engines
339
that are participating in the given statement
340
- session->transaction.all contains a list of engines that
341
have participated in any of the statement transactions started
342
within the context of the normal transaction.
343
Each element of the list contains a pointer to the storage
344
engine, engine-specific transactional data, and engine-specific
347
In autocommit mode session->transaction.all is empty.
348
Instead, data of session->transaction.stmt is
349
used to commit/rollback the normal transaction.
351
The list of registered engines has a few important properties:
352
- no engine is registered in the list twice
353
- engines are present in the list in reverse temporal order --
354
new participants are always added to the beginning of the list.
356
Transaction life cycle
357
----------------------
359
When a new connection is established, session->transaction
360
members are initialized to an empty state.
361
If a statement uses any tables, all affected engines
362
are registered in the statement engine list. In
363
non-autocommit mode, the same engines are registered in
364
the normal transaction list.
365
At the end of the statement, the server issues a commit
366
or a roll back for all engines in the statement list.
367
At this point transaction flags of an engine, if any, are
368
propagated from the statement list to the list of the normal
370
When commit/rollback is finished, the statement list is
371
cleared. It will be filled in again by the next statement,
372
and emptied again at the next statement's end.
374
The normal transaction is committed in a similar way
375
(by going over all engines in session->transaction.all list)
376
but at different times:
377
- when a COMMIT SQL statement is issued by the user
378
- implicitly, by the server, at the beginning of a DDL statement
379
or SET AUTOCOMMIT={0|1} statement.
381
The normal transaction can be rolled back as well:
382
- if the user has requested so, by issuing ROLLBACK SQL
384
- if one of the storage engines requested a rollback
385
by setting session->transaction_rollback_request. This may
386
happen in case, e.g., when the transaction in the engine was
387
chosen as a victim of the internal deadlock resolution algorithm
388
and rolled back internally. When such a situation happens, there
389
is little the server can do and the only option is to rollback
390
transactions in all other participating engines. In this case
391
the rollback is accompanied by an error sent to the user.
393
As follows from the use cases above, the normal transaction
394
is never committed when there is an outstanding statement
395
transaction. In most cases there is no conflict, since
396
commits of the normal transaction are issued by a stand-alone
397
administrative or DDL statement, thus no outstanding statement
398
transaction of the previous statement exists. Besides,
399
all statements that manipulate the normal transaction
400
are prohibited in stored functions and triggers, therefore
401
no conflicting situation can occur in a sub-statement either.
402
The remaining rare cases when the server explicitly has
403
to commit the statement transaction prior to committing the normal
404
one cover error-handling scenarios (see for example
407
When committing a statement or a normal transaction, the server
408
either uses the two-phase commit protocol, or issues a commit
409
in each engine independently. The two-phase commit protocol
411
- all participating engines support two-phase commit (provide
412
handlerton::prepare PSEA API call) and
413
- transactions in at least two engines modify data (i.e. are
416
Note that the two phase commit is used for
417
statement transactions, even though they are not durable anyway.
418
This is done to ensure logical consistency of data in a multiple-
420
For example, imagine that some day MySQL supports unique
421
constraint checks deferred till the end of statement. In such
422
case a commit in one of the engines may yield ER_DUP_KEY,
423
and MySQL should be able to gracefully abort statement
424
transactions of other participants.
426
After the normal transaction has been committed,
427
session->transaction.all list is cleared.
429
When a connection is closed, the current normal transaction, if
432
Roles and responsibilities
433
--------------------------
435
The server has no way to know that an engine participates in
436
the statement and a transaction has been started
437
in it unless the engine says so. Thus, in order to be
438
a part of a transaction, the engine must "register" itself.
439
This is done by invoking trans_register_ha() server call.
440
Normally the engine registers itself whenever handler::external_lock()
441
is called. trans_register_ha() can be invoked many times: if
442
an engine is already registered, the call does nothing.
443
In case autocommit is not set, the engine must register itself
444
twice -- both in the statement list and in the normal transaction
446
In which list to register is a parameter of trans_register_ha().
448
Note, that although the registration interface in itself is
449
fairly clear, the current usage practice often leads to undesired
450
effects. E.g. since a call to trans_register_ha() in most engines
451
is embedded into implementation of handler::external_lock(), some
452
DDL statements start a transaction (at least from the server
453
point of view) even though they are not expected to. E.g.
454
CREATE TABLE does not start a transaction, since
455
handler::external_lock() is never called during CREATE TABLE. But
456
CREATE TABLE ... SELECT does, since handler::external_lock() is
457
called for the table that is being selected from. This has no
458
practical effects currently, but must be kept in mind
461
Once an engine is registered, the server will do the rest
464
During statement execution, whenever any of data-modifying
465
PSEA API methods is used, e.g. handler::write_row() or
466
handler::update_row(), the read-write flag is raised in the
467
statement transaction for the involved engine.
468
Currently all PSEA calls are "traced", and the data can not be
469
changed in a way other than issuing a PSEA call. Important:
470
unless this invariant is preserved the server will not know that
471
a transaction in a given engine is read-write and will not
472
involve the two-phase commit protocol!
474
At the end of a statement, server call
475
ha_autocommit_or_rollback() is invoked. This call in turn
476
invokes handlerton::prepare() for every involved engine.
477
Prepare is followed by a call to handlerton::commit_one_phase()
478
If a one-phase commit will suffice, handlerton::prepare() is not
479
invoked and the server only calls handlerton::commit_one_phase().
480
At statement commit, the statement-related read-write engine
481
flag is propagated to the corresponding flag in the normal
482
transaction. When the commit is complete, the list of registered
485
Rollback is handled in a similar fashion.
487
Additional notes on DDL and the normal transaction.
488
---------------------------------------------------
490
DDLs and operations with non-transactional engines
491
do not "register" in session->transaction lists, and thus do not
492
modify the transaction state. Besides, each DDL in
493
MySQL is prefixed with an implicit normal transaction commit
494
(a call to end_active_trans()), and thus leaves nothing
496
However, as it has been pointed out with CREATE TABLE .. SELECT,
497
some DDL statements can start a *new* transaction.
499
Behaviour of the server in this case is currently badly
501
DDL statements use a form of "semantic" logging
502
to maintain atomicity: if CREATE TABLE .. SELECT failed,
503
the newly created table is deleted.
504
In addition, some DDL statements issue interim transaction
505
commits: e.g. ALTER TABLE issues a commit after data is copied
506
from the original table to the internal temporary table. Other
507
statements, e.g. CREATE TABLE ... SELECT do not always commit
509
And finally there is a group of DDL statements such as
510
RENAME/DROP TABLE that do not start a new transaction
513
This diversity makes it hard to say what will happen if
514
by chance a stored function is invoked during a DDL --
515
whether any modifications it makes will be committed or not
516
is not clear. Fortunately, the SQL grammar of only a few DDLs allows
517
invocation of a stored function.
519
A consistent behaviour is perhaps to always commit the normal
520
transaction after all DDLs, just like the statement transaction
521
is always committed at the end of all statements.
525
Register a storage engine for a transaction.
527
Every storage engine MUST call this function when it starts
528
a transaction or a statement (that is it must be called both for the
529
"beginning of transaction" and "beginning of statement").
530
Only storage engines registered for the transaction/statement
531
will know when to commit/rollback it.
534
trans_register_ha is idempotent - storage engine may register many
535
times per transaction.
538
void trans_register_ha(Session *session, bool all, handlerton *ht_arg)
540
Session_TRANS *trans;
541
Ha_trx_info *ha_info;
545
trans= &session->transaction.all;
546
session->server_status|= SERVER_STATUS_IN_TRANS;
549
trans= &session->transaction.stmt;
551
ha_info= session->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
553
if (ha_info->is_started())
554
return; /* already registered, return */
556
ha_info->register_ha(trans, ht_arg);
558
trans->no_2pc|=(ht_arg->prepare==0);
559
if (session->transaction.xid_state.xid.is_null())
560
session->transaction.xid_state.xid.set(session->query_id);
569
1 error, transaction was rolled back
571
int ha_prepare(Session *session)
574
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
575
Ha_trx_info *ha_info= trans->ha_list;
578
for (; ha_info; ha_info= ha_info->next())
581
handlerton *ht= ha_info->ht();
582
status_var_increment(session->status_var.ha_prepare_count);
585
if ((err= ht->prepare(ht, session, all)))
587
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
588
ha_rollback_trans(session, all);
595
push_warning_printf(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
596
ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
597
ha_resolve_storage_engine_name(ht));
605
Check if we can skip the two-phase commit.
607
A helper function to evaluate if two-phase commit is mandatory.
608
As a side effect, propagates the read-only/read-write flags
609
of the statement transaction to its enclosing normal transaction.
611
@retval true we must run a two-phase commit. Returned
612
if we have at least two engines with read-write changes.
613
@retval false Don't need two-phase commit. Even if we have two
614
transactional engines, we can run two independent
615
commits if changes in one of the engines are read-only.
620
ha_check_and_coalesce_trx_read_only(Session *session, Ha_trx_info *ha_list,
623
/* The number of storage engines that have actual changes. */
624
unsigned rw_ha_count= 0;
625
Ha_trx_info *ha_info;
627
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
629
if (ha_info->is_trx_read_write())
634
Ha_trx_info *ha_info_all= &session->ha_data[ha_info->ht()->slot].ha_info[1];
635
assert(ha_info != ha_info_all);
637
Merge read-only/read-write information about statement
638
transaction to its enclosing normal transaction. Do this
639
only if in a real transaction -- that is, if we know
640
that ha_info_all is registered in session->transaction.all.
641
Since otherwise we only clutter the normal transaction flags.
643
if (ha_info_all->is_started()) /* false if autocommit. */
644
ha_info_all->coalesce_trx_with(ha_info);
646
else if (rw_ha_count > 1)
649
It is a normal transaction, so we don't need to merge read/write
650
information up, and the need for two-phase commit has been
651
already established. Break the loop prematurely.
656
return rw_ha_count > 1;
664
1 transaction was rolled back
666
2 error during commit, data may be inconsistent
669
Since we don't support nested statement transactions in 5.0,
670
we can't commit or rollback stmt transactions while we are inside
671
stored functions or triggers. So we simply do nothing now.
672
TODO: This should be fixed in later ( >= 5.1) releases.
674
int ha_commit_trans(Session *session, bool all)
676
int error= 0, cookie= 0;
678
'all' means that this is either an explicit commit issued by
679
user, or an implicit commit issued by a DDL.
681
Session_TRANS *trans= all ? &session->transaction.all : &session->transaction.stmt;
682
bool is_real_trans= all || session->transaction.all.ha_list == 0;
683
Ha_trx_info *ha_info= trans->ha_list;
686
We must not commit the normal transaction if a statement
687
transaction is pending. Otherwise statement transaction
688
flags will not get propagated to its normal transaction's
691
assert(session->transaction.stmt.ha_list == NULL ||
692
trans == &session->transaction.stmt);
698
if (is_real_trans && wait_if_global_read_lock(session, 0, 0))
700
ha_rollback_trans(session, all);
704
must_2pc= ha_check_and_coalesce_trx_read_only(session, ha_info, all);
706
if (!trans->no_2pc && must_2pc)
708
for (; ha_info && !error; ha_info= ha_info->next())
711
handlerton *ht= ha_info->ht();
713
Do not call two-phase commit if this particular
714
transaction is read-only. This allows for simpler
715
implementation in engines that are always read-only.
717
if (! ha_info->is_trx_read_write())
720
Sic: we know that prepare() is not NULL since otherwise
721
trans->no_2pc would have been set.
723
if ((err= ht->prepare(ht, session, all)))
725
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
728
status_var_increment(session->status_var.ha_prepare_count);
732
ha_rollback_trans(session, all);
737
error=ha_commit_one_phase(session, all) ? (cookie ? 2 : 1) : 0;
740
start_waiting_global_read_lock(session);
747
This function does not care about global read lock. A caller should.
749
int ha_commit_one_phase(Session *session, bool all)
752
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
753
bool is_real_trans=all || session->transaction.all.ha_list == 0;
754
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
757
for (; ha_info; ha_info= ha_info_next)
760
handlerton *ht= ha_info->ht();
761
if ((err= ht->commit(ht, session, all)))
763
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
766
status_var_increment(session->status_var.ha_commit_count);
767
ha_info_next= ha_info->next();
768
ha_info->reset(); /* keep it conveniently zero-filled */
773
session->transaction.xid_state.xid.null();
776
session->variables.tx_isolation=session->session_tx_isolation;
777
session->transaction.cleanup();
784
int ha_rollback_trans(Session *session, bool all)
787
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt;
788
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
789
bool is_real_trans=all || session->transaction.all.ha_list == 0;
792
We must not rollback the normal transaction if a statement
793
transaction is pending.
795
assert(session->transaction.stmt.ha_list == NULL ||
796
trans == &session->transaction.stmt);
800
for (; ha_info; ha_info= ha_info_next)
803
handlerton *ht= ha_info->ht();
804
if ((err= ht->rollback(ht, session, all)))
806
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
809
status_var_increment(session->status_var.ha_rollback_count);
810
ha_info_next= ha_info->next();
811
ha_info->reset(); /* keep it conveniently zero-filled */
816
session->transaction.xid_state.xid.null();
819
session->variables.tx_isolation=session->session_tx_isolation;
820
session->transaction.cleanup();
824
session->transaction_rollback_request= false;
827
If a non-transactional table was updated, warn; don't warn if this is a
828
slave thread (because when a slave thread executes a ROLLBACK, it has
829
been read from the binary log, so it's 100% sure and normal to produce
830
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
831
slave SQL thread, it would not stop the thread but just be printed in
832
the error log; but we don't want users to wonder why they have this
833
message in the error log, so we don't send it).
835
if (is_real_trans && session->transaction.all.modified_non_trans_table && session->killed != Session::KILL_CONNECTION)
836
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
837
ER_WARNING_NOT_COMPLETE_ROLLBACK,
838
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
843
This is used to commit or rollback a single statement depending on
847
Note that if the autocommit is on, then the following call inside
848
InnoDB will commit or rollback the whole transaction (= the statement). The
849
autocommit mechanism built into InnoDB is based on counting locks, but if
850
the user has used LOCK TABLES then that mechanism does not know to do the
853
int ha_autocommit_or_rollback(Session *session, int error)
855
if (session->transaction.stmt.ha_list)
859
if (ha_commit_trans(session, 0))
864
(void) ha_rollback_trans(session, 0);
865
if (session->transaction_rollback_request)
866
(void) ha_rollback(session);
869
session->variables.tx_isolation=session->session_tx_isolation;
880
static bool xacommit_handlerton(Session *,
884
handlerton *hton= plugin_data(plugin, handlerton *);
885
if (hton->state == SHOW_OPTION_YES && hton->recover)
887
hton->commit_by_xid(hton, ((struct xahton_st *)arg)->xid);
888
((struct xahton_st *)arg)->result= 0;
893
static bool xarollback_handlerton(Session *,
897
handlerton *hton= plugin_data(plugin, handlerton *);
898
if (hton->state == SHOW_OPTION_YES && hton->recover)
900
hton->rollback_by_xid(hton, ((struct xahton_st *)arg)->xid);
901
((struct xahton_st *)arg)->result= 0;
907
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
909
struct xahton_st xaop;
913
plugin_foreach(NULL, commit ? xacommit_handlerton : xarollback_handlerton,
914
DRIZZLE_STORAGE_ENGINE_PLUGIN, &xaop);
920
recover() step of xa.
923
there are three modes of operation:
924
- automatic recover after a crash
925
in this case commit_list != 0, tc_heuristic_recover==0
926
all xids from commit_list are committed, others are rolled back
927
- manual (heuristic) recover
928
in this case commit_list==0, tc_heuristic_recover != 0
929
DBA has explicitly specified that all prepared transactions should
930
be committed (or rolled back).
931
- no recovery (MySQL did not detect a crash)
932
in this case commit_list==0, tc_heuristic_recover == 0
933
there should be no prepared transactions in this case.
937
int len, found_foreign_xids, found_my_xids;
943
static bool xarecover_handlerton(Session *,
947
handlerton *hton= plugin_data(plugin, handlerton *);
948
struct xarecover_st *info= (struct xarecover_st *) arg;
951
if (hton->state == SHOW_OPTION_YES && hton->recover)
953
while ((got= hton->recover(hton, info->list, info->len)) > 0 )
955
errmsg_printf(ERRMSG_LVL_INFO, _("Found %d prepared transaction(s) in %s"),
956
got, ha_resolve_storage_engine_name(hton));
957
for (int i=0; i < got; i ++)
959
my_xid x=info->list[i].get_my_xid();
960
if (!x) // not "mine" - that is generated by external TM
962
xid_cache_insert(info->list+i, XA_PREPARED);
963
info->found_foreign_xids++;
968
info->found_my_xids++;
972
if (info->commit_list ?
973
hash_search(info->commit_list, (unsigned char *)&x, sizeof(x)) != 0 :
974
tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT)
976
hton->commit_by_xid(hton, info->list+i);
980
hton->rollback_by_xid(hton, info->list+i);
990
int ha_recover(HASH *commit_list)
992
struct xarecover_st info;
993
info.found_foreign_xids= info.found_my_xids= 0;
994
info.commit_list= commit_list;
995
info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
998
/* commit_list and tc_heuristic_recover cannot be set both */
999
assert(info.commit_list==0 || tc_heuristic_recover==0);
1000
/* if either is set, total_ha_2pc must be set too */
1001
assert(info.dry_run || total_ha_2pc>(uint32_t)opt_bin_log);
1003
if (total_ha_2pc <= (uint32_t)opt_bin_log)
1006
if (info.commit_list)
1007
errmsg_printf(ERRMSG_LVL_INFO, _("Starting crash recovery..."));
1010
#ifndef WILL_BE_DELETED_LATER
1013
for now, only InnoDB supports 2pc. It means we can always safely
1014
rollback all pending transactions, without risking inconsistent data
1017
assert(total_ha_2pc == (uint32_t) opt_bin_log+1); // only InnoDB and binlog
1018
tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
1023
for (info.len= MAX_XID_LIST_SIZE ;
1024
info.list==0 && info.len > MIN_XID_LIST_SIZE; info.len/=2)
1026
info.list=(XID *)malloc(info.len*sizeof(XID));
1030
errmsg_printf(ERRMSG_LVL_ERROR, ER(ER_OUTOFMEMORY), info.len*sizeof(XID));
1034
plugin_foreach(NULL, xarecover_handlerton,
1035
DRIZZLE_STORAGE_ENGINE_PLUGIN, &info);
1037
free((unsigned char*)info.list);
1038
if (info.found_foreign_xids)
1039
errmsg_printf(ERRMSG_LVL_WARN, _("Found %d prepared XA transactions"),
1040
info.found_foreign_xids);
1041
if (info.dry_run && info.found_my_xids)
1043
errmsg_printf(ERRMSG_LVL_ERROR,
1044
_("Found %d prepared transactions! It means that drizzled "
1045
"was not shut down properly last time and critical "
1046
"recovery information (last binlog or %s file) was "
1047
"manually deleted after a crash. You have to start "
1048
"drizzled with the --tc-heuristic-recover switch to "
1049
"commit or rollback pending transactions."),
1050
info.found_my_xids, opt_tc_log_file);
1053
if (info.commit_list)
1054
errmsg_printf(ERRMSG_LVL_INFO, _("Crash recovery finished."));
1059
return the list of XID's to a client, the same way SHOW commands do.
1062
I didn't find in XA specs that an RM cannot return the same XID twice,
1063
so mysql_xa_recover does not filter XID's to ensure uniqueness.
1064
It can be easily fixed later, if necessary.
1066
bool mysql_xa_recover(Session *session)
1068
List<Item> field_list;
1069
Protocol *protocol= session->protocol;
1073
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1074
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1075
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1076
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1078
if (protocol->send_fields(&field_list,
1079
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1082
pthread_mutex_lock(&LOCK_xid_cache);
1083
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1085
if (xs->xa_state==XA_PREPARED)
1087
protocol->prepare_for_resend();
1088
protocol->store_int64_t((int64_t)xs->xid.formatID, false);
1089
protocol->store_int64_t((int64_t)xs->xid.gtrid_length, false);
1090
protocol->store_int64_t((int64_t)xs->xid.bqual_length, false);
1091
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1093
if (protocol->write())
1095
pthread_mutex_unlock(&LOCK_xid_cache);
1101
pthread_mutex_unlock(&LOCK_xid_cache);
1108
This function should be called when MySQL sends rows of a SELECT result set
1109
or the EOF mark to the client. It releases a possible adaptive hash index
1110
S-latch held by session in InnoDB and also releases a possible InnoDB query
1111
FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a session to
1112
keep them over several calls of the InnoDB handler interface when a join
1113
is executed. But when we let control pass to the client they have
1114
to be released because if the application program uses mysql_use_result(),
1115
it may deadlock on the S-latch if the application on another connection
1116
performs another SQL query. In MySQL-4.1 this is even more important because
1117
there a connection can have several SELECT queries open at the same time.
1119
@param session the thread handle of the current connection
1124
static bool release_temporary_latches(Session *session, plugin_ref plugin,
1127
handlerton *hton= plugin_data(plugin, handlerton *);
1129
if (hton->state == SHOW_OPTION_YES && hton->release_temporary_latches)
1130
hton->release_temporary_latches(hton, session);
1136
int ha_release_temporary_latches(Session *session)
1138
plugin_foreach(session, release_temporary_latches, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1144
int ha_rollback_to_savepoint(Session *session, SAVEPOINT *sv)
1147
Session_TRANS *trans= &session->transaction.all;
1148
Ha_trx_info *ha_info, *ha_info_next;
1152
rolling back to savepoint in all storage engines that were part of the
1153
transaction when the savepoint was set
1155
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
1158
handlerton *ht= ha_info->ht();
1160
assert(ht->savepoint_set != 0);
1161
if ((err= ht->savepoint_rollback(ht, session,
1162
(unsigned char *)(sv+1)+ht->savepoint_offset)))
1164
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1167
status_var_increment(session->status_var.ha_savepoint_rollback_count);
1168
trans->no_2pc|= ht->prepare == 0;
1171
rolling back the transaction in all storage engines that were not part of
1172
the transaction when the savepoint was set
1174
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
1175
ha_info= ha_info_next)
1178
handlerton *ht= ha_info->ht();
1179
if ((err= ht->rollback(ht, session, !(0))))
1181
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1184
status_var_increment(session->status_var.ha_rollback_count);
1185
ha_info_next= ha_info->next();
1186
ha_info->reset(); /* keep it conveniently zero-filled */
1188
trans->ha_list= sv->ha_list;
1194
according to the sql standard (ISO/IEC 9075-2:2003)
1195
section "4.33.4 SQL-statements and transaction states",
1196
SAVEPOINT is *not* transaction-initiating SQL-statement
1198
int ha_savepoint(Session *session, SAVEPOINT *sv)
1201
Session_TRANS *trans= &session->transaction.all;
1202
Ha_trx_info *ha_info= trans->ha_list;
1203
for (; ha_info; ha_info= ha_info->next())
1206
handlerton *ht= ha_info->ht();
1208
if (! ht->savepoint_set)
1210
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1214
if ((err= ht->savepoint_set(ht, session, (unsigned char *)(sv+1)+ht->savepoint_offset)))
1216
my_error(ER_GET_ERRNO, MYF(0), err);
1219
status_var_increment(session->status_var.ha_savepoint_count);
1222
Remember the list of registered storage engines. All new
1223
engines are prepended to the beginning of the list.
1225
sv->ha_list= trans->ha_list;
1229
int ha_release_savepoint(Session *session, SAVEPOINT *sv)
1232
Ha_trx_info *ha_info= sv->ha_list;
1234
for (; ha_info; ha_info= ha_info->next())
1237
handlerton *ht= ha_info->ht();
1238
/* Savepoint life time is enclosed into transaction life time. */
1240
if (!ht->savepoint_release)
1242
if ((err= ht->savepoint_release(ht, session,
1243
(unsigned char *)(sv+1) + ht->savepoint_offset)))
1245
my_error(ER_GET_ERRNO, MYF(0), err);
1253
static bool snapshot_handlerton(Session *session, plugin_ref plugin, void *arg)
1255
handlerton *hton= plugin_data(plugin, handlerton *);
1256
if (hton->state == SHOW_OPTION_YES &&
1257
hton->start_consistent_snapshot)
1259
hton->start_consistent_snapshot(hton, session);
1260
*((bool *)arg)= false;
1265
int ha_start_consistent_snapshot(Session *session)
1269
plugin_foreach(session, snapshot_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, &warn);
1272
Same idea as when one wants to CREATE TABLE in one engine which does not
1276
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
1277
"This Drizzle server does not support any "
1278
"consistent-read capable storage engine");
1283
static bool flush_handlerton(Session *,
1287
handlerton *hton= plugin_data(plugin, handlerton *);
1288
if (hton->state == SHOW_OPTION_YES && hton->flush_logs &&
1289
hton->flush_logs(hton))
1295
bool ha_flush_logs(handlerton *db_type)
1297
if (db_type == NULL)
1299
if (plugin_foreach(NULL, flush_handlerton,
1300
DRIZZLE_STORAGE_ENGINE_PLUGIN, 0))
1305
if (db_type->state != SHOW_OPTION_YES ||
1306
(db_type->flush_logs && db_type->flush_logs(db_type)))
1312
static const char *check_lowercase_names(handler *file, const char *path,
1315
if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1318
/* Ensure that table handler get path in lower case */
1319
if (tmp_path != path)
1320
strcpy(tmp_path, path);
1323
we only should turn into lowercase database/table part
1324
so start the process after homedirectory
1326
my_casedn_str(files_charset_info, tmp_path + drizzle_data_home_len);
1332
An interceptor to hijack the text of the error message without
1333
setting an error in the thread. We need the text to present it
1334
in the form of a warning to the user.
1337
struct Ha_delete_table_error_handler: public Internal_error_handler
1340
virtual bool handle_error(uint32_t sql_errno,
1341
const char *message,
1342
DRIZZLE_ERROR::enum_warning_level level,
1344
char buff[DRIZZLE_ERRMSG_SIZE];
1349
Ha_delete_table_error_handler::
1350
handle_error(uint32_t ,
1351
const char *message,
1352
DRIZZLE_ERROR::enum_warning_level ,
1355
/* Grab the error message */
1356
strncpy(buff, message, sizeof(buff)-1);
1361
struct handlerton_delete_table_args {
1368
static bool deletetable_handlerton(Session *,
1372
struct handlerton_delete_table_args *dtargs= (struct handlerton_delete_table_args *) args;
1374
Session *session= dtargs->session;
1375
const char *path= dtargs->path;
1378
char tmp_path[FN_REFLEN];
1380
if(dtargs->error!=ENOENT) /* already deleted table */
1383
handlerton *table_type= plugin_data(plugin, handlerton *);
1388
if(!(table_type->state == SHOW_OPTION_YES && table_type->create))
1391
if ((file= table_type->create(table_type, NULL, session->mem_root)))
1396
path= check_lowercase_names(file, path, tmp_path);
1397
int error= file->ha_delete_table(path);
1401
dtargs->error= error;
1403
delete dtargs->file;
1412
This should return ENOENT if the file doesn't exist.
1413
The .frm file will be deleted only if we return 0 or ENOENT
1415
int ha_delete_table(Session *session, const char *path,
1416
const char *db, const char *alias, bool generate_warning)
1418
TABLE_SHARE dummy_share;
1421
struct handlerton_delete_table_args dtargs;
1422
dtargs.error= ENOENT;
1423
dtargs.session= session;
1427
plugin_foreach(NULL, deletetable_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1430
memset(&dummy_table, 0, sizeof(dummy_table));
1431
memset(&dummy_share, 0, sizeof(dummy_share));
1432
dummy_table.s= &dummy_share;
1434
if (dtargs.error && generate_warning)
1437
Because file->print_error() use my_error() to generate the error message
1438
we use an internal error handler to intercept it and store the text
1439
in a temporary buffer. Later the message will be presented to user
1442
Ha_delete_table_error_handler ha_delete_table_error_handler;
1444
/* Fill up structures that print_error may need */
1445
dummy_share.path.str= (char*) path;
1446
dummy_share.path.length= strlen(path);
1447
dummy_share.db.str= (char*) db;
1448
dummy_share.db.length= strlen(db);
1449
dummy_share.table_name.str= (char*) alias;
1450
dummy_share.table_name.length= strlen(alias);
1451
dummy_table.alias= alias;
1455
handler *file= dtargs.file;
1456
file->change_table_ptr(&dummy_table, &dummy_share);
1458
session->push_internal_handler(&ha_delete_table_error_handler);
1459
file->print_error(dtargs.error, 0);
1461
session->pop_internal_handler();
1464
dtargs.error= -1; /* General form of fail. maybe bad FRM */
1467
XXX: should we convert *all* errors to warnings here?
1468
What if the error is fatal?
1470
push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_ERROR, dtargs.error,
1471
ha_delete_table_error_handler.buff);
1477
return dtargs.error;
54
1480
/****************************************************************************
55
** General Cursor functions
1481
** General handler functions
56
1482
****************************************************************************/
57
Cursor::Cursor(plugin::StorageEngine &engine_arg,
61
estimation_rows_to_insert(0),
63
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
64
ref_length(sizeof(internal::my_off_t)),
67
next_insert_id(0), insert_id_for_cur_row(0)
72
assert(locked == false);
73
/* TODO: assert(inited == NONE); */
78
* @note this is only used in
79
* optimizer::QuickRangeSelect::init_ror_merged_scan(bool reuse_handler) as
80
* of the writing of this comment. -Brian
82
Cursor *Cursor::clone(memory::Root *mem_root)
84
Cursor *new_handler= getTable()->getMutableShare()->db_type()->getCursor(*getTable());
1483
handler *handler::clone(MEM_ROOT *mem_root)
1485
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
87
Allocate Cursor->ref here because otherwise ha_open will allocate it
1487
Allocate handler->ref here because otherwise ha_open will allocate it
88
1488
on this->table->mem_root and we will not be able to reclaim that memory
89
when the clone Cursor object is destroyed.
1489
when the clone handler object is destroyed.
91
if (!(new_handler->ref= (unsigned char*) mem_root->alloc_root(ALIGN_SIZE(ref_length)*2)))
1491
if (!(new_handler->ref= (unsigned char*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
94
identifier::Table identifier(getTable()->getShare()->getSchemaName(),
95
getTable()->getShare()->getTableName(),
96
getTable()->getShare()->getType());
98
if (new_handler && !new_handler->ha_open(identifier,
99
getTable()->getDBStat(),
1493
if (new_handler && !new_handler->ha_open(table,
1494
table->s->normalized_path.str,
100
1496
HA_OPEN_IGNORE_IF_LOCKED))
101
1497
return new_handler;
107
given a buffer with a key value, and a map of keyparts
108
that are present in this value, returns the length of the value
110
uint32_t Cursor::calculate_key_len(uint32_t key_position, key_part_map keypart_map_arg)
112
/* works only with key prefixes */
113
assert(((keypart_map_arg + 1) & keypart_map_arg) == 0);
115
const KeyPartInfo *key_part_found= getTable()->getShare()->getKeyInfo(key_position).key_part;
116
const KeyPartInfo *end_key_part_found= key_part_found + getTable()->getShare()->getKeyInfo(key_position).key_parts;
119
while (key_part_found < end_key_part_found && keypart_map_arg)
121
length+= key_part_found->store_length;
122
keypart_map_arg >>= 1;
128
int Cursor::startIndexScan(uint32_t idx, bool sorted)
1501
int handler::ha_index_init(uint32_t idx, bool sorted)
131
assert(inited == NONE);
132
if (!(result= doStartIndexScan(idx, sorted)))
1504
assert(inited==NONE);
1505
if (!(result= index_init(idx, sorted)))
134
1507
end_range= NULL;
138
int Cursor::endIndexScan()
1511
int handler::ha_index_end()
140
1513
assert(inited==INDEX);
142
1515
end_range= NULL;
143
return(doEndIndexScan());
1516
return(index_end());
146
int Cursor::startTableScan(bool scan)
1519
int handler::ha_rnd_init(bool scan)
149
1522
assert(inited==NONE || (inited==RND && scan));
150
inited= (result= doStartTableScan(scan)) ? NONE: RND;
1523
inited= (result= rnd_init(scan)) ? NONE: RND;
155
int Cursor::endTableScan()
1527
int handler::ha_rnd_end()
157
1529
assert(inited==RND);
159
return(doEndTableScan());
162
int Cursor::ha_index_or_rnd_end()
164
return inited == INDEX ? endIndexScan() : inited == RND ? endTableScan() : 0;
167
void Cursor::ha_start_bulk_insert(ha_rows rows)
1534
int handler::ha_index_or_rnd_end()
1536
return inited == INDEX ? ha_index_end() : inited == RND ? ha_rnd_end() : 0;
1539
handler::Table_flags handler::ha_table_flags() const
1541
return cached_table_flags;
1544
void handler::ha_start_bulk_insert(ha_rows rows)
169
1546
estimation_rows_to_insert= rows;
170
1547
start_bulk_insert(rows);
173
int Cursor::ha_end_bulk_insert()
1550
int handler::ha_end_bulk_insert()
175
1552
estimation_rows_to_insert= 0;
176
1553
return end_bulk_insert();
179
const key_map *Cursor::keys_to_use_for_scanning()
1556
void handler::change_table_ptr(Table *table_arg, TABLE_SHARE *share)
1562
const key_map *handler::keys_to_use_for_scanning()
181
1564
return &key_map_empty;
184
bool Cursor::has_transactions()
186
return (getTable()->getShare()->db_type()->check_flag(HTON_BIT_DOES_TRANSACTIONS));
189
void Cursor::ha_statistic_increment(uint64_t system_status_var::*offset) const
191
(getTable()->in_use->status_var.*offset)++;
194
void **Cursor::ha_data(Session *session) const
196
return session->getEngineData(getEngine());
199
bool Cursor::is_fatal_error(int error, uint32_t flags)
1567
bool handler::has_transactions()
1569
return (ha_table_flags() & HA_NO_TRANSACTIONS) == 0;
1572
void handler::ha_statistic_increment(ulong SSV::*offset) const
1574
status_var_increment(table->in_use->status_var.*offset);
1577
void **handler::ha_data(Session *session) const
1579
return session_ha_data(session, ht);
1582
Session *handler::ha_session(void) const
1584
assert(!table || !table->in_use || table->in_use == current_session);
1585
return (table && table->in_use) ? table->in_use : current_session;
1589
bool handler::is_fatal_error(int error, uint32_t flags)
202
1592
((flags & HA_CHECK_DUP_KEY) &&
1149
3351
while ((result == HA_ERR_END_OF_FILE) && !range_res);
1151
3353
*range_info= mrr_cur_range.ptr;
3358
/* **************************************************************************
3359
* DS-MRR implementation
3360
***************************************************************************/
3363
DS-MRR: Initialize and start MRR scan
3365
Initialize and start the MRR scan. Depending on the mode parameter, this
3366
may use default or DS-MRR implementation.
3368
@param h Table handler to be used
3369
@param key Index to be used
3370
@param seq_funcs Interval sequence enumeration functions
3371
@param seq_init_param Interval sequence enumeration parameter
3372
@param n_ranges Number of ranges in the sequence.
3373
@param mode HA_MRR_* modes to use
3374
@param buf INOUT Buffer to use
3376
@retval 0 Ok, Scan started.
3380
int DsMrr_impl::dsmrr_init(handler *h_in, KEY *key,
3381
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3382
uint32_t n_ranges, uint32_t mode, HANDLER_BUFFER *buf)
3386
Item *pushed_cond= NULL;
3388
keyno= h_in->active_index;
3390
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3392
use_default_impl= true;
3393
return(h_in->handler::multi_range_read_init(seq_funcs, seq_init_param,
3394
n_ranges, mode, buf));
3396
rowids_buf= buf->buffer;
3397
//psergey-todo: don't add key_length as it is not needed anymore
3398
rowids_buf += key->key_length + h_in->ref_length;
3400
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3401
rowids_buf_end= buf->buffer_end;
3403
elem_size= h_in->ref_length + (int)is_mrr_assoc * sizeof(void*);
3404
rowids_buf_last= rowids_buf +
3405
((rowids_buf_end - rowids_buf)/ elem_size)*
3407
rowids_buf_end= rowids_buf_last;
3409
/* Create a separate handler object to do rndpos() calls. */
3410
Session *session= current_session;
3411
if (!(new_h2= h_in->clone(session->mem_root)) ||
3412
new_h2->ha_external_lock(session, F_RDLCK))
3418
if (keyno == h_in->pushed_idx_cond_keyno)
3419
pushed_cond= h_in->pushed_idx_cond;
3420
if (h_in->ha_index_end())
3427
table->prepare_for_position();
3428
new_h2->extra(HA_EXTRA_KEYREAD);
3430
if (h2->ha_index_init(keyno, false) ||
3431
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3434
use_default_impl= false;
3437
h2->idx_cond_push(keyno, pushed_cond);
3438
if (dsmrr_fill_buffer(new_h2))
3442
If the above call has scanned through all intervals in *seq, then
3443
adjust *buf to indicate that the remaining buffer space will not be used.
3446
buf->end_of_used_area= rowids_buf_last;
3448
if (h_in->ha_rnd_init(false))
3453
h2->ha_index_or_rnd_end();
3454
h2->ha_external_lock(session, F_UNLCK);
3461
void DsMrr_impl::dsmrr_close()
3465
h2->ha_external_lock(current_session, F_UNLCK);
3470
use_default_impl= true;
3475
static int rowid_cmp(void *h, unsigned char *a, unsigned char *b)
3477
return ((handler*)h)->cmp_ref(a, b);
3482
DS-MRR: Fill the buffer with rowids and sort it by rowid
3484
{This is an internal function of DiskSweep MRR implementation}
3485
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
3486
buffer. When the buffer is full or scan is completed, sort the buffer by
3489
The function assumes that rowids buffer is empty when it is invoked.
3491
@param h Table handler
3493
@retval 0 OK, the next portion of rowids is in the buffer,
3498
int DsMrr_impl::dsmrr_fill_buffer(handler *)
3503
rowids_buf_cur= rowids_buf;
3504
while ((rowids_buf_cur < rowids_buf_end) &&
3505
!(res= h2->handler::multi_range_read_next(&range_info)))
3507
/* Put rowid, or {rowid, range_id} pair into the buffer */
3508
h2->position(table->record[0]);
3509
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
3510
rowids_buf_cur += h->ref_length;
3514
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
3515
rowids_buf_cur += sizeof(void*);
3519
if (res && res != HA_ERR_END_OF_FILE)
3521
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
3523
/* Sort the buffer contents by rowid */
3524
uint32_t elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3525
uint32_t n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
3527
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
3529
rowids_buf_last= rowids_buf_cur;
3530
rowids_buf_cur= rowids_buf;
3536
DS-MRR implementation: multi_range_read_next() function
3539
int DsMrr_impl::dsmrr_next(handler *h_in, char **range_info)
3543
if (use_default_impl)
3544
return h_in->handler::multi_range_read_next(range_info);
3546
if (rowids_buf_cur == rowids_buf_last)
3550
res= HA_ERR_END_OF_FILE;
3553
res= dsmrr_fill_buffer(h);
3558
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
3559
if (rowids_buf_cur == rowids_buf_last)
3561
res= HA_ERR_END_OF_FILE;
3565
res= h_in->rnd_pos(table->record[0], rowids_buf_cur);
3566
rowids_buf_cur += h_in->ref_length;
3569
memcpy(range_info, rowids_buf_cur, sizeof(void*));
3570
rowids_buf_cur += sizeof(void*);
3581
DS-MRR implementation: multi_range_read_info() function
3583
int DsMrr_impl::dsmrr_info(uint32_t keyno, uint32_t n_ranges, uint32_t rows, uint32_t *bufsz,
3584
uint32_t *flags, COST_VECT *cost)
3587
uint32_t def_flags= *flags;
3588
uint32_t def_bufsz= *bufsz;
3590
/* Get cost/flags/mem_usage of default MRR implementation */
3591
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
3595
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3596
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
3598
/* Default implementation is chosen */
3607
DS-MRR Implementation: multi_range_read_info_const() function
3610
ha_rows DsMrr_impl::dsmrr_info_const(uint32_t keyno, RANGE_SEQ_IF *seq,
3611
void *seq_init_param, uint32_t n_ranges,
3612
uint32_t *bufsz, uint32_t *flags, COST_VECT *cost)
3615
uint32_t def_flags= *flags;
3616
uint32_t def_bufsz= *bufsz;
3617
/* Get cost/flags/mem_usage of default MRR implementation */
3618
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
3619
n_ranges, &def_bufsz,
3621
if (rows == HA_POS_ERROR)
3623
/* Default implementation can't perform MRR scan => we can't either */
3628
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
3629
use the default MRR implementation (we need it for UPDATE/DELETE).
3630
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
3632
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3633
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
3640
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
3647
Check if key has partially-covered columns
3649
We can't use DS-MRR to perform range scans when the ranges are over
3650
partially-covered keys, because we'll not have full key part values
3651
(we'll have their prefixes from the index) and will not be able to check
3652
if we've reached the end of the range.
3654
@param keyno Key to check
3657
Allow use of DS-MRR in cases where the index has partially-covered
3658
components but they are not used for scanning.
3664
bool DsMrr_impl::key_uses_partial_cols(uint32_t keyno)
3666
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
3667
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
3668
for (; kp != kp_end; kp++)
3670
if (!kp->field->part_of_key.is_set(keyno))
3678
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
3680
Make the choice between using Default MRR implementation and DS-MRR.
3681
This function contains common functionality factored out of dsmrr_info()
3682
and dsmrr_info_const(). The function assumes that the default MRR
3683
implementation's applicability requirements are satisfied.
3685
@param keyno Index number
3686
@param rows E(full rows to be retrieved)
3687
@param flags IN MRR flags provided by the MRR user
3688
OUT If DS-MRR is chosen, flags of DS-MRR implementation
3689
else the value is not modified
3690
@param bufsz IN If DS-MRR is chosen, buffer use of DS-MRR implementation
3691
else the value is not modified
3692
@param cost IN Cost of default MRR implementation
3693
OUT If DS-MRR is chosen, cost of DS-MRR scan
3694
else the value is not modified
3696
@retval true Default MRR implementation should be used
3697
@retval false DS-MRR implementation should be used
3700
bool DsMrr_impl::choose_mrr_impl(uint32_t keyno, ha_rows rows, uint32_t *flags,
3701
uint32_t *bufsz, COST_VECT *cost)
3703
COST_VECT dsmrr_cost;
3705
Session *session= current_session;
3706
if ((session->variables.optimizer_use_mrr == 2) ||
3707
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
3708
(keyno == table->s->primary_key &&
3709
h->primary_key_is_clustered()) ||
3710
key_uses_partial_cols(keyno))
3712
/* Use the default implementation */
3713
*flags |= HA_MRR_USE_DEFAULT_IMPL;
3717
uint32_t add_len= table->key_info[keyno].key_length + h->ref_length;
3719
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
3725
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
3726
DS-MRR and Default implementations cost. This allows one to force use of
3727
DS-MRR whenever it is applicable without affecting other cost-based
3730
if ((force_dsmrr= (session->variables.optimizer_use_mrr == 1)) &&
3731
dsmrr_cost.total_cost() > cost->total_cost())
3734
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
3736
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
3737
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
3743
/* Use the default MRR implementation */
3750
static void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost);
3754
Get cost of DS-MRR scan
3756
@param keynr Index to be used
3757
@param rows E(Number of rows to be scanned)
3758
@param flags Scan parameters (HA_MRR_* flags)
3759
@param buffer_size INOUT Buffer size
3760
@param cost OUT The cost
3763
@retval true Error, DS-MRR cannot be used (the buffer is too small
3767
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint32_t keynr, ha_rows rows, uint32_t flags,
3768
uint32_t *buffer_size, COST_VECT *cost)
3770
uint32_t max_buff_entries, elem_size;
3771
ha_rows rows_in_full_step, rows_in_last_step;
3772
uint32_t n_full_steps;
3773
double index_read_cost;
3775
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
3776
max_buff_entries = *buffer_size / elem_size;
3778
if (!max_buff_entries)
3779
return true; /* Buffer has not enough space for even 1 rowid */
3781
/* Number of iterations we'll make with full buffer */
3782
n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
3785
Get numbers of rows we'll be processing in
3786
- non-last sweep, with full buffer
3787
- last iteration, with non-full buffer
3789
rows_in_full_step= max_buff_entries;
3790
rows_in_last_step= rows % max_buff_entries;
3792
/* Adjust buffer size if we expect to use only part of the buffer */
3795
get_sort_and_sweep_cost(table, rows, cost);
3796
cost->multiply(n_full_steps);
3801
*buffer_size= cmax((ulong)*buffer_size,
3802
(size_t)(1.2*rows_in_last_step) * elem_size +
3803
h->ref_length + table->key_info[keynr].key_length);
3806
COST_VECT last_step_cost;
3807
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
3808
cost->add(&last_step_cost);
3810
if (n_full_steps != 0)
3811
cost->mem_cost= *buffer_size;
3813
cost->mem_cost= (double)rows_in_last_step * elem_size;
3815
/* Total cost of all index accesses */
3816
index_read_cost= h->index_only_read_time(keynr, (double)rows);
3817
cost->add_io(index_read_cost, 1 /* Random seeks */);
3823
Get cost of one sort-and-sweep step
3826
get_sort_and_sweep_cost()
3827
table Table being accessed
3828
nrows Number of rows to be sorted and retrieved
3832
Get cost of these operations:
3833
- sort an array of #nrows ROWIDs using qsort
3834
- read #nrows records from table in a sweep.
3838
void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost)
3842
get_sweep_read_cost(table, nrows, false, cost);
3843
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
3844
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
3847
cost->cpu_cost += cmp_op * log2(cmp_op);
3855
Get cost of reading nrows table records in a "disk sweep"
3857
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made
3858
for an ordered sequence of rowids.
3860
We assume hard disk IO. The read is performed as follows:
3862
1. The disk head is moved to the needed cylinder
3863
2. The controller waits for the plate to rotate
3864
3. The data is transferred
3866
Time to do #3 is insignificant compared to #2+#1.
3868
Time to move the disk head is proportional to head travel distance.
3870
Time to wait for the plate to rotate depends on whether the disk head
3873
If disk head wasn't moved, the wait time is proportional to distance
3874
between the previous block and the block we're reading.
3876
If the head was moved, we don't know how much we'll need to wait for the
3877
plate to rotate. We assume the wait time to be a variate with a mean of
3878
0.5 of full rotation time.
3880
Our cost units are "random disk seeks". The cost of random disk seek is
3881
actually not a constant, it depends on the range of cylinders we're going
3882
to access. We make it constant by introducing a fuzzy concept of "typical
3883
datafile length" (it's fuzzy as it's hard to tell whether it should
3884
include index file, temp.tables etc). Then random seek cost is:
3886
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
3888
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
3890
@param table Table to be accessed
3891
@param nrows Number of rows to retrieve
3892
@param interrupted true <=> Assume that the disk sweep will be
3893
interrupted by other disk IO. false - otherwise.
3894
@param cost OUT The cost.
3897
void get_sweep_read_cost(Table *table, ha_rows nrows, bool interrupted,
3901
if (table->file->primary_key_is_clustered())
3903
cost->io_count= table->file->read_time(table->s->primary_key,
3904
(uint) nrows, nrows);
3909
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
3911
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
3912
if (busy_blocks < 1.0)
3915
cost->io_count= busy_blocks;
3919
/* Assume reading is done in one 'sweep' */
3920
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
3921
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);