1
/* Copyright (C) 2000-2006 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
19
Handler-calling-functions
22
#ifdef USE_PRAGMA_IMPLEMENTATION
23
#pragma implementation // gcc: Class implementation
26
#include "mysql_priv.h"
27
#include "rpl_filter.h"
28
#include <myisampack.h>
32
While we have legacy_db_type, we have this array to
33
check for dups and to find handlerton from legacy_db_type.
34
Remove when legacy_db_type is finally gone
36
st_plugin_int *hton2plugin[MAX_HA];
38
static handlerton *installed_htons[128];
40
#define BITMAP_STACKBUF_SIZE (128/8)
42
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NullS,0}, {NullS,0} };
44
/* number of entries in handlertons[] */
46
/* number of storage engines (from handlertons[]) that support 2pc */
47
ulong total_ha_2pc= 0;
48
/* size of savepoint storage area (see ha_init) */
49
ulong savepoint_alloc_size= 0;
51
static const LEX_STRING sys_table_aliases[]=
53
{ C_STRING_WITH_LEN("INNOBASE") }, { C_STRING_WITH_LEN("INNODB") },
54
{ C_STRING_WITH_LEN("HEAP") }, { C_STRING_WITH_LEN("MEMORY") },
58
const char *ha_row_type[] = {
59
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
62
const char *tx_isolation_names[] =
63
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
65
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
66
tx_isolation_names, NULL};
68
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
69
uint known_extensions_id= 0;
73
static plugin_ref ha_default_plugin(THD *thd)
75
if (thd->variables.table_plugin)
76
return thd->variables.table_plugin;
77
return my_plugin_lock(thd, &global_system_variables.table_plugin);
82
Return the default storage engine handlerton for thread
84
@param ha_default_handlerton(thd)
85
@param thd current thread
90
handlerton *ha_default_handlerton(THD *thd)
92
plugin_ref plugin= ha_default_plugin(thd);
94
handlerton *hton= plugin_data(plugin, handlerton*);
101
Return the storage engine handlerton for the supplied name
103
@param thd current thread
104
@param name name of storage engine
107
pointer to storage engine plugin handle
109
plugin_ref ha_resolve_by_name(THD *thd, const LEX_STRING *name)
111
const LEX_STRING *table_alias;
115
/* my_strnncoll is a macro and gcc doesn't do early expansion of macro */
116
if (thd && !my_charset_latin1.coll->strnncoll(&my_charset_latin1,
117
(const uchar *)name->str, name->length,
118
(const uchar *)STRING_WITH_LEN("DEFAULT"), 0))
119
return ha_default_plugin(thd);
121
if ((plugin= my_plugin_lock_by_name(thd, name, MYSQL_STORAGE_ENGINE_PLUGIN)))
123
handlerton *hton= plugin_data(plugin, handlerton *);
124
if (!(hton->flags & HTON_NOT_USER_SELECTABLE))
128
unlocking plugin immediately after locking is relatively low cost.
130
plugin_unlock(thd, plugin);
134
We check for the historical aliases.
136
for (table_alias= sys_table_aliases; table_alias->str; table_alias+= 2)
138
if (!my_strnncoll(&my_charset_latin1,
139
(const uchar *)name->str, name->length,
140
(const uchar *)table_alias->str, table_alias->length))
142
name= table_alias + 1;
151
plugin_ref ha_lock_engine(THD *thd, handlerton *hton)
155
st_plugin_int **plugin= hton2plugin + hton->slot;
158
return my_plugin_lock(thd, plugin);
160
return my_plugin_lock(thd, &plugin);
167
handlerton *ha_resolve_by_legacy_type(THD *thd, enum legacy_db_type db_type)
171
case DB_TYPE_DEFAULT:
172
return ha_default_handlerton(thd);
174
if (db_type > DB_TYPE_UNKNOWN && db_type < DB_TYPE_DEFAULT &&
175
(plugin= ha_lock_engine(thd, installed_htons[db_type])))
176
return plugin_data(plugin, handlerton*);
178
case DB_TYPE_UNKNOWN:
185
Use another database handler if the requested database handler is not compiled in.
187
handlerton *ha_checktype(THD *thd, enum legacy_db_type database_type,
188
bool no_substitute, bool report_error)
190
handlerton *hton= ha_resolve_by_legacy_type(thd, database_type);
191
if (ha_storage_engine_is_enabled(hton))
198
const char *engine_name= ha_resolve_storage_engine_name(hton);
199
my_error(ER_FEATURE_DISABLED,MYF(0),engine_name,engine_name);
204
switch (database_type) {
206
return ha_resolve_by_legacy_type(thd, DB_TYPE_HASH);
211
return ha_default_handlerton(thd);
215
handler *get_new_handler(TABLE_SHARE *share, MEM_ROOT *alloc,
219
DBUG_ENTER("get_new_handler");
220
DBUG_PRINT("enter", ("alloc: 0x%lx", (long) alloc));
222
if (db_type && db_type->state == SHOW_OPTION_YES && db_type->create)
224
if ((file= db_type->create(db_type, share, alloc)))
229
Try the default table type
230
Here the call to current_thd() is ok as we call this function a lot of
231
times but we enter this branch very seldom.
233
DBUG_RETURN(get_new_handler(share, alloc, ha_default_handlerton(current_thd)));
238
Register handler error messages for use with my_error().
246
int ha_init_errors(void)
248
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
249
const char **errmsgs;
251
/* Allocate a pointer array for the error message strings. */
252
/* Zerofill it to avoid uninitialized gaps. */
253
if (! (errmsgs= (const char**) my_malloc(HA_ERR_ERRORS * sizeof(char*),
254
MYF(MY_WME | MY_ZEROFILL))))
257
/* Set the dedicated error messages. */
258
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
259
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
260
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
261
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
262
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
263
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
264
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
265
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
266
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
267
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
268
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
269
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
270
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
271
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
272
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
273
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
274
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
275
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
276
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
277
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
278
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
279
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
280
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
281
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
282
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
283
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
284
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
285
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
286
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
287
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
288
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
289
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
290
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
291
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
292
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
293
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
294
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
295
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
296
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
297
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
298
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
300
/* Register the error messages for use with my_error(). */
301
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
306
Unregister handler error messages.
313
static int ha_finish_errors(void)
315
const char **errmsgs;
317
/* Unregister the error messages and get back the pointer array so it can be freed. */
318
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
320
my_free((uchar*) errmsgs, MYF(0));
325
int ha_finalize_handlerton(st_plugin_int *plugin)
327
handlerton *hton= (handlerton *)plugin->data;
328
DBUG_ENTER("ha_finalize_handlerton");
333
case SHOW_OPTION_DISABLED:
335
case SHOW_OPTION_YES:
336
if (installed_htons[hton->db_type] == hton)
337
installed_htons[hton->db_type]= NULL;
342
hton->panic(hton, HA_PANIC_CLOSE);
344
if (plugin->plugin->deinit)
347
Today we have no defined/special behavior for uninstalling
350
DBUG_PRINT("info", ("Deinitializing plugin: '%s'", plugin->name.str));
351
if (plugin->plugin->deinit(NULL))
353
DBUG_PRINT("warning", ("Plugin '%s' deinit function returned error.",
358
my_free((uchar*)hton, MYF(0));
364
int ha_initialize_handlerton(st_plugin_int *plugin)
367
DBUG_ENTER("ha_initialize_handlerton");
368
DBUG_PRINT("plugin", ("initialize plugin: '%s'", plugin->name.str));
370
hton= (handlerton *)my_malloc(sizeof(handlerton),
371
MYF(MY_WME | MY_ZEROFILL));
373
FIXME: the MY_ZEROFILL flag above doesn't zero all the bytes.
375
This was detected after adding get_backup_engine member to handlerton
376
structure. Apparently get_backup_engine was not NULL even though it was
379
/*
  Clear the complete handlerton structure. hton is a pointer, so
  sizeof(hton) would only cover pointer-size bytes and leave the rest
  of the structure untouched; sizeof(*hton) covers the whole object.
*/
bzero(hton, sizeof(*hton));
380
/* Historical Requirement */
381
plugin->data= hton; // shortcut for the future
382
if (plugin->plugin->init)
384
if (plugin->plugin->init(hton))
386
sql_print_error("Plugin '%s' init function returned error.",
393
the switch below and hton->state should be removed when
394
command-line options for plugins will be implemented
396
switch (hton->state) {
399
case SHOW_OPTION_YES:
402
/* now check the db_type for conflict */
403
if (hton->db_type <= DB_TYPE_UNKNOWN ||
404
hton->db_type >= DB_TYPE_DEFAULT ||
405
installed_htons[hton->db_type])
407
int idx= (int) DB_TYPE_FIRST_DYNAMIC;
409
while (idx < (int) DB_TYPE_DEFAULT && installed_htons[idx])
412
if (idx == (int) DB_TYPE_DEFAULT)
414
sql_print_warning("Too many storage engines!");
417
if (hton->db_type != DB_TYPE_UNKNOWN)
418
sql_print_warning("Storage engine '%s' has conflicting typecode. "
419
"Assigning value %d.", plugin->plugin->name, idx);
420
hton->db_type= (enum legacy_db_type) idx;
422
installed_htons[hton->db_type]= hton;
423
tmp= hton->savepoint_offset;
424
hton->savepoint_offset= savepoint_alloc_size;
425
savepoint_alloc_size+= tmp;
426
hton->slot= total_ha++;
427
hton2plugin[hton->slot]=plugin;
434
hton->state= SHOW_OPTION_DISABLED;
439
This is entirely for legacy. We will create a new "disk based" hton and a
440
"memory" hton which will be configurable longterm. We should be able to
441
remove partition and myisammrg.
443
switch (hton->db_type) {
462
DBUG_ENTER("ha_init");
464
DBUG_ASSERT(total_ha < MAX_HA);
466
Check if there is a transaction-capable storage engine besides the
467
binary log (which is considered a transaction-capable storage engine in
470
opt_using_transactions= total_ha>(ulong)opt_bin_log;
471
savepoint_alloc_size+= sizeof(SAVEPOINT);
478
DBUG_ENTER("ha_end");
482
This should eventually be based on the graceful shutdown flag.
483
So if flag is equal to HA_PANIC_CLOSE, then deallocate
486
if (ha_finish_errors())
492
static my_bool dropdb_handlerton(THD *unused1, plugin_ref plugin,
495
handlerton *hton= plugin_data(plugin, handlerton *);
496
if (hton->state == SHOW_OPTION_YES && hton->drop_database)
497
hton->drop_database(hton, (char *)path);
502
void ha_drop_database(char* path)
504
plugin_foreach(NULL, dropdb_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, path);
508
static my_bool closecon_handlerton(THD *thd, plugin_ref plugin,
511
handlerton *hton= plugin_data(plugin, handlerton *);
513
there's no need to rollback here as all transactions must
514
be rolled back already
516
if (hton->state == SHOW_OPTION_YES && hton->close_connection &&
517
thd_get_ha_data(thd, hton))
518
hton->close_connection(hton, thd);
525
don't bother to rollback here, it's done already
527
void ha_close_connection(THD* thd)
529
plugin_foreach(thd, closecon_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, 0);
532
/* ========================================================================
533
======================= TRANSACTIONS ===================================*/
536
Transaction handling in the server
537
==================================
539
In each client connection, MySQL maintains two transactional
541
- a statement transaction,
542
- a standard, also called normal transaction.
546
"Statement transaction" is a non-standard term that comes
547
from the times when MySQL supported BerkeleyDB storage engine.
549
First of all, it should be said that in BerkeleyDB auto-commit
550
mode auto-commits operations that are atomic to the storage
551
engine itself, such as a write of a record, and are too
552
high-granular to be atomic from the application perspective
553
(MySQL). One SQL statement could involve many BerkeleyDB
554
auto-committed operations and thus BerkeleyDB auto-commit was of
557
Secondly, instead of SQL standard savepoints, BerkeleyDB
558
provided the concept of "nested transactions". In a nutshell,
559
transactions could be arbitrarily nested, but when the parent
560
transaction was committed or aborted, all its child (nested)
561
transactions were committed or aborted as well.
562
Commit of a nested transaction, in turn, made its changes
563
visible, but not durable: it destroyed the nested transaction,
564
all its changes would become available to the parent and
565
currently active nested transactions of this parent.
567
So the mechanism of nested transactions was employed to
568
provide "all or nothing" guarantee of SQL statements
569
required by the standard.
570
A nested transaction would be created at start of each SQL
571
statement, and destroyed (committed or aborted) at statement
572
end. Such nested transaction was internally referred to as
573
a "statement transaction" and gave birth to the term.
575
<Historical note ends>
577
Since then a statement transaction is started for each statement
578
that accesses transactional tables or uses the binary log. If
579
the statement succeeds, the statement transaction is committed.
580
If the statement fails, the transaction is rolled back. Commits
581
of statement transactions are not durable -- each such
582
transaction is nested in the normal transaction, and if the
583
normal transaction is rolled back, the effects of all enclosed
584
statement transactions are undone as well. Technically,
585
a statement transaction can be viewed as a savepoint which is
586
maintained automatically in order to make effects of one
589
The normal transaction is started by the user and is ended
590
usually upon a user request as well. The normal transaction
591
encloses transactions of all statements issued between
592
its beginning and its end.
593
In autocommit mode, the normal transaction is equivalent
594
to the statement transaction.
596
Since MySQL supports PSEA (pluggable storage engine
597
architecture), more than one transactional engine can be
598
active at a time. Hence transactions, from the server
599
point of view, are always distributed. In particular,
600
transactional state is maintained independently for each
601
engine. In order to commit a transaction the two phase
602
commit protocol is employed.
604
Not all statements are executed in context of a transaction.
605
Administrative and status information statements do not modify
606
engine data, and thus do not start a statement transaction and
607
also have no effect on the normal transaction. Examples of such
608
statements are SHOW STATUS and RESET SLAVE.
610
Similarly DDL statements are not transactional,
611
and therefore a transaction is [almost] never started for a DDL
612
statement. The difference between a DDL statement and a purely
613
administrative statement though is that a DDL statement always
614
commits the current transaction before proceeding, if there is
617
At last, SQL statements that work with non-transactional
618
engines also have no effect on the transaction state of the
619
connection. Even though they are written to the binary log,
620
and the binary log is, overall, transactional, the writes
621
are done in "write-through" mode, directly to the binlog
622
file, followed by an OS cache sync, in other words,
623
bypassing the binlog undo log (translog).
624
They do not commit the current normal transaction.
625
A failure of a statement that uses non-transactional tables
626
would cause a rollback of the statement transaction, but
627
in case no transactional tables are used,
628
no statement transaction is started.
633
The server stores its transaction-related data in
634
thd->transaction. This structure has two members of type
635
THD_TRANS. These members correspond to the statement and
636
normal transactions respectively:
638
- thd->transaction.stmt contains a list of engines
639
that are participating in the given statement
640
- thd->transaction.all contains a list of engines that
641
have participated in any of the statement transactions started
642
within the context of the normal transaction.
643
Each element of the list contains a pointer to the storage
644
engine, engine-specific transactional data, and engine-specific
647
In autocommit mode thd->transaction.all is empty.
648
Instead, data of thd->transaction.stmt is
649
used to commit/rollback the normal transaction.
651
The list of registered engines has a few important properties:
652
- no engine is registered in the list twice
653
- engines are present in the list in reverse temporal order --
654
new participants are always added to the beginning of the list.
656
Transaction life cycle
657
----------------------
659
When a new connection is established, thd->transaction
660
members are initialized to an empty state.
661
If a statement uses any tables, all affected engines
662
are registered in the statement engine list. In
663
non-autocommit mode, the same engines are registered in
664
the normal transaction list.
665
At the end of the statement, the server issues a commit
666
or a roll back for all engines in the statement list.
667
At this point transaction flags of an engine, if any, are
668
propagated from the statement list to the list of the normal
670
When commit/rollback is finished, the statement list is
671
cleared. It will be filled in again by the next statement,
672
and emptied again at the next statement's end.
674
The normal transaction is committed in a similar way
675
(by going over all engines in thd->transaction.all list)
676
but at different times:
677
- when a COMMIT SQL statement is issued by the user
678
- implicitly, by the server, at the beginning of a DDL statement
679
or SET AUTOCOMMIT={0|1} statement.
681
The normal transaction can be rolled back as well:
682
- if the user has requested so, by issuing ROLLBACK SQL
684
- if one of the storage engines requested a rollback
685
by setting thd->transaction_rollback_request. This may
686
happen in case, e.g., when the transaction in the engine was
687
chosen as a victim of the internal deadlock resolution algorithm
688
and rolled back internally. When such a situation happens, there
689
is little the server can do and the only option is to rollback
690
transactions in all other participating engines. In this case
691
the rollback is accompanied by an error sent to the user.
693
As follows from the use cases above, the normal transaction
694
is never committed when there is an outstanding statement
695
transaction. In most cases there is no conflict, since
696
commits of the normal transaction are issued by a stand-alone
697
administrative or DDL statement, thus no outstanding statement
698
transaction of the previous statement exists. Besides,
699
all statements that manipulate the normal transaction
700
are prohibited in stored functions and triggers, therefore
701
no conflicting situation can occur in a sub-statement either.
702
The remaining rare cases when the server explicitly has
703
to commit the statement transaction prior to committing the normal
704
one cover error-handling scenarios (see for example
707
When committing a statement or a normal transaction, the server
708
either uses the two-phase commit protocol, or issues a commit
709
in each engine independently. The two-phase commit protocol
711
- all participating engines support two-phase commit (provide
712
handlerton::prepare PSEA API call) and
713
- transactions in at least two engines modify data (i.e. are
716
Note that the two phase commit is used for
717
statement transactions, even though they are not durable anyway.
718
This is done to ensure logical consistency of data in a multiple-
720
For example, imagine that some day MySQL supports unique
721
constraint checks deferred till the end of statement. In such
722
case a commit in one of the engines may yield ER_DUP_KEY,
723
and MySQL should be able to gracefully abort statement
724
transactions of other participants.
726
After the normal transaction has been committed,
727
thd->transaction.all list is cleared.
729
When a connection is closed, the current normal transaction, if
732
Roles and responsibilities
733
--------------------------
735
The server has no way to know that an engine participates in
736
the statement and a transaction has been started
737
in it unless the engine says so. Thus, in order to be
738
a part of a transaction, the engine must "register" itself.
739
This is done by invoking trans_register_ha() server call.
740
Normally the engine registers itself whenever handler::external_lock()
741
is called. trans_register_ha() can be invoked many times: if
742
an engine is already registered, the call does nothing.
743
In case autocommit is not set, the engine must register itself
744
twice -- both in the statement list and in the normal transaction
746
In which list to register is a parameter of trans_register_ha().
748
Note, that although the registration interface in itself is
749
fairly clear, the current usage practice often leads to undesired
750
effects. E.g. since a call to trans_register_ha() in most engines
751
is embedded into implementation of handler::external_lock(), some
752
DDL statements start a transaction (at least from the server
753
point of view) even though they are not expected to. E.g.
754
CREATE TABLE does not start a transaction, since
755
handler::external_lock() is never called during CREATE TABLE. But
756
CREATE TABLE ... SELECT does, since handler::external_lock() is
757
called for the table that is being selected from. This has no
758
practical effects currently, but must be kept in mind
761
Once an engine is registered, the server will do the rest
764
During statement execution, whenever any of data-modifying
765
PSEA API methods is used, e.g. handler::write_row() or
766
handler::update_row(), the read-write flag is raised in the
767
statement transaction for the involved engine.
768
Currently all PSEA calls are "traced", and the data can not be
769
changed in a way other than issuing a PSEA call. Important:
770
unless this invariant is preserved the server will not know that
771
a transaction in a given engine is read-write and will not
772
involve the two-phase commit protocol!
774
At the end of a statement, server call
775
ha_autocommit_or_rollback() is invoked. This call in turn
776
invokes handlerton::prepare() for every involved engine.
777
Prepare is followed by a call to handlerton::commit_one_phase()
778
If a one-phase commit will suffice, handlerton::prepare() is not
779
invoked and the server only calls handlerton::commit_one_phase().
780
At statement commit, the statement-related read-write engine
781
flag is propagated to the corresponding flag in the normal
782
transaction. When the commit is complete, the list of registered
785
Rollback is handled in a similar fashion.
787
Additional notes on DDL and the normal transaction.
788
---------------------------------------------------
790
DDLs and operations with non-transactional engines
791
do not "register" in thd->transaction lists, and thus do not
792
modify the transaction state. Besides, each DDL in
793
MySQL is prefixed with an implicit normal transaction commit
794
(a call to end_active_trans()), and thus leaves nothing
796
However, as it has been pointed out with CREATE TABLE .. SELECT,
797
some DDL statements can start a *new* transaction.
799
Behaviour of the server in this case is currently badly
801
DDL statements use a form of "semantic" logging
802
to maintain atomicity: if CREATE TABLE .. SELECT failed,
803
the newly created table is deleted.
804
In addition, some DDL statements issue interim transaction
805
commits: e.g. ALTER TABLE issues a commit after data is copied
806
from the original table to the internal temporary table. Other
807
statements, e.g. CREATE TABLE ... SELECT do not always commit
809
And finally there is a group of DDL statements such as
810
RENAME/DROP TABLE that doesn't start a new transaction
813
This diversity makes it hard to say what will happen if
814
by chance a stored function is invoked during a DDL --
815
whether any modifications it makes will be committed or not
816
is not clear. Fortunately, the SQL grammar of only a few DDLs allows
817
invocation of a stored function.
819
A consistent behaviour is perhaps to always commit the normal
820
transaction after all DDLs, just like the statement transaction
821
is always committed at the end of all statements.
825
Register a storage engine for a transaction.
827
Every storage engine MUST call this function when it starts
828
a transaction or a statement (that is it must be called both for the
829
"beginning of transaction" and "beginning of statement").
830
Only storage engines registered for the transaction/statement
831
will know when to commit/rollback it.
834
trans_register_ha is idempotent - storage engine may register many
835
times per transaction.
838
void trans_register_ha(THD *thd, bool all, handlerton *ht_arg)
841
Ha_trx_info *ha_info;
842
DBUG_ENTER("trans_register_ha");
843
DBUG_PRINT("enter",("%s", all ? "all" : "stmt"));
847
trans= &thd->transaction.all;
848
thd->server_status|= SERVER_STATUS_IN_TRANS;
851
trans= &thd->transaction.stmt;
853
ha_info= thd->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
855
if (ha_info->is_started())
856
DBUG_VOID_RETURN; /* already registered, return */
858
ha_info->register_ha(trans, ht_arg);
860
trans->no_2pc|=(ht_arg->prepare==0);
861
if (thd->transaction.xid_state.xid.is_null())
862
thd->transaction.xid_state.xid.set(thd->query_id);
871
1 error, transaction was rolled back
873
int ha_prepare(THD *thd)
876
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
877
Ha_trx_info *ha_info= trans->ha_list;
878
DBUG_ENTER("ha_prepare");
881
for (; ha_info; ha_info= ha_info->next())
884
handlerton *ht= ha_info->ht();
885
status_var_increment(thd->status_var.ha_prepare_count);
888
if ((err= ht->prepare(ht, thd, all)))
890
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
891
ha_rollback_trans(thd, all);
898
push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
899
ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
900
ha_resolve_storage_engine_name(ht));
908
Check if we can skip the two-phase commit.
910
A helper function to evaluate if two-phase commit is mandatory.
911
As a side effect, propagates the read-only/read-write flags
912
of the statement transaction to its enclosing normal transaction.
914
@retval TRUE we must run a two-phase commit. Returned
915
if we have at least two engines with read-write changes.
916
@retval FALSE Don't need two-phase commit. Even if we have two
917
transactional engines, we can run two independent
918
commits if changes in one of the engines are read-only.
923
ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
926
/* The number of storage engines that have actual changes. */
927
unsigned rw_ha_count= 0;
928
Ha_trx_info *ha_info;
930
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
932
if (ha_info->is_trx_read_write())
937
Ha_trx_info *ha_info_all= &thd->ha_data[ha_info->ht()->slot].ha_info[1];
938
DBUG_ASSERT(ha_info != ha_info_all);
940
Merge read-only/read-write information about statement
941
transaction to its enclosing normal transaction. Do this
942
only if in a real transaction -- that is, if we know
943
that ha_info_all is registered in thd->transaction.all.
944
Since otherwise we only clutter the normal transaction flags.
946
if (ha_info_all->is_started()) /* FALSE if autocommit. */
947
ha_info_all->coalesce_trx_with(ha_info);
949
else if (rw_ha_count > 1)
952
It is a normal transaction, so we don't need to merge read/write
953
information up, and the need for two-phase commit has been
954
already established. Break the loop prematurely.
959
return rw_ha_count > 1;
967
1 transaction was rolled back
969
2 error during commit, data may be inconsistent
972
Since we don't support nested statement transactions in 5.0,
973
we can't commit or rollback stmt transactions while we are inside
974
stored functions or triggers. So we simply do nothing now.
975
TODO: This should be fixed in later ( >= 5.1) releases.
977
int ha_commit_trans(THD *thd, bool all)
979
int error= 0, cookie= 0;
981
'all' means that this is either an explicit commit issued by
982
user, or an implicit commit issued by a DDL.
984
THD_TRANS *trans= all ? &thd->transaction.all : &thd->transaction.stmt;
985
bool is_real_trans= all || thd->transaction.all.ha_list == 0;
986
Ha_trx_info *ha_info= trans->ha_list;
987
my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
988
DBUG_ENTER("ha_commit_trans");
991
We must not commit the normal transaction if a statement
992
transaction is pending. Otherwise statement transaction
993
flags will not get propagated to its normal transaction's
996
DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL ||
997
trans == &thd->transaction.stmt);
999
if (thd->in_sub_stmt)
1002
Since we don't support nested statement transactions in 5.0,
1003
we can't commit or rollback stmt transactions while we are inside
1004
stored functions or triggers. So we simply do nothing now.
1005
TODO: This should be fixed in later ( >= 5.1) releases.
1010
We assume that all statements which commit or rollback main transaction
1011
are prohibited inside of stored functions or triggers. So they should
1012
bail out with error even before ha_commit_trans() call. To be 100% safe
1013
let us throw error in non-debug builds.
1016
my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
1023
if (is_real_trans && wait_if_global_read_lock(thd, 0, 0))
1025
ha_rollback_trans(thd, all);
1031
&& ! thd->slave_thread
1034
my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
1035
ha_rollback_trans(thd, all);
1040
must_2pc= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
1042
if (!trans->no_2pc && must_2pc)
1044
for (; ha_info && !error; ha_info= ha_info->next())
1047
handlerton *ht= ha_info->ht();
1049
Do not call two-phase commit if this particular
1050
transaction is read-only. This allows for simpler
1051
implementation in engines that are always read-only.
1053
if (! ha_info->is_trx_read_write())
1056
Sic: we know that prepare() is not NULL since otherwise
1057
trans->no_2pc would have been set.
1059
if ((err= ht->prepare(ht, thd, all)))
1061
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1064
status_var_increment(thd->status_var.ha_prepare_count);
1066
DBUG_EXECUTE_IF("crash_commit_after_prepare", abort(););
1067
if (error || (is_real_trans && xid &&
1068
(error= !(cookie= tc_log->log_xid(thd, xid)))))
1070
ha_rollback_trans(thd, all);
1074
DBUG_EXECUTE_IF("crash_commit_after_log", abort(););
1076
error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
1077
DBUG_EXECUTE_IF("crash_commit_before_unlog", abort(););
1079
tc_log->unlog(cookie, xid);
1080
DBUG_EXECUTE_IF("crash_commit_after", abort(););
1083
start_waiting_global_read_lock(thd);
1090
This function does not care about global read lock. A caller should.
1092
int ha_commit_one_phase(THD *thd, bool all)
1095
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1096
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1097
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1098
DBUG_ENTER("ha_commit_one_phase");
1101
for (; ha_info; ha_info= ha_info_next)
1104
handlerton *ht= ha_info->ht();
1105
if ((err= ht->commit(ht, thd, all)))
1107
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1110
status_var_increment(thd->status_var.ha_commit_count);
1111
ha_info_next= ha_info->next();
1112
ha_info->reset(); /* keep it conveniently zero-filled */
1117
thd->transaction.xid_state.xid.null();
1120
thd->variables.tx_isolation=thd->session_tx_isolation;
1121
thd->transaction.cleanup();
1128
int ha_rollback_trans(THD *thd, bool all)
1131
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1132
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1133
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1134
DBUG_ENTER("ha_rollback_trans");
1137
We must not rollback the normal transaction if a statement
1138
transaction is pending.
1140
DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL ||
1141
trans == &thd->transaction.stmt);
1143
if (thd->in_sub_stmt)
1146
If we are inside stored function or trigger we should not commit or
1147
rollback current statement transaction. See comment in ha_commit_trans()
1148
call for more information.
1153
my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
1158
for (; ha_info; ha_info= ha_info_next)
1161
handlerton *ht= ha_info->ht();
1162
if ((err= ht->rollback(ht, thd, all)))
1164
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1167
status_var_increment(thd->status_var.ha_rollback_count);
1168
ha_info_next= ha_info->next();
1169
ha_info->reset(); /* keep it conveniently zero-filled */
1174
thd->transaction.xid_state.xid.null();
1177
thd->variables.tx_isolation=thd->session_tx_isolation;
1178
thd->transaction.cleanup();
1182
thd->transaction_rollback_request= FALSE;
1185
If a non-transactional table was updated, warn; don't warn if this is a
1186
slave thread (because when a slave thread executes a ROLLBACK, it has
1187
been read from the binary log, so it's 100% sure and normal to produce
1188
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
1189
slave SQL thread, it would not stop the thread but just be printed in
1190
the error log; but we don't want users to wonder why they have this
1191
message in the error log, so we don't send it.
1193
if (is_real_trans && thd->transaction.all.modified_non_trans_table &&
1194
!thd->slave_thread && thd->killed != THD::KILL_CONNECTION)
1195
push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
1196
ER_WARNING_NOT_COMPLETE_ROLLBACK,
1197
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
1202
This is used to commit or rollback a single statement depending on
1206
Note that if the autocommit is on, then the following call inside
1207
InnoDB will commit or rollback the whole transaction (= the statement). The
1208
autocommit mechanism built into InnoDB is based on counting locks, but if
1209
the user has used LOCK TABLES then that mechanism does not know to do the
1212
int ha_autocommit_or_rollback(THD *thd, int error)
1214
DBUG_ENTER("ha_autocommit_or_rollback");
1215
if (thd->transaction.stmt.ha_list)
1219
if (ha_commit_trans(thd, 0))
1224
(void) ha_rollback_trans(thd, 0);
1225
if (thd->transaction_rollback_request && !thd->in_sub_stmt)
1226
(void) ha_rollback(thd);
1229
thd->variables.tx_isolation=thd->session_tx_isolation;
1240
static my_bool xacommit_handlerton(THD *unused1, plugin_ref plugin,
1243
handlerton *hton= plugin_data(plugin, handlerton *);
1244
if (hton->state == SHOW_OPTION_YES && hton->recover)
1246
hton->commit_by_xid(hton, ((struct xahton_st *)arg)->xid);
1247
((struct xahton_st *)arg)->result= 0;
1252
static my_bool xarollback_handlerton(THD *unused1, plugin_ref plugin,
1255
handlerton *hton= plugin_data(plugin, handlerton *);
1256
if (hton->state == SHOW_OPTION_YES && hton->recover)
1258
hton->rollback_by_xid(hton, ((struct xahton_st *)arg)->xid);
1259
((struct xahton_st *)arg)->result= 0;
1265
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
1267
struct xahton_st xaop;
1271
plugin_foreach(NULL, commit ? xacommit_handlerton : xarollback_handlerton,
1272
MYSQL_STORAGE_ENGINE_PLUGIN, &xaop);
1281
This does not need to be multi-byte safe or anything
1283
static char* xid_to_str(char *buf, XID *xid)
1288
for (i=0; i < xid->gtrid_length+xid->bqual_length; i++)
1290
uchar c=(uchar)xid->data[i];
1291
/* is_next_dig is set if next character is a number */
1292
bool is_next_dig= FALSE;
1293
if (i < XIDDATASIZE)
1295
char ch= xid->data[i+1];
1296
is_next_dig= (ch >= '0' && ch <='9');
1298
if (i == xid->gtrid_length)
1301
if (xid->bqual_length)
1307
if (c < 32 || c > 126)
1311
If next character is a number, write current character with
1312
3 octal numbers to ensure that the next number is not seen
1313
as part of the octal number
1315
if (c > 077 || is_next_dig)
1316
*s++=_dig_vec_lower[c >> 6];
1317
if (c > 007 || is_next_dig)
1318
*s++=_dig_vec_lower[(c >> 3) & 7];
1319
*s++=_dig_vec_lower[c & 7];
1323
if (c == '\'' || c == '\\')
1335
recover() step of xa.
1338
there are three modes of operation:
1339
- automatic recover after a crash
1340
in this case commit_list != 0, tc_heuristic_recover==0
1341
all xids from commit_list are committed, others are rolled back
1342
- manual (heuristic) recover
1343
in this case commit_list==0, tc_heuristic_recover != 0
1344
DBA has explicitly specified that all prepared transactions should
1345
be committed (or rolled back).
1346
- no recovery (MySQL did not detect a crash)
1347
in this case commit_list==0, tc_heuristic_recover == 0
1348
there should be no prepared transactions in this case.
1352
int len, found_foreign_xids, found_my_xids;
1358
static my_bool xarecover_handlerton(THD *unused, plugin_ref plugin,
1361
handlerton *hton= plugin_data(plugin, handlerton *);
1362
struct xarecover_st *info= (struct xarecover_st *) arg;
1365
if (hton->state == SHOW_OPTION_YES && hton->recover)
1367
while ((got= hton->recover(hton, info->list, info->len)) > 0 )
1369
sql_print_information("Found %d prepared transaction(s) in %s",
1370
got, ha_resolve_storage_engine_name(hton));
1371
for (int i=0; i < got; i ++)
1373
my_xid x=info->list[i].get_my_xid();
1374
if (!x) // not "mine" - that is generated by external TM
1377
char buf[XIDDATASIZE*4+6]; // see xid_to_str
1378
sql_print_information("ignore xid %s", xid_to_str(buf, info->list+i));
1380
xid_cache_insert(info->list+i, XA_PREPARED);
1381
info->found_foreign_xids++;
1386
info->found_my_xids++;
1390
if (info->commit_list ?
1391
hash_search(info->commit_list, (uchar *)&x, sizeof(x)) != 0 :
1392
tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT)
1395
char buf[XIDDATASIZE*4+6]; // see xid_to_str
1396
sql_print_information("commit xid %s", xid_to_str(buf, info->list+i));
1398
hton->commit_by_xid(hton, info->list+i);
1403
char buf[XIDDATASIZE*4+6]; // see xid_to_str
1404
sql_print_information("rollback xid %s",
1405
xid_to_str(buf, info->list+i));
1407
hton->rollback_by_xid(hton, info->list+i);
1410
if (got < info->len)
1417
int ha_recover(HASH *commit_list)
1419
struct xarecover_st info;
1420
DBUG_ENTER("ha_recover");
1421
info.found_foreign_xids= info.found_my_xids= 0;
1422
info.commit_list= commit_list;
1423
info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
1426
/* commit_list and tc_heuristic_recover cannot be set both */
1427
DBUG_ASSERT(info.commit_list==0 || tc_heuristic_recover==0);
1428
/* if either is set, total_ha_2pc must be set too */
1429
DBUG_ASSERT(info.dry_run || total_ha_2pc>(ulong)opt_bin_log);
1431
if (total_ha_2pc <= (ulong)opt_bin_log)
1434
if (info.commit_list)
1435
sql_print_information("Starting crash recovery...");
1438
#ifndef WILL_BE_DELETED_LATER
1441
for now, only InnoDB supports 2pc. It means we can always safely
1442
rollback all pending transactions, without risking inconsistent data
1445
DBUG_ASSERT(total_ha_2pc == (ulong) opt_bin_log+1); // only InnoDB and binlog
1446
tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
1451
for (info.len= MAX_XID_LIST_SIZE ;
1452
info.list==0 && info.len > MIN_XID_LIST_SIZE; info.len/=2)
1454
info.list=(XID *)my_malloc(info.len*sizeof(XID), MYF(0));
1458
sql_print_error(ER(ER_OUTOFMEMORY), info.len*sizeof(XID));
1462
plugin_foreach(NULL, xarecover_handlerton,
1463
MYSQL_STORAGE_ENGINE_PLUGIN, &info);
1465
my_free((uchar*)info.list, MYF(0));
1466
if (info.found_foreign_xids)
1467
sql_print_warning("Found %d prepared XA transactions",
1468
info.found_foreign_xids);
1469
if (info.dry_run && info.found_my_xids)
1471
sql_print_error("Found %d prepared transactions! It means that mysqld was "
1472
"not shut down properly last time and critical recovery "
1473
"information (last binlog or %s file) was manually deleted "
1474
"after a crash. You have to start mysqld with "
1475
"--tc-heuristic-recover switch to commit or rollback "
1476
"pending transactions.",
1477
info.found_my_xids, opt_tc_log_file);
1480
if (info.commit_list)
1481
sql_print_information("Crash recovery finished.");
1486
return the list of XID's to a client, the same way SHOW commands do.
1489
I didn't find in XA specs that an RM cannot return the same XID twice,
1490
so mysql_xa_recover does not filter XID's to ensure uniqueness.
1491
It can be easily fixed later, if necessary.
1493
bool mysql_xa_recover(THD *thd)
1495
List<Item> field_list;
1496
Protocol *protocol= thd->protocol;
1499
DBUG_ENTER("mysql_xa_recover");
1501
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1502
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1503
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1504
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1506
if (protocol->send_fields(&field_list,
1507
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1510
pthread_mutex_lock(&LOCK_xid_cache);
1511
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1513
if (xs->xa_state==XA_PREPARED)
1515
protocol->prepare_for_resend();
1516
protocol->store_longlong((longlong)xs->xid.formatID, FALSE);
1517
protocol->store_longlong((longlong)xs->xid.gtrid_length, FALSE);
1518
protocol->store_longlong((longlong)xs->xid.bqual_length, FALSE);
1519
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1521
if (protocol->write())
1523
pthread_mutex_unlock(&LOCK_xid_cache);
1529
pthread_mutex_unlock(&LOCK_xid_cache);
1536
This function should be called when MySQL sends rows of a SELECT result set
1537
or the EOF mark to the client. It releases a possible adaptive hash index
1538
S-latch held by thd in InnoDB and also releases a possible InnoDB query
1539
FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a thd to
1540
keep them over several calls of the InnoDB handler interface when a join
1541
is executed. But when we let control pass to the client they have
1542
to be released because if the application program uses mysql_use_result(),
1543
it may deadlock on the S-latch if the application on another connection
1544
performs another SQL query. In MySQL-4.1 this is even more important because
1545
there a connection can have several SELECT queries open at the same time.
1547
@param thd the thread handle of the current connection
1552
static my_bool release_temporary_latches(THD *thd, plugin_ref plugin,
1555
handlerton *hton= plugin_data(plugin, handlerton *);
1557
if (hton->state == SHOW_OPTION_YES && hton->release_temporary_latches)
1558
hton->release_temporary_latches(hton, thd);
1564
int ha_release_temporary_latches(THD *thd)
1566
plugin_foreach(thd, release_temporary_latches, MYSQL_STORAGE_ENGINE_PLUGIN,
1572
int ha_rollback_to_savepoint(THD *thd, SAVEPOINT *sv)
1575
THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
1576
&thd->transaction.all);
1577
Ha_trx_info *ha_info, *ha_info_next;
1579
DBUG_ENTER("ha_rollback_to_savepoint");
1583
rolling back to savepoint in all storage engines that were part of the
1584
transaction when the savepoint was set
1586
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
1589
handlerton *ht= ha_info->ht();
1591
DBUG_ASSERT(ht->savepoint_set != 0);
1592
if ((err= ht->savepoint_rollback(ht, thd,
1593
(uchar *)(sv+1)+ht->savepoint_offset)))
1595
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1598
status_var_increment(thd->status_var.ha_savepoint_rollback_count);
1599
trans->no_2pc|= ht->prepare == 0;
1602
rolling back the transaction in all storage engines that were not part of
1603
the transaction when the savepoint was set
1605
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
1606
ha_info= ha_info_next)
1609
handlerton *ht= ha_info->ht();
1610
if ((err= ht->rollback(ht, thd, !thd->in_sub_stmt)))
1612
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1615
status_var_increment(thd->status_var.ha_rollback_count);
1616
ha_info_next= ha_info->next();
1617
ha_info->reset(); /* keep it conveniently zero-filled */
1619
trans->ha_list= sv->ha_list;
1625
according to the sql standard (ISO/IEC 9075-2:2003)
1626
section "4.33.4 SQL-statements and transaction states",
1627
SAVEPOINT is *not* transaction-initiating SQL-statement
1629
int ha_savepoint(THD *thd, SAVEPOINT *sv)
1632
THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
1633
&thd->transaction.all);
1634
Ha_trx_info *ha_info= trans->ha_list;
1635
DBUG_ENTER("ha_savepoint");
1636
for (; ha_info; ha_info= ha_info->next())
1639
handlerton *ht= ha_info->ht();
1641
if (! ht->savepoint_set)
1643
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1647
if ((err= ht->savepoint_set(ht, thd, (uchar *)(sv+1)+ht->savepoint_offset)))
1649
my_error(ER_GET_ERRNO, MYF(0), err);
1652
status_var_increment(thd->status_var.ha_savepoint_count);
1655
Remember the list of registered storage engines. All new
1656
engines are prepended to the beginning of the list.
1658
sv->ha_list= trans->ha_list;
1662
int ha_release_savepoint(THD *thd, SAVEPOINT *sv)
1665
Ha_trx_info *ha_info= sv->ha_list;
1666
DBUG_ENTER("ha_release_savepoint");
1668
for (; ha_info; ha_info= ha_info->next())
1671
handlerton *ht= ha_info->ht();
1672
/* Savepoint life time is enclosed into transaction life time. */
1674
if (!ht->savepoint_release)
1676
if ((err= ht->savepoint_release(ht, thd,
1677
(uchar *)(sv+1) + ht->savepoint_offset)))
1679
my_error(ER_GET_ERRNO, MYF(0), err);
1687
static my_bool snapshot_handlerton(THD *thd, plugin_ref plugin,
1690
handlerton *hton= plugin_data(plugin, handlerton *);
1691
if (hton->state == SHOW_OPTION_YES &&
1692
hton->start_consistent_snapshot)
1694
hton->start_consistent_snapshot(hton, thd);
1695
*((bool *)arg)= false;
1700
int ha_start_consistent_snapshot(THD *thd)
1704
plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn);
1707
Same idea as when one wants to CREATE TABLE in one engine which does not
1711
push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
1712
"This MySQL server does not support any "
1713
"consistent-read capable storage engine");
1718
static my_bool flush_handlerton(THD *thd, plugin_ref plugin,
1721
handlerton *hton= plugin_data(plugin, handlerton *);
1722
if (hton->state == SHOW_OPTION_YES && hton->flush_logs &&
1723
hton->flush_logs(hton))
1729
bool ha_flush_logs(handlerton *db_type)
1731
if (db_type == NULL)
1733
if (plugin_foreach(NULL, flush_handlerton,
1734
MYSQL_STORAGE_ENGINE_PLUGIN, 0))
1739
if (db_type->state != SHOW_OPTION_YES ||
1740
(db_type->flush_logs && db_type->flush_logs(db_type)))
1746
static const char *check_lowercase_names(handler *file, const char *path,
1749
if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1752
/* Ensure that table handler get path in lower case */
1753
if (tmp_path != path)
1754
strmov(tmp_path, path);
1757
we only should turn into lowercase database/table part
1758
so start the process after homedirectory
1760
my_casedn_str(files_charset_info, tmp_path + mysql_data_home_len);
1766
An interceptor to hijack the text of the error message without
1767
setting an error in the thread. We need the text to present it
1768
in the form of a warning to the user.
1771
struct Ha_delete_table_error_handler: public Internal_error_handler
1774
virtual bool handle_error(uint sql_errno,
1775
const char *message,
1776
MYSQL_ERROR::enum_warning_level level,
1778
char buff[MYSQL_ERRMSG_SIZE];
1783
Ha_delete_table_error_handler::
1784
handle_error(uint sql_errno,
1785
const char *message,
1786
MYSQL_ERROR::enum_warning_level level,
1789
/* Grab the error message */
1790
strmake(buff, message, sizeof(buff)-1);
1796
This should return ENOENT if the file doesn't exist.
1797
The .frm file will be deleted only if we return 0 or ENOENT
1799
int ha_delete_table(THD *thd, handlerton *table_type, const char *path,
1800
const char *db, const char *alias, bool generate_warning)
1803
char tmp_path[FN_REFLEN];
1806
TABLE_SHARE dummy_share;
1807
DBUG_ENTER("ha_delete_table");
1809
bzero((char*) &dummy_table, sizeof(dummy_table));
1810
bzero((char*) &dummy_share, sizeof(dummy_share));
1811
dummy_table.s= &dummy_share;
1813
/* DB_TYPE_UNKNOWN is used in ALTER TABLE when renaming only .frm files */
1814
if (table_type == NULL ||
1815
! (file=get_new_handler((TABLE_SHARE*)0, thd->mem_root, table_type)))
1816
DBUG_RETURN(ENOENT);
1818
path= check_lowercase_names(file, path, tmp_path);
1819
if ((error= file->ha_delete_table(path)) && generate_warning)
1822
Because file->print_error() uses my_error() to generate the error message
1823
we use an internal error handler to intercept it and store the text
1824
in a temporary buffer. Later the message will be presented to user
1827
Ha_delete_table_error_handler ha_delete_table_error_handler;
1829
/* Fill up structures that print_error may need */
1830
dummy_share.path.str= (char*) path;
1831
dummy_share.path.length= strlen(path);
1832
dummy_share.db.str= (char*) db;
1833
dummy_share.db.length= strlen(db);
1834
dummy_share.table_name.str= (char*) alias;
1835
dummy_share.table_name.length= strlen(alias);
1836
dummy_table.alias= alias;
1838
file->change_table_ptr(&dummy_table, &dummy_share);
1840
thd->push_internal_handler(&ha_delete_table_error_handler);
1841
file->print_error(error, 0);
1843
thd->pop_internal_handler();
1846
XXX: should we convert *all* errors to warnings here?
1847
What if the error is fatal?
1849
push_warning(thd, MYSQL_ERROR::WARN_LEVEL_ERROR, error,
1850
ha_delete_table_error_handler.buff);
1856
/****************************************************************************
1857
** General handler functions
1858
****************************************************************************/
1859
/**
  Create and open a second handler object for the table served by this
  handler.

  @param mem_root  memory root on which the new handler object and its
                   ref buffer are allocated

  @return the opened clone, or NULL if the handler could not be created,
          the ref buffer could not be allocated, or ha_open() reported
          an error
*/
handler *handler::clone(MEM_ROOT *mem_root)
{
  handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
  /*
    Check the allocation before touching the object: previously
    new_handler->ref was assigned before new_handler was tested for NULL.
  */
  if (!new_handler)
    return NULL;
  /*
    Allocate handler->ref here because otherwise ha_open will allocate it
    on this->table->mem_root and we will not be able to reclaim that memory
    when the clone handler object is destroyed.
  */
  if (!(new_handler->ref= (uchar*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
    return NULL;
  /* ha_open() returns 0 on success */
  if (!new_handler->ha_open(table,
                            table->s->normalized_path.str,
                            table->db_stat,
                            HA_OPEN_IGNORE_IF_LOCKED))
    return new_handler;
  return NULL;
}
1879
/**
  Increment one status counter of the connection that is using this
  handler's table.

  @param offset  pointer-to-member selecting the counter inside
                 table->in_use->status_var
*/
void handler::ha_statistic_increment(ulong SSV::*offset) const
{
  status_var_increment(table->in_use->status_var.*offset);
}
1884
/**
  Look up the data slot kept for this handler's storage engine (ht) in
  the given connection.

  @param thd  connection whose slot is requested

  @return the result of thd_ha_data(thd, ht)
*/
void **handler::ha_data(THD *thd) const
{
  return thd_ha_data(thd, ht);
}
1889
/**
  Return the connection this handler is working for.

  @return table->in_use when a table is attached and it has a user,
          current_thd otherwise
*/
THD *handler::ha_thd(void) const
{
  /* when the table has a user it is expected to be the current thread */
  DBUG_ASSERT(!table || !table->in_use || table->in_use == current_thd);
  return (table && table->in_use) ? table->in_use : current_thd;
}
1897
Get tablespace name from handler
1898
Returns the tablespace name associated
1899
with the table or NULL if not defined
1902
char* handler::get_tablespace_name()
1904
return table->s->tablespace;
1908
Open database-handler.
1910
Try O_RDONLY if cannot open as O_RDWR
1911
Don't wait for locks if not HA_OPEN_WAIT_IF_LOCKED is set
1913
int handler::ha_open(TABLE *table_arg, const char *name, int mode,
1917
DBUG_ENTER("handler::ha_open");
1919
("name: %s db_type: %d db_stat: %d mode: %d lock_test: %d",
1920
name, ht->db_type, table_arg->db_stat, mode,
1924
DBUG_ASSERT(table->s == table_share);
1925
DBUG_ASSERT(alloc_root_inited(&table->mem_root));
1927
if ((error=open(name,mode,test_if_locked)))
1929
if ((error == EACCES || error == EROFS) && mode == O_RDWR &&
1930
(table->db_stat & HA_TRY_READ_ONLY))
1932
table->db_stat|=HA_READ_ONLY;
1933
error=open(name,O_RDONLY,test_if_locked);
1938
my_errno= error; /* Safeguard */
1939
DBUG_PRINT("error",("error: %d errno: %d",error,errno));
1943
if (table->s->db_options_in_use & HA_OPTION_READ_ONLY_DATA)
1944
table->db_stat|=HA_READ_ONLY;
1945
(void) extra(HA_EXTRA_NO_READCHECK); // Not needed in SQL
1947
/* ref is already allocated for us if we're called from handler::clone() */
1948
if (!ref && !(ref= (uchar*) alloc_root(&table->mem_root,
1949
ALIGN_SIZE(ref_length)*2)))
1952
error=HA_ERR_OUT_OF_MEM;
1955
dup_ref=ref+ALIGN_SIZE(ref_length);
1956
cached_table_flags= table_flags();
1962
one has to use this method when to find
1963
random position by record as the plain
1964
position() call doesn't work for some
1965
handlers for random position
1968
int handler::rnd_pos_by_record(uchar *record)
1971
DBUG_ENTER("handler::rnd_pos_by_record");
1974
if (inited && (error= ha_index_end()))
1976
if ((error= ha_rnd_init(FALSE)))
1979
DBUG_RETURN(rnd_pos(record, ref));
1983
Read first row (only) from a table.
1985
This is never called for InnoDB tables, as these table types
1986
has the HA_STATS_RECORDS_IS_EXACT set.
1988
int handler::read_first_row(uchar * buf, uint primary_key)
1991
DBUG_ENTER("handler::read_first_row");
1993
ha_statistic_increment(&SSV::ha_read_first_count);
1996
If there is very few deleted rows in the table, find the first row by
1998
TODO remove the test for HA_READ_ORDER
2000
if (stats.deleted < 10 || primary_key >= MAX_KEY ||
2001
!(index_flags(primary_key, 0, 0) & HA_READ_ORDER))
2003
(void) ha_rnd_init(1);
2004
while ((error= rnd_next(buf)) == HA_ERR_RECORD_DELETED) ;
2005
(void) ha_rnd_end();
2009
/* Find the first row through the primary key */
2010
(void) ha_index_init(primary_key, 0);
2011
error=index_first(buf);
2012
(void) ha_index_end();
2018
Generate the next auto-increment number based on increment and offset.
2019
computes the lowest number
2020
- strictly greater than "nr"
2021
- of the form: auto_increment_offset + N * auto_increment_increment
2023
In most cases increment= offset= 1, in which case we get:
2024
@verbatim 1,2,3,4,5,... @endverbatim
2025
If increment=10 and offset=5 and previous number is 1, we get:
2026
@verbatim 1,5,15,25,35,... @endverbatim
2029
compute_next_insert_id(uint64_t nr,struct system_variables *variables)
2031
if (variables->auto_increment_increment == 1)
2032
return (nr+1); // optimization of the formula below
2033
nr= (((nr+ variables->auto_increment_increment -
2034
variables->auto_increment_offset)) /
2035
(uint64_t) variables->auto_increment_increment);
2036
return (nr* (uint64_t) variables->auto_increment_increment +
2037
variables->auto_increment_offset);
2041
/**
  Move next_insert_id past an explicitly supplied auto-increment value.

  @param nr  the explicit value about to be inserted
*/
void handler::adjust_next_insert_id_after_explicit_value(uint64_t nr)
{
  /*
    If we have set THD::next_insert_id previously and plan to insert an
    explicitly-specified value larger than this, we need to increase
    THD::next_insert_id to be greater than the explicit value.
  */
  if ((next_insert_id > 0) && (nr >= next_insert_id))
    set_next_insert_id(compute_next_insert_id(nr, &table->in_use->variables));
}
2054
Compute a previous insert id
2056
Computes the largest number X:
2057
- smaller than or equal to "nr"
2058
- of the form: auto_increment_offset + N * auto_increment_increment
2061
@param nr Number to "round down"
2062
@param variables variables struct containing auto_increment_increment and
2063
auto_increment_offset
2066
The number X if it exists, "nr" otherwise.
2069
prev_insert_id(uint64_t nr, struct system_variables *variables)
2071
if (unlikely(nr < variables->auto_increment_offset))
2074
There's nothing good we can do here. That is a pathological case, where
2075
the offset is larger than the column's max possible value, i.e. not even
2076
the first sequence value may be inserted. User will receive warning.
2078
DBUG_PRINT("info",("auto_increment: nr: %lu cannot honour "
2079
"auto_increment_offset: %lu",
2080
(ulong) nr, variables->auto_increment_offset));
2083
if (variables->auto_increment_increment == 1)
2084
return nr; // optimization of the formula below
2085
nr= (((nr - variables->auto_increment_offset)) /
2086
(uint64_t) variables->auto_increment_increment);
2087
return (nr * (uint64_t) variables->auto_increment_increment +
2088
variables->auto_increment_offset);
2093
Update the auto_increment field if necessary.
2095
Updates columns with type NEXT_NUMBER if:
2097
- If column value is set to NULL (in which case
2098
auto_increment_field_not_null is 0)
2099
- If column is set to 0 and (sql_mode & MODE_NO_AUTO_VALUE_ON_ZERO) is not
2100
set. In the future we will only set NEXT_NUMBER fields if one sets them
2101
to NULL (or they are not included in the insert list).
2103
In those cases, we check if the currently reserved interval still has
2104
values we have not used. If yes, we pick the smallest one and use it.
2107
- If a list of intervals has been provided to the statement via SET
2108
INSERT_ID or via an Intvar_log_event (in a replication slave), we pick the
2109
first unused interval from this list, consider it as reserved.
2111
- Otherwise we set the column for the first row to the value
2112
next_insert_id(get_auto_increment(column))) which is usually
2113
max-used-column-value+1.
2114
We call get_auto_increment() for the first row in a multi-row
2115
statement. get_auto_increment() will tell us the interval of values it
2118
- In both cases, for the following rows we use those reserved values without
2119
calling the handler again (we just progress in the interval, computing
2120
each new value from the previous one). Until we have exhausted them, then
2121
we either take the next provided interval or call get_auto_increment()
2122
again to reserve a new interval.
2124
- In both cases, the reserved intervals are remembered in
2125
thd->auto_inc_intervals_in_cur_stmt_for_binlog if statement-based
2126
binlogging; the last reserved interval is remembered in
2127
auto_inc_interval_for_cur_row.
2129
The idea is that generated auto_increment values are predictable and
2130
independent of the column values in the table. This is needed to be
2131
able to replicate into a table that already has rows with a higher
2132
auto-increment value than the one that is inserted.
2134
After we have already generated an auto-increment number and the user
2135
inserts a column with a higher value than the last used one, we will
2136
start counting from the inserted value.
2138
This function's "outputs" are: the table's auto_increment field is filled
2139
with a value, thd->next_insert_id is filled with the value to use for the
2140
next row, if a value was autogenerated for the current row it is stored in
2141
thd->insert_id_for_cur_row, if get_auto_increment() was called
2142
thd->auto_inc_interval_for_cur_row is modified, if that interval is not
2143
present in thd->auto_inc_intervals_in_cur_stmt_for_binlog it is added to
2147
Replace all references to "next number" or NEXT_NUMBER to
2148
"auto_increment", everywhere (see below: there is
2149
table->auto_increment_field_not_null, and there also exists
2150
table->next_number_field, it's not consistent).
2155
HA_ERR_AUTOINC_READ_FAILED get_auto_increment() was called and
2156
returned ~(uint64_t) 0
2158
HA_ERR_AUTOINC_ERANGE storing value in field caused strict mode
2162
#define AUTO_INC_DEFAULT_NB_ROWS 1 // Some prefer 1024 here
2163
#define AUTO_INC_DEFAULT_NB_MAX_BITS 16
2164
#define AUTO_INC_DEFAULT_NB_MAX ((1 << AUTO_INC_DEFAULT_NB_MAX_BITS) - 1)
2166
int handler::update_auto_increment()
2168
uint64_t nr, nb_reserved_values;
2170
THD *thd= table->in_use;
2171
struct system_variables *variables= &thd->variables;
2172
DBUG_ENTER("handler::update_auto_increment");
2175
next_insert_id is a "cursor" into the reserved interval, it may go greater
2176
than the interval, but not smaller.
2178
DBUG_ASSERT(next_insert_id >= auto_inc_interval_for_cur_row.minimum());
2180
if (((nr= table->next_number_field->val_int()) != 0) ||
2181
(table->auto_increment_field_not_null && (thd->variables.sql_mode & MODE_NO_AUTO_VALUE_ON_ZERO)))
2184
Update next_insert_id if we had already generated a value in this
2185
statement (case of INSERT VALUES(null),(3763),(null):
2186
the last NULL needs to insert 3764, not the value of the first NULL plus
2189
adjust_next_insert_id_after_explicit_value(nr);
2190
insert_id_for_cur_row= 0; // didn't generate anything
2194
if ((nr= next_insert_id) >= auto_inc_interval_for_cur_row.maximum())
2196
/* next_insert_id is beyond what is reserved, so we reserve more. */
2197
const Discrete_interval *forced=
2198
thd->auto_inc_intervals_forced.get_next();
2201
nr= forced->minimum();
2202
nb_reserved_values= forced->values();
2207
handler::estimation_rows_to_insert was set by
2208
handler::ha_start_bulk_insert(); if 0 it means "unknown".
2210
uint nb_already_reserved_intervals=
2211
thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements();
2212
uint64_t nb_desired_values;
2214
If an estimation was given to the engine:
2216
- if we already reserved numbers, it means the estimation was
2217
not accurate, then we'll reserve 2*AUTO_INC_DEFAULT_NB_ROWS the 2nd
2218
time, twice that the 3rd time etc.
2219
If no estimation was given, use those increasing defaults from the
2220
start, starting from AUTO_INC_DEFAULT_NB_ROWS.
2221
Don't go beyond a max to not reserve "way too much" (because
2222
reservation means potentially losing unused values).
2224
if (nb_already_reserved_intervals == 0 &&
2225
(estimation_rows_to_insert > 0))
2226
nb_desired_values= estimation_rows_to_insert;
2227
else /* go with the increasing defaults */
2229
/* avoid overflow in formula, with this if() */
2230
if (nb_already_reserved_intervals <= AUTO_INC_DEFAULT_NB_MAX_BITS)
2232
nb_desired_values= AUTO_INC_DEFAULT_NB_ROWS *
2233
(1 << nb_already_reserved_intervals);
2234
set_if_smaller(nb_desired_values, AUTO_INC_DEFAULT_NB_MAX);
2237
nb_desired_values= AUTO_INC_DEFAULT_NB_MAX;
2239
/* This call ignores all its parameters but nr, currently */
2240
get_auto_increment(variables->auto_increment_offset,
2241
variables->auto_increment_increment,
2242
nb_desired_values, &nr,
2243
&nb_reserved_values);
2244
if (nr == ~(uint64_t) 0)
2245
DBUG_RETURN(HA_ERR_AUTOINC_READ_FAILED); // Mark failure
2248
That rounding below should not be needed when all engines actually
2249
respect offset and increment in get_auto_increment(). But they don't
2250
so we still do it. Wonder if for the not-first-in-index we should do
2251
it. Hope that this rounding didn't push us out of the interval; even
2252
if it did we cannot do anything about it (calling the engine again
2253
will not help as we inserted no row).
2255
nr= compute_next_insert_id(nr-1, variables);
2258
if (table->s->next_number_keypart == 0)
2260
/* We must defer the appending until "nr" has been possibly truncated */
2266
For such auto_increment there is no notion of interval, just a
2267
singleton. The interval is not even stored in
2268
thd->auto_inc_interval_for_cur_row, so we are sure to call the engine
2271
DBUG_PRINT("info",("auto_increment: special not-first-in-index"));
2275
DBUG_PRINT("info",("auto_increment: %lu", (ulong) nr));
2277
if (unlikely(table->next_number_field->store((longlong) nr, TRUE)))
2280
first test if the query was aborted due to strict mode constraints
2282
if (thd->killed == THD::KILL_BAD_DATA)
2283
DBUG_RETURN(HA_ERR_AUTOINC_ERANGE);
2286
field refused this value (overflow) and truncated it, use the result of
2287
the truncation (which is going to be inserted); however we try to
2288
decrease it to honour auto_increment_* variables.
2289
That will shift the left bound of the reserved interval, we don't
2290
bother shifting the right bound (anyway any other value from this
2291
interval will cause a duplicate key).
2293
nr= prev_insert_id(table->next_number_field->val_int(), variables);
2294
if (unlikely(table->next_number_field->store((longlong) nr, TRUE)))
2295
nr= table->next_number_field->val_int();
2299
auto_inc_interval_for_cur_row.replace(nr, nb_reserved_values,
2300
variables->auto_increment_increment);
2301
/* Row-based replication does not need to store intervals in binlog */
2302
if (!thd->current_stmt_binlog_row_based)
2303
thd->auto_inc_intervals_in_cur_stmt_for_binlog.append(auto_inc_interval_for_cur_row.minimum(),
2304
auto_inc_interval_for_cur_row.values(),
2305
variables->auto_increment_increment);
2309
Record this autogenerated value. If the caller then
2310
succeeds to insert this value, it will call
2311
record_first_successful_insert_id_in_cur_stmt()
2312
which will set first_successful_insert_id_in_cur_stmt if it's not
2315
insert_id_for_cur_row= nr;
2317
Set next insert id to point to next auto-increment value to be able to
2318
handle multi-row statements.
2320
set_next_insert_id(compute_next_insert_id(nr, variables));
2327
MySQL signal that it changed the column bitmap
2329
This is for handlers that needs to setup their own column bitmaps.
2330
Normally the handler should set up their own column bitmaps in
2331
index_init() or rnd_init() and in any column_bitmaps_signal() call after
2334
The handler is allowed to do changes to the bitmap after a index_init or
2335
rnd_init() call is made as after this, MySQL will not use the bitmap
2336
for any program logic checking.
2338
void handler::column_bitmaps_signal()
2340
DBUG_ENTER("column_bitmaps_signal");
2341
DBUG_PRINT("info", ("read_set: 0x%lx write_set: 0x%lx", (long) table->read_set,
2342
(long) table->write_set));
2348
Reserves an interval of auto_increment values from the handler.
2350
offset and increment means that we want values to be of the form
2351
offset + N * increment, where N>=0 is integer.
2352
If the function sets *first_value to ~(uint64_t)0 it means an error.
2353
If the function sets *nb_reserved_values to ULONGLONG_MAX it means it has
2354
reserved to "positive infinite".
2358
@param nb_desired_values how many values we want
2359
@param first_value (OUT) the first value reserved by the handler
2360
@param nb_reserved_values (OUT) how many values the handler reserved
2362
void handler::get_auto_increment(uint64_t offset, uint64_t increment,
2363
uint64_t nb_desired_values,
2364
uint64_t *first_value,
2365
uint64_t *nb_reserved_values)
2370
(void) extra(HA_EXTRA_KEYREAD);
2371
table->mark_columns_used_by_index_no_reset(table->s->next_number_index,
2373
column_bitmaps_signal();
2374
index_init(table->s->next_number_index, 1);
2375
if (table->s->next_number_keypart == 0)
2376
{ // Autoincrement at key-start
2377
error=index_last(table->record[1]);
2379
MySQL implicitly assumes such method does locking (as MySQL decides to
2380
use nr+increment without checking again with the handler, in
2381
handler::update_auto_increment()), so reserves to infinite.
2383
*nb_reserved_values= ULONGLONG_MAX;
2387
uchar key[MAX_KEY_LENGTH];
2388
key_copy(key, table->record[0],
2389
table->key_info + table->s->next_number_index,
2390
table->s->next_number_key_offset);
2391
error= index_read_map(table->record[1], key,
2392
make_prev_keypart_map(table->s->next_number_keypart),
2393
HA_READ_PREFIX_LAST);
2395
MySQL needs to call us for next row: assume we are inserting ("a",null)
2396
here, we return 3, and next this statement will want to insert
2397
("b",null): there is no reason why ("b",3+1) would be the good row to
2398
insert: maybe it already exists, maybe 3+1 is too large...
2400
*nb_reserved_values= 1;
2406
nr= ((uint64_t) table->next_number_field->
2407
val_int_offset(table->s->rec_buff_length)+1);
2409
(void) extra(HA_EXTRA_NO_KEYREAD);
2414
/*
  Clean up this handler's auto-increment bookkeeping.

  Calls the engine hook release_auto_increment(), then clears the id
  generated for the current row and the interval reserved for it. When
  next_insert_id is non-zero, the forced intervals kept on the owning thread
  (table->in_use) are emptied too.

  NOTE(review): the original lines between the `if` and the text that follows
  it are not visible in this extract; confirm against the full file whether
  anything else (e.g. next_insert_id itself) is reset there.
*/
void handler::ha_release_auto_increment()
2416
release_auto_increment();
2417
insert_id_for_cur_row= 0;
2418
auto_inc_interval_for_cur_row.replace(0, 0, 0);
2419
if (next_insert_id > 0)
2423
this statement used forced auto_increment values if there were some,
2424
wipe them away for other statements.
2426
table->in_use->auto_inc_intervals_forced.empty();
2431
/*
  Report a duplicate-key error (ER_DUP_ENTRY) using the format string 'msg'.

  key_nr  index of the offending key, or MAX_KEY when it is unknown
  msg     printf-style format; it receives the key value and the key name

  For an unknown key an empty value and the name "*UNKNOWN*" are printed.
  Otherwise the key value is rendered with key_unpack() and, if it would not
  fit, cut short and terminated with "...".
*/
void handler::print_keydup_error(uint key_nr, const char *msg)
2433
/* Write the duplicated key in the error message */
2434
char key[MAX_KEY_LENGTH];
2435
String str(key,sizeof(key),system_charset_info);
2437
if (key_nr == MAX_KEY)
2439
/* Key is unknown */
2440
str.copy("", 0, system_charset_info);
2441
my_printf_error(ER_DUP_ENTRY, msg, MYF(0), str.c_ptr(), "*UNKNOWN*");
2445
/* Table is opened and defined at this point */
2446
key_unpack(&str,table,(uint) key_nr);
/* Space left for the key value once the format text itself is accounted for */
2447
uint max_length=MYSQL_ERRMSG_SIZE-(uint) strlen(msg);
2448
if (str.length() >= max_length)
2450
/*
  Leave room for the three dots plus one more byte.
  NOTE(review): max_length is unsigned; if strlen(msg) came within 4 bytes of
  MYSQL_ERRMSG_SIZE this subtraction would wrap. Presumably format strings
  are always far shorter - confirm.
*/
str.length(max_length-4);
2451
str.append(STRING_WITH_LEN("..."));
2453
my_printf_error(ER_DUP_ENTRY, msg,
2454
MYF(0), str.c_ptr(), table->key_info[key_nr].name);
2460
Print error that we got from handler function.
2463
In case of delete table it's only safe to use the following parts of
2464
the 'table' structure:
2468
void handler::print_error(int error, myf errflag)
2470
DBUG_ENTER("handler::print_error");
2471
DBUG_PRINT("enter",("error: %d",error));
2473
int textno=ER_GET_ERRNO;
2476
textno=ER_OPEN_AS_READONLY;
2479
textno=ER_FILE_USED;
2482
textno=ER_FILE_NOT_FOUND;
2484
case HA_ERR_KEY_NOT_FOUND:
2485
case HA_ERR_NO_ACTIVE_RECORD:
2486
case HA_ERR_END_OF_FILE:
2487
textno=ER_KEY_NOT_FOUND;
2489
case HA_ERR_WRONG_MRG_TABLE_DEF:
2490
textno=ER_WRONG_MRG_TABLE;
2492
case HA_ERR_FOUND_DUPP_KEY:
2494
uint key_nr=get_dup_key(error);
2495
if ((int) key_nr >= 0)
2497
print_keydup_error(key_nr, ER(ER_DUP_ENTRY_WITH_KEY_NAME));
2503
case HA_ERR_FOREIGN_DUPLICATE_KEY:
2505
uint key_nr= get_dup_key(error);
2506
if ((int) key_nr >= 0)
2509
/* Write the key in the error message */
2510
char key[MAX_KEY_LENGTH];
2511
String str(key,sizeof(key),system_charset_info);
2512
/* Table is opened and defined at this point */
2513
key_unpack(&str,table,(uint) key_nr);
2514
max_length= (MYSQL_ERRMSG_SIZE-
2515
(uint) strlen(ER(ER_FOREIGN_DUPLICATE_KEY)));
2516
if (str.length() >= max_length)
2518
str.length(max_length-4);
2519
str.append(STRING_WITH_LEN("..."));
2521
my_error(ER_FOREIGN_DUPLICATE_KEY, MYF(0), table_share->table_name.str,
2522
str.c_ptr(), key_nr+1);
2528
case HA_ERR_FOUND_DUPP_UNIQUE:
2529
textno=ER_DUP_UNIQUE;
2531
case HA_ERR_RECORD_CHANGED:
2532
textno=ER_CHECKREAD;
2534
case HA_ERR_CRASHED:
2535
textno=ER_NOT_KEYFILE;
2537
case HA_ERR_WRONG_IN_RECORD:
2538
textno= ER_CRASHED_ON_USAGE;
2540
case HA_ERR_CRASHED_ON_USAGE:
2541
textno=ER_CRASHED_ON_USAGE;
2543
case HA_ERR_NOT_A_TABLE:
2546
case HA_ERR_CRASHED_ON_REPAIR:
2547
textno=ER_CRASHED_ON_REPAIR;
2549
case HA_ERR_OUT_OF_MEM:
2550
textno=ER_OUT_OF_RESOURCES;
2552
case HA_ERR_WRONG_COMMAND:
2553
textno=ER_ILLEGAL_HA;
2555
case HA_ERR_OLD_FILE:
2556
textno=ER_OLD_KEYFILE;
2558
case HA_ERR_UNSUPPORTED:
2559
textno=ER_UNSUPPORTED_EXTENSION;
2561
case HA_ERR_RECORD_FILE_FULL:
2562
case HA_ERR_INDEX_FILE_FULL:
2563
textno=ER_RECORD_FILE_FULL;
2565
case HA_ERR_LOCK_WAIT_TIMEOUT:
2566
textno=ER_LOCK_WAIT_TIMEOUT;
2568
case HA_ERR_LOCK_TABLE_FULL:
2569
textno=ER_LOCK_TABLE_FULL;
2571
case HA_ERR_LOCK_DEADLOCK:
2572
textno=ER_LOCK_DEADLOCK;
2574
case HA_ERR_READ_ONLY_TRANSACTION:
2575
textno=ER_READ_ONLY_TRANSACTION;
2577
case HA_ERR_CANNOT_ADD_FOREIGN:
2578
textno=ER_CANNOT_ADD_FOREIGN;
2580
case HA_ERR_ROW_IS_REFERENCED:
2583
get_error_message(error, &str);
2584
my_error(ER_ROW_IS_REFERENCED_2, MYF(0), str.c_ptr_safe());
2587
case HA_ERR_NO_REFERENCED_ROW:
2590
get_error_message(error, &str);
2591
my_error(ER_NO_REFERENCED_ROW_2, MYF(0), str.c_ptr_safe());
2594
case HA_ERR_TABLE_DEF_CHANGED:
2595
textno=ER_TABLE_DEF_CHANGED;
2597
case HA_ERR_NO_SUCH_TABLE:
2598
my_error(ER_NO_SUCH_TABLE, MYF(0), table_share->db.str,
2599
table_share->table_name.str);
2601
case HA_ERR_RBR_LOGGING_FAILED:
2602
textno= ER_BINLOG_ROW_LOGGING_FAILED;
2604
case HA_ERR_DROP_INDEX_FK:
2606
const char *ptr= "???";
2607
uint key_nr= get_dup_key(error);
2608
if ((int) key_nr >= 0)
2609
ptr= table->key_info[key_nr].name;
2610
my_error(ER_DROP_INDEX_FK, MYF(0), ptr);
2613
case HA_ERR_TABLE_NEEDS_UPGRADE:
2614
textno=ER_TABLE_NEEDS_UPGRADE;
2616
case HA_ERR_TABLE_READONLY:
2617
textno= ER_OPEN_AS_READONLY;
2619
case HA_ERR_AUTOINC_READ_FAILED:
2620
textno= ER_AUTOINC_READ_FAILED;
2622
case HA_ERR_AUTOINC_ERANGE:
2623
textno= ER_WARN_DATA_OUT_OF_RANGE;
2625
case HA_ERR_LOCK_OR_ACTIVE_TRANSACTION:
2626
my_message(ER_LOCK_OR_ACTIVE_TRANSACTION,
2627
ER(ER_LOCK_OR_ACTIVE_TRANSACTION), MYF(0));
2632
/* The error was "unknown" to this function.
2633
Ask handler if it has got a message for this error */
2634
bool temporary= FALSE;
2636
temporary= get_error_message(error, &str);
2637
if (!str.is_empty())
2639
const char* engine= table_type();
2641
my_error(ER_GET_TEMPORARY_ERRMSG, MYF(0), error, str.ptr(), engine);
2643
my_error(ER_GET_ERRMSG, MYF(0), error, str.ptr(), engine);
2646
my_error(ER_GET_ERRNO,errflag,error);
2650
my_error(textno, errflag, table_share->table_name.str, error);
2656
Return an error message specific to this handler.
2658
@param error error code previously returned by handler
2659
@param buf pointer to String where to add error message
2662
Returns true if this is a temporary error
2664
bool handler::get_error_message(int error, String* buf)
2670
/*
  Upgrade check wrapper around the engine's check_for_upgrade().

  When the table share carries no server version (mysql_version == 0), every
  key part of every key is inspected; a key part whose field type is
  MYSQL_TYPE_BLOB makes the function return HA_ADMIN_NEEDS_CHECK, after
  setting check_opt->flags to T_MEDIUM if TT_FOR_UPGRADE was requested.
  The final visible statement delegates to check_for_upgrade(check_opt).

  NOTE(review): braces and any 'continue' after the fieldnr test are missing
  from this extract, so the exact nesting of the statements below must be
  confirmed against the full file.
*/
int handler::ha_check_for_upgrade(HA_CHECK_OPT *check_opt)
2672
KEY *keyinfo, *keyend;
2673
KEY_PART_INFO *keypart, *keypartend;
2675
if (!table->s->mysql_version)
2677
/* check for blob-in-key error */
2678
keyinfo= table->key_info;
2679
keyend= table->key_info + table->s->keys;
2680
for (; keyinfo < keyend; keyinfo++)
2682
keypart= keyinfo->key_part;
2683
keypartend= keypart + keyinfo->key_parts;
2684
for (; keypart < keypartend; keypart++)
2686
/* fieldnr is used 1-based below (fieldnr-1 indexes table->field) */
if (!keypart->fieldnr)
2688
Field *field= table->field[keypart->fieldnr-1];
2689
if (field->type() == MYSQL_TYPE_BLOB)
2691
if (check_opt->sql_flags & TT_FOR_UPGRADE)
2692
check_opt->flags= T_MEDIUM;
2693
return HA_ADMIN_NEEDS_CHECK;
2698
return check_for_upgrade(check_opt);
2702
/* Code left, but Drizzle has no legacy yet (while MySQL did) */
2703
int handler::check_old_types()
2709
/*
  Stamp the table's .frm file with the running server's version.

  Nothing needs doing when the share already records MYSQL_VERSION_ID.
  Otherwise the file <normalized_path><reg_ext> is opened read/write, the
  4-byte version is written at byte offset 51, and every TABLE found in
  open_cache under the same cache key gets its share's mysql_version
  updated. The file is closed and the my_pwrite() result is returned
  through DBUG_RETURN.

  NOTE(review): declarations of 'file', 'result', 'version' and 'entry', and
  the error/early-exit paths, are not visible in this extract.
*/
static bool update_frm_version(TABLE *table)
2711
char path[FN_REFLEN];
2714
DBUG_ENTER("update_frm_version");
2717
No need to update frm version in case table was created or checked
2718
by server with the same version. This also ensures that we do not
2719
update frm version for temporary tables as this code doesn't support
2722
if (table->s->mysql_version == MYSQL_VERSION_ID)
2725
strxmov(path, table->s->normalized_path.str, reg_ext, NullS);
2727
if ((file= my_open(path, O_RDWR|O_BINARY, MYF(MY_WME))) >= 0)
2730
char *key= table->s->table_cache_key.str;
2731
uint key_length= table->s->table_cache_key.length;
2733
HASH_SEARCH_STATE state;
2735
int4store(version, MYSQL_VERSION_ID);
2737
/* 51L is the byte offset in the .frm file where the version is written */
if ((result= my_pwrite(file,(uchar*) version,4,51L,MYF_RW)))
2740
/* Keep all cached instances of this table in step with the file */
for (entry=(TABLE*) hash_first(&open_cache,(uchar*) key,key_length, &state);
2742
entry= (TABLE*) hash_next(&open_cache,(uchar*) key,key_length, &state))
2743
entry->s->mysql_version= MYSQL_VERSION_ID;
2747
VOID(my_close(file,MYF(MY_WME)));
2748
DBUG_RETURN(result);
2755
key if error because of duplicated keys
2757
/*
  Return the number of the key that caused a duplicate/foreign-key error.

  error  handler error code being reported

  errkey is first preset to (uint) -1. Only for HA_ERR_FOUND_DUPP_KEY,
  HA_ERR_FOREIGN_DUPLICATE_KEY, HA_ERR_FOUND_DUPP_UNIQUE and
  HA_ERR_DROP_INDEX_FK is info(HA_STATUS_ERRKEY | HA_STATUS_NO_LOCK) called;
  presumably that call fills in errkey - confirm in the engine. For any other
  error code the returned value stays (uint) -1.
*/
uint handler::get_dup_key(int error)
2759
DBUG_ENTER("handler::get_dup_key");
2760
table->file->errkey = (uint) -1;
2761
if (error == HA_ERR_FOUND_DUPP_KEY || error == HA_ERR_FOREIGN_DUPLICATE_KEY ||
2762
error == HA_ERR_FOUND_DUPP_UNIQUE ||
2763
error == HA_ERR_DROP_INDEX_FK)
2764
info(HA_STATUS_ERRKEY | HA_STATUS_NO_LOCK);
2765
DBUG_RETURN(table->file->errkey);
2770
Delete all files with extension from bas_ext().
2772
@param name Base name of table
2775
We assume that the handler may return more extensions than
2776
was actually used for the file.
2779
0 If we successfully deleted at least one file from bas_ext() and
2780
didn't get any other errors than ENOENT
2784
/*
  Default table removal: delete one file per extension listed by bas_ext().

  name  base name of the table; each extension is appended with fn_format()

  enoent_or_zero starts as ENOENT and is cleared to 0 further down, so that
  ENOENT ends up as the result only while it still holds its initial value.
  A my_errno other than ENOENT from a failed delete is handled separately.

  NOTE(review): the declaration of 'error', the branch bodies and the final
  return are not visible in this extract.
*/
int handler::delete_table(const char *name)
2787
int enoent_or_zero= ENOENT; // Error if no file was deleted
2788
char buff[FN_REFLEN];
2790
/* bas_ext() is walked until its terminating null entry */
for (const char **ext=bas_ext(); *ext ; ext++)
2792
fn_format(buff, name, "", *ext, MY_UNPACK_FILENAME|MY_APPEND_EXT);
2793
if (my_delete_with_symlink(buff, MYF(0)))
2795
if ((error= my_errno) != ENOENT)
2799
enoent_or_zero= 0; // No error for ENOENT
2800
error= enoent_or_zero;
2806
int handler::rename_table(const char * from, const char * to)
2809
for (const char **ext= bas_ext(); *ext ; ext++)
2811
if (rename_file_ext(from, to, *ext))
2813
if ((error=my_errno) != ENOENT)
2822
void handler::drop_table(const char *name)
2830
Performs checks upon the table.
2832
@param thd thread doing CHECK TABLE operation
2833
@param check_opt options from the parser
2836
HA_ADMIN_OK Successful upgrade
2838
HA_ADMIN_NEEDS_UPGRADE Table has structures requiring upgrade
2840
HA_ADMIN_NEEDS_ALTER Table has structures requiring ALTER TABLE
2842
HA_ADMIN_NOT_IMPLEMENTED
2844
/*
  Public entry point for CHECK TABLE on this handler.

  Visible order of operations:
    1. Test for a table whose recorded version is at least MYSQL_VERSION_ID
       combined with a TT_FOR_UPGRADE request.
    2. For a table older than MYSQL_VERSION_ID, run check_old_types() and
       ha_check_for_upgrade(); an error other than HA_ADMIN_NEEDS_CHECK is
       tested for separately.
    3. Test for "no error and TT_FOR_UPGRADE requested".
    4. Run the engine's check(thd, check_opt).
    5. The last statement returns update_frm_version(table).

  NOTE(review): the statements executed when each test succeeds (early
  returns in the original) are not visible in this extract.
*/
int handler::ha_check(THD *thd, HA_CHECK_OPT *check_opt)
2848
if ((table->s->mysql_version >= MYSQL_VERSION_ID) &&
2849
(check_opt->sql_flags & TT_FOR_UPGRADE))
2852
if (table->s->mysql_version < MYSQL_VERSION_ID)
2854
if ((error= check_old_types()))
2856
error= ha_check_for_upgrade(check_opt);
2857
if (error && (error != HA_ADMIN_NEEDS_CHECK))
2859
if (!error && (check_opt->sql_flags & TT_FOR_UPGRADE))
2862
if ((error= check(thd, check_opt)))
2864
return update_frm_version(table);
2868
A helper function to mark a transaction read-write,
2874
handler::mark_trx_read_write()
2876
Ha_trx_info *ha_info= &ha_thd()->ha_data[ht->slot].ha_info[0];
2878
When a storage engine method is called, the transaction must
2879
have been started, unless it's a DDL call, for which the
2880
storage engine starts the transaction internally, and commits
2881
it internally, without registering in the ha_list.
2882
Unfortunately here we can't know for sure if the engine
2883
has registered the transaction or not, so we must check.
2885
if (ha_info->is_started())
2887
DBUG_ASSERT(has_transactions());
2889
table_share can be NULL in ha_delete_table(). See implementation
2890
of standalone function ha_delete_table() in sql_base.cc.
2892
if (table_share == NULL || table_share->tmp_table == NO_TMP_TABLE)
2893
ha_info->set_trx_read_write();
2899
Repair table: public interface.
2901
@sa handler::repair()
2904
int handler::ha_repair(THD* thd, HA_CHECK_OPT* check_opt)
2908
mark_trx_read_write();
2910
if ((result= repair(thd, check_opt)))
2912
return update_frm_version(table);
2917
Bulk update row: public interface.
2919
@sa handler::bulk_update_row()
2923
handler::ha_bulk_update_row(const uchar *old_data, uchar *new_data,
2924
uint *dup_key_found)
2926
mark_trx_read_write();
2928
return bulk_update_row(old_data, new_data, dup_key_found);
2933
Delete all rows: public interface.
2935
@sa handler::delete_all_rows()
2939
handler::ha_delete_all_rows()
2941
mark_trx_read_write();
2943
return delete_all_rows();
2948
Reset auto increment: public interface.
2950
@sa handler::reset_auto_increment()
2954
handler::ha_reset_auto_increment(uint64_t value)
2956
mark_trx_read_write();
2958
return reset_auto_increment(value);
2963
Optimize table: public interface.
2965
@sa handler::optimize()
2969
handler::ha_optimize(THD* thd, HA_CHECK_OPT* check_opt)
2971
mark_trx_read_write();
2973
return optimize(thd, check_opt);
2978
Analyze table: public interface.
2980
@sa handler::analyze()
2984
handler::ha_analyze(THD* thd, HA_CHECK_OPT* check_opt)
2986
mark_trx_read_write();
2988
return analyze(thd, check_opt);
2993
Check and repair table: public interface.
2995
@sa handler::check_and_repair()
2999
handler::ha_check_and_repair(THD *thd)
3001
mark_trx_read_write();
3003
return check_and_repair(thd);
3008
Disable indexes: public interface.
3010
@sa handler::disable_indexes()
3014
handler::ha_disable_indexes(uint mode)
3016
mark_trx_read_write();
3018
return disable_indexes(mode);
3023
Enable indexes: public interface.
3025
@sa handler::enable_indexes()
3029
handler::ha_enable_indexes(uint mode)
3031
mark_trx_read_write();
3033
return enable_indexes(mode);
3038
Discard or import tablespace: public interface.
3040
@sa handler::discard_or_import_tablespace()
3044
handler::ha_discard_or_import_tablespace(my_bool discard)
3046
mark_trx_read_write();
3048
return discard_or_import_tablespace(discard);
3053
Prepare for alter: public interface.
3055
Called to prepare an *online* ALTER.
3057
@sa handler::prepare_for_alter()
3061
handler::ha_prepare_for_alter()
3063
mark_trx_read_write();
3065
prepare_for_alter();
3070
Rename table: public interface.
3072
@sa handler::rename_table()
3076
handler::ha_rename_table(const char *from, const char *to)
3078
mark_trx_read_write();
3080
return rename_table(from, to);
3085
Delete table: public interface.
3087
@sa handler::delete_table()
3091
handler::ha_delete_table(const char *name)
3093
mark_trx_read_write();
3095
return delete_table(name);
3100
Drop table in the engine: public interface.
3102
@sa handler::drop_table()
3106
handler::ha_drop_table(const char *name)
3108
mark_trx_read_write();
3110
return drop_table(name);
3115
Create a table in the engine: public interface.
3117
@sa handler::create()
3121
handler::ha_create(const char *name, TABLE *form, HA_CREATE_INFO *info)
3123
mark_trx_read_write();
3125
return create(name, form, info);
3130
Create handler files for CREATE TABLE: public interface.
3132
@sa handler::create_handler_files()
3136
handler::ha_create_handler_files(const char *name, const char *old_name,
3137
int action_flag, HA_CREATE_INFO *info)
3139
mark_trx_read_write();
3141
return create_handler_files(name, old_name, action_flag, info);
3146
Tell the storage engine that it is allowed to "disable transaction" in the
3147
handler. It is a hint that ACID is not required - it is used in NDB for
3148
ALTER TABLE, for example, when data are copied to temporary table.
3149
A storage engine may treat this hint any way it likes. NDB for example
3150
starts to commit every now and then automatically.
3151
This hint can be safely ignored.
3153
/*
  Store the "transactions enabled" hint on the thread and, when turning it
  back on, commit whatever transaction is open.

  thd  thread whose transaction.on flag is updated
  on   new value of the flag

  NOTE(review): the declaration of 'error' and the final return are not
  visible in this extract.
*/
int ha_enable_transaction(THD *thd, bool on)
3156
DBUG_ENTER("ha_enable_transaction");
3157
DBUG_PRINT("enter", ("on: %d", (int) on));
3159
/* Intentional assignment: the flag is stored and its new value tested */
if ((thd->transaction.on= on))
3162
Now all storage engines should have transaction handling enabled.
3163
But some may have it enabled all the time - "disabling" transactions
3164
is an optimization hint that storage engine is free to ignore.
3165
So, let's commit an open transaction (if any) now.
3167
/* end_trans() is attempted only if ha_commit_trans() reported no error */
if (!(error= ha_commit_trans(thd, 0)))
3168
error= end_trans(thd, COMMIT);
3173
/*
  Default implementation: fetch the next index row and check that it still
  matches 'key'.

  buf     record buffer the row is read into
  key     key image to compare against
  keylen  length of 'key' passed on to key_cmp_if_same()

  The row is read with index_next(buf). If that succeeds, the row is compared
  with key_cmp_if_same() on the active index; a non-zero comparison result
  sets table->status to STATUS_NOT_FOUND and the error to HA_ERR_END_OF_FILE.
  ptrdiff is the distance from table->record[0] to 'buf'; the record pointer
  and the key-part field pointers are shifted by it before the comparison and
  shifted back afterwards.

  NOTE(review): the conditions guarding the two pointer-shifting sections
  (presumably "ptrdiff is non-zero"), the declaration of 'error' and the
  return statement are not visible in this extract.
*/
int handler::index_next_same(uchar *buf, const uchar *key, uint keylen)
3176
DBUG_ENTER("index_next_same");
3177
if (!(error=index_next(buf)))
3179
my_ptrdiff_t ptrdiff= buf - table->record[0];
3180
uchar *save_record_0= NULL;
3181
KEY *key_info= NULL;
3182
KEY_PART_INFO *key_part;
3183
KEY_PART_INFO *key_part_end= NULL;
3186
key_cmp_if_same() compares table->record[0] against 'key'.
3187
In parts it uses table->record[0] directly, in parts it uses
3188
field objects with their local pointers into table->record[0].
3189
If 'buf' is distinct from table->record[0], we need to move
3190
all record references. This is table->record[0] itself and
3191
the field pointers of the fields used in this key.
3195
save_record_0= table->record[0];
3196
table->record[0]= buf;
3197
key_info= table->key_info + active_index;
3198
key_part= key_info->key_part;
3199
key_part_end= key_part + key_info->key_parts;
3200
for (; key_part < key_part_end; key_part++)
3202
DBUG_ASSERT(key_part->field);
3203
key_part->field->move_field_offset(ptrdiff);
3207
if (key_cmp_if_same(table, key, active_index, keylen))
3209
table->status=STATUS_NOT_FOUND;
3210
error=HA_ERR_END_OF_FILE;
3213
/* Move back if necessary. */
3216
table->record[0]= save_record_0;
3217
/* Undo the earlier shift with the opposite offset */
for (key_part= key_info->key_part; key_part < key_part_end; key_part++)
3218
key_part->field->move_field_offset(-ptrdiff);
3225
/****************************************************************************
3226
** Some general functions that aren't in the handler class
3227
****************************************************************************/
3230
Initiates table-file and calls appropriate database-creator.
3237
/*
  Create a table in its storage engine from an existing table definition.

  thd                 current thread
  path                path handed to init_tmp_table_share()
  db, table_name      used for the share and for the error message
  create_info         creation options passed to the engine
  update_create_info  when true, create_info is first refreshed from the
                      opened table via update_create_info_from_table()

  A temporary share is initialised and opened, the engine's ha_create() is
  called with the name returned by check_lowercase_names(), the table is
  closed and the share freed. The value handed to DBUG_RETURN is
  (error != 0), i.e. 0 on success and 1 on any failure.

  NOTE(review): declarations of 'share', 'table', 'name' and 'error', the
  failure path of the open calls and the condition guarding the my_error()
  report are not visible in this extract.
*/
int ha_create_table(THD *thd, const char *path,
3238
const char *db, const char *table_name,
3239
HA_CREATE_INFO *create_info,
3240
bool update_create_info)
3244
char name_buff[FN_REFLEN];
3247
DBUG_ENTER("ha_create_table");
3249
init_tmp_table_share(thd, &share, db, 0, table_name, path);
3250
if (open_table_def(thd, &share, 0) ||
3251
open_table_from_share(thd, &share, "", 0, (uint) READ_ALL, 0, &table,
3255
if (update_create_info)
3256
update_create_info_from_table(create_info, &table);
3258
name= check_lowercase_names(table.file, share.path.str, name_buff);
3260
error= table.file->ha_create(name, &table, create_info);
3261
VOID(closefrm(&table, 0));
3264
/*
  name_buff is reused here to build "db.table_name" for the message.
  NOTE(review): strxmov() does no bounds checking; this relies on db and
  table_name together fitting in FN_REFLEN - confirm the callers' limits.
*/
strxmov(name_buff, db, ".", table_name, NullS);
3265
my_error(ER_CANT_CREATE_TABLE, MYF(ME_BELL+ME_WAITTANG), name_buff, error);
3268
free_table_share(&share);
3269
DBUG_RETURN(error != 0);
3273
Try to discover table from engine.
3276
If found, write the frm file to disk.
3279
-1 Table did not exist
3283
> 0 Error, table existed but could not be created
3285
int ha_create_table_from_engine(THD* thd, const char *db, const char *name)
3290
char path[FN_REFLEN];
3291
HA_CREATE_INFO create_info;
3294
DBUG_ENTER("ha_create_table_from_engine");
3295
DBUG_PRINT("enter", ("name '%s'.'%s'", db, name));
3297
bzero((uchar*) &create_info,sizeof(create_info));
3298
if ((error= ha_discover(thd, db, name, &frmblob, &frmlen)))
3300
/* Table could not be discovered and thus not created */
3305
Table exists in handler and could be discovered
3306
frmblob and frmlen are set, write the frm to disk
3309
build_table_filename(path, FN_REFLEN-1, db, name, "", 0);
3310
// Save the frm file
3311
error= writefrm(path, frmblob, frmlen);
3312
my_free(frmblob, MYF(0));
3316
init_tmp_table_share(thd, &share, db, 0, name, path);
3317
if (open_table_def(thd, &share, 0))
3321
if (open_table_from_share(thd, &share, "" ,0, 0, 0, &table, OTM_OPEN))
3323
free_table_share(&share);
3327
update_create_info_from_table(&create_info, &table);
3328
create_info.table_options|= HA_OPTION_CREATE_FROM_ENGINE;
3330
check_lowercase_names(table.file, path, path);
3331
error=table.file->ha_create(path, &table, &create_info);
3332
VOID(closefrm(&table, 1));
3334
DBUG_RETURN(error != 0);
3337
void st_ha_check_opt::init()
3339
flags= sql_flags= 0;
3340
sort_buffer_size = current_thd->variables.myisam_sort_buff_size;
3344
/*****************************************************************************
3347
This code is only relevant for ISAM/MyISAM tables
3349
key_cache->cache may be 0 only in the case where a key cache is not
3350
initialized or when we were not able to init the key cache in a previous
3351
call to ha_init_key_cache() (probably out of memory)
3352
*****************************************************************************/
3355
Init a key cache if it has not been initialized before.
3357
/*
  Initialise 'key_cache' unless it is already marked as initialised.

  name       not referenced in the lines visible here
  key_cache  cache to initialise; its param_* members supply the settings

  The four parameters are copied into locals while
  LOCK_global_system_variables is held; the mutex is released before
  init_key_cache() is called. The value handed to DBUG_RETURN is the logical
  negation of init_key_cache()'s result.

  NOTE(review): the middle arguments of the init_key_cache() call and the
  return for an already-initialised cache are not visible in this extract.
*/
int ha_init_key_cache(const char *name, KEY_CACHE *key_cache)
3359
DBUG_ENTER("ha_init_key_cache");
3361
if (!key_cache->key_cache_inited)
3363
pthread_mutex_lock(&LOCK_global_system_variables);
3364
ulong tmp_buff_size= (ulong) key_cache->param_buff_size;
3365
uint tmp_block_size= (uint) key_cache->param_block_size;
3366
uint division_limit= key_cache->param_division_limit;
3367
uint age_threshold= key_cache->param_age_threshold;
3368
pthread_mutex_unlock(&LOCK_global_system_variables);
3369
DBUG_RETURN(!init_key_cache(key_cache,
3372
division_limit, age_threshold));
3381
/*
  Resize 'key_cache' if it has been initialised.

  The cache parameters are copied into locals while
  LOCK_global_system_variables is held; the mutex is released before
  resize_key_cache() is called. The value handed to DBUG_RETURN is the
  logical negation of resize_key_cache()'s result.

  NOTE(review): the buffer and block sizes are held in signed 'long' here,
  whereas ha_init_key_cache() uses 'ulong' and 'uint' for the same members;
  confirm the difference is intended. The return for an uninitialised cache
  and part of the resize_key_cache() argument list are not visible in this
  extract.
*/
int ha_resize_key_cache(KEY_CACHE *key_cache)
3383
DBUG_ENTER("ha_resize_key_cache");
3385
if (key_cache->key_cache_inited)
3387
pthread_mutex_lock(&LOCK_global_system_variables);
3388
long tmp_buff_size= (long) key_cache->param_buff_size;
3389
long tmp_block_size= (long) key_cache->param_block_size;
3390
uint division_limit= key_cache->param_division_limit;
3391
uint age_threshold= key_cache->param_age_threshold;
3392
pthread_mutex_unlock(&LOCK_global_system_variables);
3393
DBUG_RETURN(!resize_key_cache(key_cache, tmp_block_size,
3395
division_limit, age_threshold));
3402
Change parameters for key cache (like size)
3404
int ha_change_key_cache_param(KEY_CACHE *key_cache)
3406
if (key_cache->key_cache_inited)
3408
pthread_mutex_lock(&LOCK_global_system_variables);
3409
uint division_limit= key_cache->param_division_limit;
3410
uint age_threshold= key_cache->param_age_threshold;
3411
pthread_mutex_unlock(&LOCK_global_system_variables);
3412
change_key_cache_param(key_cache, division_limit, age_threshold);
3418
Free memory allocated by a key cache.
3420
int ha_end_key_cache(KEY_CACHE *key_cache)
3422
end_key_cache(key_cache, 1); // Can never fail
3427
Move all tables from one key cache to another one.
3429
int ha_change_key_cache(KEY_CACHE *old_key_cache,
3430
KEY_CACHE *new_key_cache)
3432
mi_change_key_cache(old_key_cache, new_key_cache);
3438
Try to discover one table from handler(s).
3441
-1 Table did not exist
3443
0 OK. In this case *frmblob and *frmlen are set
3445
>0 error. frmblob and frmlen may not be set
3447
struct st_discover_args
3455
static my_bool discover_handlerton(THD *thd, plugin_ref plugin,
3458
st_discover_args *vargs= (st_discover_args *)arg;
3459
handlerton *hton= plugin_data(plugin, handlerton *);
3460
if (hton->state == SHOW_OPTION_YES && hton->discover &&
3461
(!(hton->discover(hton, thd, vargs->db, vargs->name,
3469
/*
  Ask the installed storage-engine plugins to discover table db.name.

  thd      current thread
  db,name  table to look for
  frmblob  (OUT) passed through to the engines via st_discover_args
  frmlen   (OUT) passed through to the engines via st_discover_args

  'error' starts at -1 ("table does not exist in any handler"). Names that
  begin with tmp_file_prefix are tested for first. discover_handlerton() is
  then run over every MYSQL_STORAGE_ENGINE_PLUGIN through plugin_foreach().
  thd->status_var.ha_discover_count is incremented.

  NOTE(review): the statements executed when the two 'if' tests succeed, the
  condition (if any) on the counter increment and the return are not visible
  in this extract.
*/
int ha_discover(THD *thd, const char *db, const char *name,
3470
uchar **frmblob, size_t *frmlen)
3472
int error= -1; // Table does not exist in any handler
3473
DBUG_ENTER("ha_discover");
3474
DBUG_PRINT("enter", ("db: %s, name: %s", db, name));
3475
st_discover_args args= {db, name, frmblob, frmlen};
3477
if (is_prefix(name,tmp_file_prefix)) /* skip temporary tables */
3480
if (plugin_foreach(thd, discover_handlerton,
3481
MYSQL_STORAGE_ENGINE_PLUGIN, &args))
3485
status_var_increment(thd->status_var.ha_discover_count);
3491
Call this function in order to give the handler the possibility
3492
to ask engine if there are any new tables that should be written to disk
3493
or any dropped tables that need to be removed from disk
3495
struct st_find_files_args
3501
List<LEX_STRING> *files;
3505
Ask handler if the table exists in engine.
3507
HA_ERR_NO_SUCH_TABLE Table does not exist
3509
HA_ERR_TABLE_EXIST Table exists
3513
struct st_table_exists_in_engine_args
3520
/*
  plugin_foreach() callback: ask one storage engine whether the table named
  in the st_table_exists_in_engine_args argument exists.

  'err' defaults to HA_ERR_NO_SUCH_TABLE. The engine is queried only when its
  state is SHOW_OPTION_YES and it provides a table_exists_in_engine hook.
  vargs->err is finally compared with HA_ERR_TABLE_EXIST.

  NOTE(review): the remainder of the parameter list, the statement copying
  'err' into vargs->err and the return values are not visible in this
  extract; presumably a TRUE return stops the plugin iteration - confirm.
*/
static my_bool table_exists_in_engine_handlerton(THD *thd, plugin_ref plugin,
3523
st_table_exists_in_engine_args *vargs= (st_table_exists_in_engine_args *)arg;
3524
handlerton *hton= plugin_data(plugin, handlerton *);
3526
int err= HA_ERR_NO_SUCH_TABLE;
3528
if (hton->state == SHOW_OPTION_YES && hton->table_exists_in_engine)
3529
err = hton->table_exists_in_engine(hton, thd, vargs->db, vargs->name);
3532
if (vargs->err == HA_ERR_TABLE_EXIST)
3538
int ha_table_exists_in_engine(THD* thd, const char* db, const char* name)
3540
DBUG_ENTER("ha_table_exists_in_engine");
3541
DBUG_PRINT("enter", ("db: %s, name: %s", db, name));
3542
st_table_exists_in_engine_args args= {db, name, HA_ERR_NO_SUCH_TABLE};
3543
plugin_foreach(thd, table_exists_in_engine_handlerton,
3544
MYSQL_STORAGE_ENGINE_PLUGIN, &args);
3545
DBUG_PRINT("exit", ("error: %d", args.err));
3546
DBUG_RETURN(args.err);
3550
Calculate cost of 'index only' scan for given index and number of records
3552
@param keynr Index number
3553
@param records Estimated number of records to be retrieved
3556
It is assumed that we will read through the whole key range and that all
3557
key blocks are half full (normally things are much better). It is also
3558
assumed that each time we read the next key from the index, the handler
3559
performs a random seek, thus the cost is proportional to the number of
3563
Consider joining this function and handler::read_time() into one
3564
handler::read_time(keynr, records, ranges, bool index_only) function.
3567
Estimated cost of 'index only' scan
3570
/*
  Estimate the cost of reading 'records' index entries of key 'keynr'
  without touching the data rows.

  keys_per_block: half of stats.block_size divided by the size of one entry
  (key_length + ref_length), plus one so the divisor can never be zero.
  The cost is (records + keys_per_block - 1) / keys_per_block, computed in
  double precision, i.e. roughly the number of blocks needed.

  NOTE(review): the declaration of 'read_time' and the return statement are
  not visible in this extract.
*/
double handler::index_only_read_time(uint keynr, double records)
3573
uint keys_per_block= (stats.block_size/2/
3574
(table->key_info[keynr].key_length + ref_length) + 1);
3575
read_time=((double) (records + keys_per_block-1) /
3576
(double) keys_per_block);
3581
/****************************************************************************
3582
* Default MRR implementation (MRR to non-MRR converter)
3583
***************************************************************************/
3586
Get cost and other information about MRR scan over a known list of ranges
3588
Calculate estimated cost and other information about an MRR scan for given
3591
@param keyno Index number
3592
@param seq Range sequence to be traversed
3593
@param seq_init_param First parameter for seq->init()
3594
@param n_ranges_arg Number of ranges in the sequence, or 0 if the caller
3595
can't efficiently determine it
3596
@param bufsz INOUT IN: Size of the buffer available for use
3597
OUT: Size of the buffer that is expected to be actually
3598
used, or 0 if buffer is not needed.
3599
@param flags INOUT A combination of HA_MRR_* flags
3600
@param cost OUT Estimated cost of MRR access
3603
This method (or an overriding one in a derived class) must check for
3604
thd->killed and return HA_POS_ERROR if it is not zero. This is required
3605
for a user to be able to interrupt the calculation by killing the
3609
HA_POS_ERROR Error or the engine is unable to perform the requested
3610
scan. Values of OUT parameters are undefined.
3612
other OK, *cost contains cost of the scan, *bufsz and *flags
3613
contain scan parameters.
3617
/*
  Default MRR cost estimate over a known sequence of ranges.

  The sequence is walked with seq->init()/seq->next(). For each range:
    - a non-zero thd->killed aborts with HA_POS_ERROR;
    - a UNIQUE_RANGE without NULL_RANGE counts as exactly one row;
    - otherwise records_in_range() is asked, with a NULL end point passed
      for a start/end key of zero length; HA_POS_ERROR from it makes the
      total HA_POS_ERROR.
  If the total is valid, *flags gets HA_MRR_USE_DEFAULT_IMPL and the cost
  fields are filled in: index-only cost when HA_MRR_INDEX_ONLY is set and
  more than two rows are expected, read_time() otherwise.

  NOTE(review): the return type, declarations of 'seq_it' and 'n_ranges',
  the accumulation into total_rows, the 'else' keywords and the return are
  not visible in this extract.
*/
handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
3618
void *seq_init_param, uint n_ranges_arg,
3619
uint *bufsz, uint *flags, COST_VECT *cost)
3621
KEY_MULTI_RANGE range;
3623
ha_rows rows, total_rows= 0;
3625
THD *thd= current_thd;
3627
/* Default MRR implementation doesn't need buffer */
3630
seq_it= seq->init(seq_init_param, n_ranges, *flags);
3631
while (!seq->next(seq_it, &range))
3633
if (unlikely(thd->killed != 0))
3634
return HA_POS_ERROR;
3637
key_range *min_endp, *max_endp;
3639
min_endp= range.start_key.length? &range.start_key : NULL;
3640
max_endp= range.end_key.length? &range.end_key : NULL;
3642
if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
3643
rows= 1; /* there can be at most one row */
3646
if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp,
3649
/* Can't scan one range => can't do MRR scan at all */
3650
total_rows= HA_POS_ERROR;
3657
if (total_rows != HA_POS_ERROR)
3659
/* The following calculation is the same as in multi_range_read_info(): */
3660
*flags |= HA_MRR_USE_DEFAULT_IMPL;
3662
cost->avg_io_cost= 1; /* assume random seeks */
3663
/*
  NOTE(review): total_rows (ha_rows) is narrowed to uint before being passed
  to index_only_read_time(), whose second parameter is a double. If ha_rows
  is wider than uint, very large estimates would be truncated here; the cast
  looks unnecessary - confirm and consider dropping it.
*/
if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2)
3664
cost->io_count= index_only_read_time(keyno, (uint)total_rows);
3666
cost->io_count= read_time(keyno, n_ranges, total_rows);
3667
cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
3674
Get cost and other information about MRR scan over some sequence of ranges
3676
Calculate estimated cost and other information about an MRR scan for some
3679
The ranges themselves will be known only at execution phase. When this
3680
function is called we only know number of ranges and a (rough) E(#records)
3681
within those ranges.
3683
Currently this function is only called for "n-keypart singlepoint" ranges,
3684
i.e. each range is "keypart1=someconst1 AND ... AND keypartN=someconstN"
3686
The flags parameter is a combination of those flags: HA_MRR_SORTED,
3687
HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION, HA_MRR_LIMITS.
3689
@param keyno Index number
3690
@param n_ranges Estimated number of ranges (i.e. intervals) in the
3692
@param n_rows Estimated total number of records contained within all
3694
@param bufsz INOUT IN: Size of the buffer available for use
3695
OUT: Size of the buffer that will be actually used, or
3696
0 if buffer is not needed.
3697
@param flags INOUT A combination of HA_MRR_* flags
3698
@param cost OUT Estimated cost of MRR access
3701
0 OK, *cost contains cost of the scan, *bufsz and *flags contain scan
3704
other Error or can't perform the requested scan
3707
/*
  Default MRR cost estimate when only the number of ranges and rows is known.

  Sets *bufsz to 0, adds HA_MRR_USE_DEFAULT_IMPL to *flags and fills in
  cost->avg_io_cost and cost->io_count: index_only_read_time() when
  HA_MRR_INDEX_ONLY is set, read_time() otherwise.

  NOTE(review): unlike multi_range_read_info_const(), the index-only branch
  visible here carries no "more than two rows" condition. The 'else' keyword
  and the return statement are not visible in this extract.
*/
int handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
3708
uint *bufsz, uint *flags, COST_VECT *cost)
3710
*bufsz= 0; /* Default implementation doesn't need a buffer */
3712
*flags |= HA_MRR_USE_DEFAULT_IMPL;
3715
cost->avg_io_cost= 1; /* assume random seeks */
3717
/* Produce the same cost as non-MRR code does */
3718
if (*flags & HA_MRR_INDEX_ONLY)
3719
cost->io_count= index_only_read_time(keyno, n_rows);
3721
cost->io_count= read_time(keyno, n_ranges, n_rows);
3727
Initialize the MRR scan
3729
Initialize the MRR scan. This function may do heavyweight scan
3730
initialization like row prefetching/sorting/etc (NOTE: but better not do
3731
it here as we may not need it, e.g. if we never satisfy WHERE clause on
3732
previous tables. For many implementations it would be natural to do such
3733
initializations in the first multi_range_read_next() call)
3735
mode is a combination of the following flags: HA_MRR_SORTED,
3736
HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION
3738
@param seq Range sequence to be traversed
3739
@param seq_init_param First parameter for seq->init()
3740
@param n_ranges Number of ranges in the sequence
3741
@param mode Flags, see the description section for the details
3742
@param buf INOUT: memory buffer to be used
3745
One must have called index_init() before calling this function. Several
3746
multi_range_read_init() calls may be made in course of one query.
3748
Until WL#2623 is done (see its text, section 3.2), the following will
3750
The caller will guarantee that if "seq->init == mrr_ranges_array_init"
3751
then seq_init_param is an array of n_ranges KEY_MULTI_RANGE structures.
3752
This property will only be used by NDB handler until WL#2623 is done.
3754
Buffer memory management is done according to the following scenario:
3755
The caller allocates the buffer and provides it to the callee by filling
3756
the members of HANDLER_BUFFER structure.
3757
The callee consumes all or some fraction of the provided buffer space, and
3758
sets the HANDLER_BUFFER members accordingly.
3759
The callee may use the buffer memory until the next multi_range_read_init()
3760
call is made, all records have been read, or until index_end() call is
3761
made, whichever comes first.
3768
handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3769
uint n_ranges, uint mode, HANDLER_BUFFER *buf)
3771
DBUG_ENTER("handler::multi_range_read_init");
3772
mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
3773
mrr_funcs= *seq_funcs;
3774
mrr_is_output_sorted= test(mode & HA_MRR_SORTED);
3775
mrr_have_range= FALSE;
3781
Get next record in MRR scan
3783
Default MRR implementation: read the next record
3785
@param range_info OUT Undefined if HA_MRR_NO_ASSOCIATION flag is in effect
3786
Otherwise, the opaque value associated with the range
3787
that contains the returned record.
3790
@retval other Error code
3793
int handler::multi_range_read_next(char **range_info)
3797
DBUG_ENTER("handler::multi_range_read_next");
3799
if (!mrr_have_range)
3801
mrr_have_range= TRUE;
3807
/* Save a call if there can be only one row in range. */
3808
if (mrr_cur_range.range_flag != (UNIQUE_RANGE | EQ_RANGE))
3810
result= read_range_next();
3811
/* On success or non-EOF errors jump to the end. */
3812
if (result != HA_ERR_END_OF_FILE)
3817
if (was_semi_consistent_read())
3820
We need to set this for the last range only, but checking this
3821
condition is more expensive than just setting the result code.
3823
result= HA_ERR_END_OF_FILE;
3827
/* Try the next range(s) until one matches a record. */
3828
while (!(range_res= mrr_funcs.next(mrr_iter, &mrr_cur_range)))
3831
result= read_range_first(mrr_cur_range.start_key.keypart_map ?
3832
&mrr_cur_range.start_key : 0,
3833
mrr_cur_range.end_key.keypart_map ?
3834
&mrr_cur_range.end_key : 0,
3835
test(mrr_cur_range.range_flag & EQ_RANGE),
3836
mrr_is_output_sorted);
3837
if (result != HA_ERR_END_OF_FILE)
3841
while ((result == HA_ERR_END_OF_FILE) && !range_res);
3843
*range_info= mrr_cur_range.ptr;
3844
DBUG_PRINT("exit",("handler::multi_range_read_next result %d", result));
3845
DBUG_RETURN(result);
3849
/* **************************************************************************
3850
* DS-MRR implementation
3851
***************************************************************************/
3854
DS-MRR: Initialize and start MRR scan
3856
Initialize and start the MRR scan. Depending on the mode parameter, this
3857
may use default or DS-MRR implementation.
3859
@param h Table handler to be used
3860
@param key Index to be used
3861
@param seq_funcs Interval sequence enumeration functions
3862
@param seq_init_param Interval sequence enumeration parameter
3863
@param n_ranges Number of ranges in the sequence.
3864
@param mode HA_MRR_* modes to use
3865
@param buf INOUT Buffer to use
3867
@retval 0 Ok, Scan started.
3871
int DsMrr_impl::dsmrr_init(handler *h, KEY *key,
3872
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3873
uint n_ranges, uint mode, HANDLER_BUFFER *buf)
3877
Item *pushed_cond= NULL;
3879
DBUG_ENTER("DsMrr_impl::dsmrr_init");
3880
keyno= h->active_index;
3881
DBUG_ASSERT(h2 == NULL);
3882
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3884
use_default_impl= TRUE;
3885
DBUG_RETURN(h->handler::multi_range_read_init(seq_funcs, seq_init_param,
3886
n_ranges, mode, buf));
3888
rowids_buf= buf->buffer;
3889
//psergey-todo: don't add key_length as it is not needed anymore
3890
rowids_buf += key->key_length + h->ref_length;
3892
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3893
rowids_buf_end= buf->buffer_end;
3895
elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3896
rowids_buf_last= rowids_buf +
3897
((rowids_buf_end - rowids_buf)/ elem_size)*
3899
rowids_buf_end= rowids_buf_last;
3901
/* Create a separate handler object to do rndpos() calls. */
3902
THD *thd= current_thd;
3903
if (!(new_h2= h->clone(thd->mem_root)) ||
3904
new_h2->ha_external_lock(thd, F_RDLCK))
3910
if (keyno == h->pushed_idx_cond_keyno)
3911
pushed_cond= h->pushed_idx_cond;
3912
if (h->ha_index_end())
3919
table->prepare_for_position();
3920
new_h2->extra(HA_EXTRA_KEYREAD);
3922
if (h2->ha_index_init(keyno, FALSE) ||
3923
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3926
use_default_impl= FALSE;
3929
h2->idx_cond_push(keyno, pushed_cond);
3930
if (dsmrr_fill_buffer(new_h2))
3934
If the above call has scanned through all intervals in *seq, then
3935
adjust *buf to indicate that the remaining buffer space will not be used.
3938
buf->end_of_used_area= rowids_buf_last;
3940
if (h->ha_rnd_init(FALSE))
3945
h2->ha_index_or_rnd_end();
3946
h2->ha_external_lock(thd, F_UNLCK);
3953
void DsMrr_impl::dsmrr_close()
3955
DBUG_ENTER("DsMrr_impl::dsmrr_close");
3958
h2->ha_external_lock(current_thd, F_UNLCK);
3963
use_default_impl= TRUE;
3968
static int rowid_cmp(void *h, uchar *a, uchar *b)
3970
return ((handler*)h)->cmp_ref(a, b);
3975
DS-MRR: Fill the buffer with rowids and sort it by rowid
3977
{This is an internal function of DiskSweep MRR implementation}
3978
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
3979
buffer. When the buffer is full or scan is completed, sort the buffer by
3982
The function assumes that rowids buffer is empty when it is invoked.
3984
@param h Table handler
3986
@retval 0 OK, the next portion of rowids is in the buffer,
3991
int DsMrr_impl::dsmrr_fill_buffer(handler *unused)
3995
DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");
3997
rowids_buf_cur= rowids_buf;
3998
while ((rowids_buf_cur < rowids_buf_end) &&
3999
!(res= h2->handler::multi_range_read_next(&range_info)))
4001
/* Put rowid, or {rowid, range_id} pair into the buffer */
4002
h2->position(table->record[0]);
4003
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
4004
rowids_buf_cur += h->ref_length;
4008
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
4009
rowids_buf_cur += sizeof(void*);
4013
if (res && res != HA_ERR_END_OF_FILE)
4015
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
4017
/* Sort the buffer contents by rowid */
4018
uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
4019
uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
4021
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
4023
rowids_buf_last= rowids_buf_cur;
4024
rowids_buf_cur= rowids_buf;
4030
DS-MRR implementation: multi_range_read_next() function
4033
int DsMrr_impl::dsmrr_next(handler *h, char **range_info)
4037
if (use_default_impl)
4038
return h->handler::multi_range_read_next(range_info);
4040
if (rowids_buf_cur == rowids_buf_last)
4044
res= HA_ERR_END_OF_FILE;
4047
res= dsmrr_fill_buffer(h);
4052
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
4053
if (rowids_buf_cur == rowids_buf_last)
4055
res= HA_ERR_END_OF_FILE;
4059
res= h->rnd_pos(table->record[0], rowids_buf_cur);
4060
rowids_buf_cur += h->ref_length;
4063
memcpy(range_info, rowids_buf_cur, sizeof(void*));
4064
rowids_buf_cur += sizeof(void*);
4075
DS-MRR implementation: multi_range_read_info() function
4077
int DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, uint *bufsz,
4078
uint *flags, COST_VECT *cost)
4081
uint def_flags= *flags;
4082
uint def_bufsz= *bufsz;
4084
/* Get cost/flags/mem_usage of default MRR implementation */
4085
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
4089
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
4090
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
4092
/* Default implementation is choosen */
4093
DBUG_PRINT("info", ("Default MRR implementation choosen"));
4099
DBUG_PRINT("info", ("DS-MRR implementation choosen"));
4106
DS-MRR Implementation: multi_range_read_info_const() function
4109
ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
4110
void *seq_init_param, uint n_ranges,
4111
uint *bufsz, uint *flags, COST_VECT *cost)
4114
uint def_flags= *flags;
4115
uint def_bufsz= *bufsz;
4116
/* Get cost/flags/mem_usage of default MRR implementation */
4117
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
4118
n_ranges, &def_bufsz,
4120
if (rows == HA_POS_ERROR)
4122
/* Default implementation can't perform MRR scan => we can't either */
4127
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
4128
use the default MRR implementation (we need it for UPDATE/DELETE).
4129
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
4131
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
4132
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
4134
DBUG_PRINT("info", ("Default MRR implementation choosen"));
4140
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
4141
DBUG_PRINT("info", ("DS-MRR implementation choosen"));
4148
Check if key has partially-covered columns
4150
We can't use DS-MRR to perform range scans when the ranges are over
4151
partially-covered keys, because we'll not have full key part values
4152
(we'll have their prefixes from the index) and will not be able to check
4153
if we've reached the end the range.
4155
@param keyno Key to check
4158
Allow use of DS-MRR in cases where the index has partially-covered
4159
components but they are not used for scanning.
4165
bool DsMrr_impl::key_uses_partial_cols(uint keyno)
4167
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
4168
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
4169
for (; kp != kp_end; kp++)
4171
if (!kp->field->part_of_key.is_set(keyno))
4179
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
4181
Make the choice between using Default MRR implementation and DS-MRR.
4182
This function contains common functionality factored out of dsmrr_info()
4183
and dsmrr_info_const(). The function assumes that the default MRR
4184
implementation's applicability requirements are satisfied.
4186
@param keyno Index number
4187
@param rows E(full rows to be retrieved)
4188
@param flags IN MRR flags provided by the MRR user
4189
OUT If DS-MRR is choosen, flags of DS-MRR implementation
4190
else the value is not modified
4191
@param bufsz IN If DS-MRR is choosen, buffer use of DS-MRR implementation
4192
else the value is not modified
4193
@param cost IN Cost of default MRR implementation
4194
OUT If DS-MRR is choosen, cost of DS-MRR scan
4195
else the value is not modified
4197
@retval TRUE Default MRR implementation should be used
4198
@retval FALSE DS-MRR implementation should be used
4201
bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
4202
uint *bufsz, COST_VECT *cost)
4204
COST_VECT dsmrr_cost;
4206
THD *thd= current_thd;
4207
if ((thd->variables.optimizer_use_mrr == 2) ||
4208
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
4209
(keyno == table->s->primary_key &&
4210
h->primary_key_is_clustered()) ||
4211
key_uses_partial_cols(keyno))
4213
/* Use the default implementation */
4214
*flags |= HA_MRR_USE_DEFAULT_IMPL;
4218
uint add_len= table->key_info[keyno].key_length + h->ref_length;
4220
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
4226
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
4227
DS-MRR and Default implementations cost. This allows one to force use of
4228
DS-MRR whenever it is applicable without affecting other cost-based
4231
if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) &&
4232
dsmrr_cost.total_cost() > cost->total_cost())
4235
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
4237
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
4238
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
4244
/* Use the default MRR implementation */
4251
static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost);
4255
Get cost of DS-MRR scan
4257
@param keynr Index to be used
4258
@param rows E(Number of rows to be scanned)
4259
@param flags Scan parameters (HA_MRR_* flags)
4260
@param buffer_size INOUT Buffer size
4261
@param cost OUT The cost
4264
@retval TRUE Error, DS-MRR cannot be used (the buffer is too small
4268
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
4269
uint *buffer_size, COST_VECT *cost)
4271
ulong max_buff_entries, elem_size;
4272
ha_rows rows_in_full_step, rows_in_last_step;
4274
double index_read_cost;
4276
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
4277
max_buff_entries = *buffer_size / elem_size;
4279
if (!max_buff_entries)
4280
return TRUE; /* Buffer has not enough space for even 1 rowid */
4282
/* Number of iterations we'll make with full buffer */
4283
n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
4286
Get numbers of rows we'll be processing in
4287
- non-last sweep, with full buffer
4288
- last iteration, with non-full buffer
4290
rows_in_full_step= max_buff_entries;
4291
rows_in_last_step= rows % max_buff_entries;
4293
/* Adjust buffer size if we expect to use only part of the buffer */
4296
get_sort_and_sweep_cost(table, rows, cost);
4297
cost->multiply(n_full_steps);
4302
*buffer_size= max(*buffer_size,
4303
(size_t)(1.2*rows_in_last_step) * elem_size +
4304
h->ref_length + table->key_info[keynr].key_length);
4307
COST_VECT last_step_cost;
4308
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
4309
cost->add(&last_step_cost);
4311
if (n_full_steps != 0)
4312
cost->mem_cost= *buffer_size;
4314
cost->mem_cost= (double)rows_in_last_step * elem_size;
4316
/* Total cost of all index accesses */
4317
index_read_cost= h->index_only_read_time(keynr, (double)rows);
4318
cost->add_io(index_read_cost, 1 /* Random seeks */);
4324
Get cost of one sort-and-sweep step
4327
get_sort_and_sweep_cost()
4328
table Table being accessed
4329
nrows Number of rows to be sorted and retrieved
4333
Get cost of these operations:
4334
- sort an array of #nrows ROWIDs using qsort
4335
- read #nrows records from table in a sweep.
4339
void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
4343
get_sweep_read_cost(table, nrows, FALSE, cost);
4344
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
4345
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
4348
cost->cpu_cost += cmp_op * log2(cmp_op);
4356
Get cost of reading nrows table records in a "disk sweep"
4358
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that made
4359
for an ordered sequence of rowids.
4361
We assume hard disk IO. The read is performed as follows:
4363
1. The disk head is moved to the needed cylinder
4364
2. The controller waits for the plate to rotate
4365
3. The data is transferred
4367
Time to do #3 is insignificant compared to #2+#1.
4369
Time to move the disk head is proportional to head travel distance.
4371
Time to wait for the plate to rotate depends on whether the disk head
4374
If disk head wasn't moved, the wait time is proportional to distance
4375
between the previous block and the block we're reading.
4377
If the head was moved, we don't know how much we'll need to wait for the
4378
plate to rotate. We assume the wait time to be a variate with a mean of
4379
0.5 of full rotation time.
4381
Our cost units are "random disk seeks". The cost of random disk seek is
4382
actually not a constant, it depends one range of cylinders we're going
4383
to access. We make it constant by introducing a fuzzy concept of "typical
4384
datafile length" (it's fuzzy as it's hard to tell whether it should
4385
include index file, temp.tables etc). Then random seek cost is:
4387
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
4389
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
4391
@param table Table to be accessed
4392
@param nrows Number of rows to retrieve
4393
@param interrupted TRUE <=> Assume that the disk sweep will be
4394
interrupted by other disk IO. FALSE - otherwise.
4395
@param cost OUT The cost.
4398
void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
4401
DBUG_ENTER("get_sweep_read_cost");
4404
if (table->file->primary_key_is_clustered())
4406
cost->io_count= table->file->read_time(table->s->primary_key,
4407
(uint) nrows, nrows);
4412
ceil(ulonglong2double(table->file->stats.data_file_length) / IO_SIZE);
4414
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
4415
if (busy_blocks < 1.0)
4418
DBUG_PRINT("info",("sweep: nblocks=%g, busy_blocks=%g", n_blocks,
4420
cost->io_count= busy_blocks;
4424
/* Assume reading is done in one 'sweep' */
4425
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
4426
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);
4429
DBUG_PRINT("info",("returning cost=%g", cost->total_cost()));
4434
/* **************************************************************************
4435
* DS-MRR implementation ends
4436
***************************************************************************/
4439
Read first row between two ranges.
4441
@param start_key Start key. Is 0 if no min range
4442
@param end_key End key. Is 0 if no max range
4443
@param eq_range_arg Set to 1 if start_key == end_key
4444
@param sorted Set to 1 if result should be sorted per key
4447
Record is read into table->record[0]
4452
HA_ERR_END_OF_FILE No rows in range
4456
int handler::read_range_first(const key_range *start_key,
4457
const key_range *end_key,
4459
bool sorted /* ignored */)
4462
DBUG_ENTER("handler::read_range_first");
4464
eq_range= eq_range_arg;
4468
end_range= &save_end_range;
4469
save_end_range= *end_key;
4470
key_compare_result_on_equal= ((end_key->flag == HA_READ_BEFORE_KEY) ? 1 :
4471
(end_key->flag == HA_READ_AFTER_KEY) ? -1 : 0);
4473
range_key_part= table->key_info[active_index].key_part;
4475
if (!start_key) // Read first record
4476
result= index_first(table->record[0]);
4478
result= index_read_map(table->record[0],
4480
start_key->keypart_map,
4483
DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND)
4484
? HA_ERR_END_OF_FILE
4487
DBUG_RETURN (compare_key(end_range) <= 0 ? 0 : HA_ERR_END_OF_FILE);
4492
Read next row between two endpoints.
4495
Record is read into table->record[0]
4500
HA_ERR_END_OF_FILE No rows in range
4504
int handler::read_range_next()
4507
DBUG_ENTER("handler::read_range_next");
4511
/* We trust that index_next_same always gives a row in range */
4512
DBUG_RETURN(index_next_same(table->record[0],
4514
end_range->length));
4516
result= index_next(table->record[0]);
4518
DBUG_RETURN(result);
4519
DBUG_RETURN(compare_key(end_range) <= 0 ? 0 : HA_ERR_END_OF_FILE);
4524
Compare if found key (in row) is over max-value.
4526
@param range range to compare to row. May be 0 for no range
4532
The return value is SIGN(key_in_row - range_key):
4534
- 0 : Key is equal to range or 'range' == 0 (no range)
4535
- -1 : Key is less than range
4536
- 1 : Key is larger than range
4538
int handler::compare_key(key_range *range)
4541
if (!range || in_range_check_pushed_down)
4542
return 0; // No max range
4543
cmp= key_cmp(range_key_part, range->key, range->length);
4545
cmp= key_compare_result_on_equal;
4551
Same as compare_key() but doesn't check have in_range_check_pushed_down.
4552
This is used by index condition pushdown implementation.
4555
int handler::compare_key2(key_range *range)
4559
return 0; // no max range
4560
cmp= key_cmp(range_key_part, range->key, range->length);
4562
cmp= key_compare_result_on_equal;
4566
int handler::index_read_idx_map(uchar * buf, uint index, const uchar * key,
4567
key_part_map keypart_map,
4568
enum ha_rkey_function find_flag)
4571
error= index_init(index, 0);
4574
error= index_read_map(buf, key, keypart_map, find_flag);
4575
error1= index_end();
4577
return error ? error : error1;
4582
Returns a list of all known extensions.
4584
No mutexes, worst case race is a minor surplus memory allocation
4585
We have to recreate the extension map if mysqld is restarted (for example
4589
pointer pointer to TYPELIB structure
4591
static my_bool exts_handlerton(THD *unused, plugin_ref plugin,
4594
List<char> *found_exts= (List<char> *) arg;
4595
handlerton *hton= plugin_data(plugin, handlerton *);
4597
if (hton->state == SHOW_OPTION_YES && hton->create &&
4598
(file= hton->create(hton, (TABLE_SHARE*) 0, current_thd->mem_root)))
4600
List_iterator_fast<char> it(*found_exts);
4601
const char **ext, *old_ext;
4603
for (ext= file->bas_ext(); *ext; ext++)
4605
while ((old_ext= it++))
4607
if (!strcmp(old_ext, *ext))
4611
found_exts->push_back((char *) *ext);
4620
TYPELIB *ha_known_exts(void)
4622
if (!known_extensions.type_names || mysys_usage_id != known_extensions_id)
4624
List<char> found_exts;
4625
const char **ext, *old_ext;
4627
known_extensions_id= mysys_usage_id;
4629
plugin_foreach(NULL, exts_handlerton,
4630
MYSQL_STORAGE_ENGINE_PLUGIN, &found_exts);
4632
ext= (const char **) my_once_alloc(sizeof(char *)*
4633
(found_exts.elements+1),
4634
MYF(MY_WME | MY_FAE));
4636
DBUG_ASSERT(ext != 0);
4637
known_extensions.count= found_exts.elements;
4638
known_extensions.type_names= ext;
4640
List_iterator_fast<char> it(found_exts);
4641
while ((old_ext= it++))
4645
return &known_extensions;
4649
static bool stat_print(THD *thd, const char *type, uint type_len,
4650
const char *file, uint file_len,
4651
const char *status, uint status_len)
4653
Protocol *protocol= thd->protocol;
4654
protocol->prepare_for_resend();
4655
protocol->store(type, type_len, system_charset_info);
4656
protocol->store(file, file_len, system_charset_info);
4657
protocol->store(status, status_len, system_charset_info);
4658
if (protocol->write())
4664
static my_bool showstat_handlerton(THD *thd, plugin_ref plugin,
4667
enum ha_stat_type stat= *(enum ha_stat_type *) arg;
4668
handlerton *hton= plugin_data(plugin, handlerton *);
4669
if (hton->state == SHOW_OPTION_YES && hton->show_status &&
4670
hton->show_status(hton, thd, stat_print, stat))
4675
bool ha_show_status(THD *thd, handlerton *db_type, enum ha_stat_type stat)
4677
List<Item> field_list;
4678
Protocol *protocol= thd->protocol;
4681
field_list.push_back(new Item_empty_string("Type",10));
4682
field_list.push_back(new Item_empty_string("Name",FN_REFLEN));
4683
field_list.push_back(new Item_empty_string("Status",10));
4685
if (protocol->send_fields(&field_list,
4686
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
4689
if (db_type == NULL)
4691
result= plugin_foreach(thd, showstat_handlerton,
4692
MYSQL_STORAGE_ENGINE_PLUGIN, &stat);
4696
if (db_type->state != SHOW_OPTION_YES)
4698
const LEX_STRING *name=&hton2plugin[db_type->slot]->name;
4699
result= stat_print(thd, name->str, name->length,
4700
"", 0, "DISABLED", 8) ? 1 : 0;
4703
result= db_type->show_status &&
4704
db_type->show_status(db_type, thd, stat_print, stat) ? 1 : 0;
4714
Check if the conditions for row-based binlogging is correct for the table.
4716
A row in the given table should be replicated if:
4717
- Row-based replication is enabled in the current thread
4718
- The binlog is enabled
4719
- It is not a temporary table
4720
- The binary log is open
4721
- The database the table resides in shall be binlogged (binlog_*_db rules)
4722
- table is not mysql.event
4725
static bool check_table_binlog_row_based(THD *thd, TABLE *table)
4727
if (table->s->cached_row_logging_check == -1)
4729
int const check(table->s->tmp_table == NO_TMP_TABLE &&
4730
binlog_filter->db_ok(table->s->db.str));
4731
table->s->cached_row_logging_check= check;
4734
DBUG_ASSERT(table->s->cached_row_logging_check == 0 ||
4735
table->s->cached_row_logging_check == 1);
4737
return (thd->current_stmt_binlog_row_based &&
4738
table->s->cached_row_logging_check &&
4739
(thd->options & OPTION_BIN_LOG) &&
4740
mysql_bin_log.is_open());
4745
Write table maps for all (manually or automatically) locked tables
4748
This function will generate and write table maps for all tables
4749
that are locked by the thread 'thd'. Either manually locked
4750
(stored in THD::locked_tables) and automatically locked (stored
4751
in THD::lock) are considered.
4753
@param thd Pointer to THD structure
4756
@retval 1 Failed to write all table maps
4763
static int write_locked_table_maps(THD *thd)
4765
DBUG_ENTER("write_locked_table_maps");
4766
DBUG_PRINT("enter", ("thd: 0x%lx thd->lock: 0x%lx thd->locked_tables: 0x%lx "
4767
"thd->extra_lock: 0x%lx",
4768
(long) thd, (long) thd->lock,
4769
(long) thd->locked_tables, (long) thd->extra_lock));
4771
if (thd->get_binlog_table_maps() == 0)
4773
MYSQL_LOCK *locks[3];
4774
locks[0]= thd->extra_lock;
4775
locks[1]= thd->lock;
4776
locks[2]= thd->locked_tables;
4777
for (uint i= 0 ; i < sizeof(locks)/sizeof(*locks) ; ++i )
4779
MYSQL_LOCK const *const lock= locks[i];
4783
TABLE **const end_ptr= lock->table + lock->table_count;
4784
for (TABLE **table_ptr= lock->table ;
4785
table_ptr != end_ptr ;
4788
TABLE *const table= *table_ptr;
4789
DBUG_PRINT("info", ("Checking table %s", table->s->table_name.str));
4790
if (table->current_lock == F_WRLCK &&
4791
check_table_binlog_row_based(thd, table))
4793
int const has_trans= table->file->has_transactions();
4794
int const error= thd->binlog_write_table_map(table, has_trans);
4796
If an error occurs, it is the responsibility of the caller to
4797
roll back the transaction.
4799
if (unlikely(error))
4809
typedef bool Log_func(THD*, TABLE*, bool, const uchar*, const uchar*);
4811
static int binlog_log_row(TABLE* table,
4812
const uchar *before_record,
4813
const uchar *after_record,
4816
if (table->no_replicate)
4819
THD *const thd= table->in_use;
4821
if (check_table_binlog_row_based(thd, table))
4823
DBUG_DUMP("read_set 10", (uchar*) table->read_set->bitmap,
4824
(table->s->fields + 7) / 8);
4826
If there are no table maps written to the binary log, this is
4827
the first row handled in this statement. In that case, we need
4828
to write table maps for all locked tables to the binary log.
4830
if (likely(!(error= write_locked_table_maps(thd))))
4832
bool const has_trans= table->file->has_transactions();
4833
error= (*log_func)(thd, table, has_trans, before_record, after_record);
4836
return error ? HA_ERR_RBR_LOGGING_FAILED : 0;
4839
int handler::ha_external_lock(THD *thd, int lock_type)
4841
DBUG_ENTER("handler::ha_external_lock");
4843
Whether this is lock or unlock, this should be true, and is to verify that
4844
if get_auto_increment() was called (thus may have reserved intervals or
4845
taken a table lock), ha_release_auto_increment() was too.
4847
DBUG_ASSERT(next_insert_id == 0);
4850
We cache the table flags if the locking succeeded. Otherwise, we
4851
keep them as they were when they were fetched in ha_open().
4853
MYSQL_EXTERNAL_LOCK(lock_type);
4855
int error= external_lock(thd, lock_type);
4857
cached_table_flags= table_flags();
4863
Check handler usage and reset state of file to after 'open'
4865
int handler::ha_reset()
4867
DBUG_ENTER("ha_reset");
4868
/* Check that we have called all proper deallocation functions */
4869
DBUG_ASSERT((uchar*) table->def_read_set.bitmap +
4870
table->s->column_bitmap_size ==
4871
(uchar*) table->def_write_set.bitmap);
4872
DBUG_ASSERT(bitmap_is_set_all(&table->s->all_set));
4873
DBUG_ASSERT(table->key_read == 0);
4874
/* ensure that ha_index_end / ha_rnd_end has been called */
4875
DBUG_ASSERT(inited == NONE);
4876
/* Free cache used by filesort */
4877
free_io_cache(table);
4878
/* reset the bitmaps to point to defaults */
4879
table->default_column_bitmaps();
4880
DBUG_RETURN(reset());
4884
int handler::ha_write_row(uchar *buf)
4887
Log_func *log_func= Write_rows_log_event::binlog_row_logging_function;
4888
DBUG_ENTER("handler::ha_write_row");
4889
MYSQL_INSERT_ROW_START();
4891
mark_trx_read_write();
4893
if (unlikely(error= write_row(buf)))
4895
if (unlikely(error= binlog_log_row(table, 0, buf, log_func)))
4896
DBUG_RETURN(error); /* purecov: inspected */
4897
MYSQL_INSERT_ROW_END();
4902
int handler::ha_update_row(const uchar *old_data, uchar *new_data)
4905
Log_func *log_func= Update_rows_log_event::binlog_row_logging_function;
4908
Some storage engines require that the new record is in record[0]
4909
(and the old record is in record[1]).
4911
DBUG_ASSERT(new_data == table->record[0]);
4913
mark_trx_read_write();
4915
if (unlikely(error= update_row(old_data, new_data)))
4917
if (unlikely(error= binlog_log_row(table, old_data, new_data, log_func)))
4922
int handler::ha_delete_row(const uchar *buf)
4925
Log_func *log_func= Delete_rows_log_event::binlog_row_logging_function;
4927
mark_trx_read_write();
4929
if (unlikely(error= delete_row(buf)))
4931
if (unlikely(error= binlog_log_row(table, buf, 0, log_func)))
4940
use_hidden_primary_key() is called in case of an update/delete when
4941
(table_flags() and HA_PRIMARY_KEY_REQUIRED_FOR_DELETE) is defined
4942
but we don't have a primary key
4944
void handler::use_hidden_primary_key()
4946
/* fallback to use all columns in the table to identify row */
4947
table->use_all_columns();