1
/* Copyright (C) 2000-2006 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
1
/* -*- mode: c++; c-basic-offset: 2; indent-tabs-mode: nil; -*-
2
* vim:expandtab:shiftwidth=2:tabstop=2:smarttab:
4
* Copyright (C) 2008 Sun Microsystems
6
* This program is free software; you can redistribute it and/or modify
7
* it under the terms of the GNU General Public License as published by
8
* the Free Software Foundation; version 2 of the License.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
23
Handler-calling-functions
22
#ifdef USE_PRAGMA_IMPLEMENTATION
23
#pragma implementation // gcc: Class implementation
26
#include <drizzled/server_includes.h>
27
#include "rpl_filter.h"
28
#include <drizzled/drizzled_error_messages.h>
31
While we have legacy_db_type, we have this array to
32
check for dups and to find handlerton from legacy_db_type.
33
Remove when legacy_db_type is finally gone
35
st_plugin_int *hton2plugin[MAX_HA];
37
static handlerton *installed_htons[128];
39
#define BITMAP_STACKBUF_SIZE (128/8)
41
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NullS,0}, {NullS,0} };
43
/* number of entries in handlertons[] */
45
/* number of storage engines (from handlertons[]) that support 2pc */
46
uint32_t total_ha_2pc= 0;
47
/* size of savepoint storage area (see ha_init) */
48
uint32_t savepoint_alloc_size= 0;
50
static const LEX_STRING sys_table_aliases[]=
52
{ C_STRING_WITH_LEN("INNOBASE") }, { C_STRING_WITH_LEN("INNODB") },
53
{ C_STRING_WITH_LEN("HEAP") }, { C_STRING_WITH_LEN("MEMORY") },
57
const char *ha_row_type[] = {
58
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
61
const char *tx_isolation_names[] =
62
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
64
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
65
tx_isolation_names, NULL};
67
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
68
uint known_extensions_id= 0;
72
static plugin_ref ha_default_plugin(THD *thd)
74
if (thd->variables.table_plugin)
75
return thd->variables.table_plugin;
76
return my_plugin_lock(thd, &global_system_variables.table_plugin);
81
Return the default storage engine handlerton for thread
83
@param ha_default_handlerton(thd)
84
@param thd current thread
89
handlerton *ha_default_handlerton(THD *thd)
91
plugin_ref plugin= ha_default_plugin(thd);
93
handlerton *hton= plugin_data(plugin, handlerton*);
100
Return the storage engine handlerton for the supplied name
102
@param thd current thread
103
@param name name of storage engine
106
pointer to storage engine plugin handle
108
plugin_ref ha_resolve_by_name(THD *thd, const LEX_STRING *name)
110
const LEX_STRING *table_alias;
114
/* my_strnncoll is a macro and gcc doesn't do early expansion of macro */
115
if (thd && !my_charset_latin1.coll->strnncoll(&my_charset_latin1,
116
(const uchar *)name->str, name->length,
117
(const uchar *)STRING_WITH_LEN("DEFAULT"), 0))
118
return ha_default_plugin(thd);
120
if ((plugin= my_plugin_lock_by_name(thd, name, MYSQL_STORAGE_ENGINE_PLUGIN)))
122
handlerton *hton= plugin_data(plugin, handlerton *);
123
if (!(hton->flags & HTON_NOT_USER_SELECTABLE))
127
unlocking plugin immediately after locking is relatively low cost.
129
plugin_unlock(thd, plugin);
133
We check for the historical aliases.
135
for (table_alias= sys_table_aliases; table_alias->str; table_alias+= 2)
137
if (!my_strnncoll(&my_charset_latin1,
138
(const uchar *)name->str, name->length,
139
(const uchar *)table_alias->str, table_alias->length))
141
name= table_alias + 1;
150
plugin_ref ha_lock_engine(THD *thd, handlerton *hton)
154
st_plugin_int **plugin= hton2plugin + hton->slot;
156
return my_plugin_lock(thd, &plugin);
162
handlerton *ha_resolve_by_legacy_type(THD *thd, enum legacy_db_type db_type)
166
case DB_TYPE_DEFAULT:
167
return ha_default_handlerton(thd);
169
if (db_type > DB_TYPE_UNKNOWN && db_type < DB_TYPE_DEFAULT &&
170
(plugin= ha_lock_engine(thd, installed_htons[db_type])))
171
return plugin_data(plugin, handlerton*);
173
case DB_TYPE_UNKNOWN:
180
Use another database handler if the requested one is not compiled in.
182
handlerton *ha_checktype(THD *thd, enum legacy_db_type database_type,
183
bool no_substitute, bool report_error)
185
handlerton *hton= ha_resolve_by_legacy_type(thd, database_type);
186
if (ha_storage_engine_is_enabled(hton))
193
const char *engine_name= ha_resolve_storage_engine_name(hton);
194
my_error(ER_FEATURE_DISABLED,MYF(0),engine_name,engine_name);
199
switch (database_type) {
201
return ha_resolve_by_legacy_type(thd, DB_TYPE_HASH);
206
return ha_default_handlerton(thd);
210
handler *get_new_handler(TABLE_SHARE *share, MEM_ROOT *alloc,
215
if (db_type && db_type->state == SHOW_OPTION_YES && db_type->create)
217
if ((file= db_type->create(db_type, share, alloc)))
222
Try the default table type
223
Here the call to current_thd() is ok as we call this function a lot of
224
times but we enter this branch very seldom.
226
return(get_new_handler(share, alloc, ha_default_handlerton(current_thd)));
231
Register handler error messages for use with my_error().
239
int ha_init_errors(void)
241
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
242
const char **errmsgs;
244
/* Allocate a pointer array for the error message strings. */
245
/* Zerofill it to avoid uninitialized gaps. */
246
if (! (errmsgs= (const char**) my_malloc(HA_ERR_ERRORS * sizeof(char*),
247
MYF(MY_WME | MY_ZEROFILL))))
250
/* Set the dedicated error messages. */
251
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
252
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
253
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
254
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
255
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
256
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
257
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
258
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
259
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
260
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
261
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
262
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
263
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
264
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
265
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
266
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
267
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
268
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
269
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
270
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
271
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
272
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
273
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
274
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
275
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
276
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
277
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
278
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
279
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
280
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
281
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
282
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
283
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
284
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
285
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
286
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
287
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
288
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
289
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
290
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
291
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
293
/* Register the error messages for use with my_error(). */
294
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
299
Unregister handler error messages.
306
static int ha_finish_errors(void)
308
const char **errmsgs;
310
/* Unregister the error message array so that it can be freed. */
311
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
313
my_free((uchar*) errmsgs, MYF(0));
318
int ha_finalize_handlerton(st_plugin_int *plugin)
320
handlerton *hton= (handlerton *)plugin->data;
325
case SHOW_OPTION_DISABLED:
327
case SHOW_OPTION_YES:
328
if (installed_htons[hton->db_type] == hton)
329
installed_htons[hton->db_type]= NULL;
333
if (hton && plugin->plugin->deinit)
334
(void)plugin->plugin->deinit(hton);
336
my_free((uchar*)hton, MYF(0));
342
int ha_initialize_handlerton(st_plugin_int *plugin)
346
hton= (handlerton *)my_malloc(sizeof(handlerton),
347
MYF(MY_WME | MY_ZEROFILL));
349
FIXME: the MY_ZEROFILL flag above doesn't zero all the bytes.
351
This was detected after adding get_backup_engine member to handlerton
352
structure. Apparently get_backup_engine was not NULL even though it was
355
/* Clear the entire handlerton structure; hton is a pointer, so the size
   must be taken from the object it points to (*hton). */
memset(hton, 0, sizeof(*hton));
356
/* Historical Requirement */
357
plugin->data= hton; // shortcut for the future
358
if (plugin->plugin->init)
360
if (plugin->plugin->init(hton))
362
sql_print_error("Plugin '%s' init function returned error.",
369
the switch below and hton->state should be removed when
370
command-line options for plugins will be implemented
372
switch (hton->state) {
375
case SHOW_OPTION_YES:
378
/* now check the db_type for conflict */
379
if (hton->db_type <= DB_TYPE_UNKNOWN ||
380
hton->db_type >= DB_TYPE_DEFAULT ||
381
installed_htons[hton->db_type])
383
int idx= (int) DB_TYPE_FIRST_DYNAMIC;
385
while (idx < (int) DB_TYPE_DEFAULT && installed_htons[idx])
388
if (idx == (int) DB_TYPE_DEFAULT)
390
sql_print_warning("Too many storage engines!");
393
if (hton->db_type != DB_TYPE_UNKNOWN)
394
sql_print_warning("Storage engine '%s' has conflicting typecode. "
395
"Assigning value %d.", plugin->plugin->name, idx);
396
hton->db_type= (enum legacy_db_type) idx;
398
installed_htons[hton->db_type]= hton;
399
tmp= hton->savepoint_offset;
400
hton->savepoint_offset= savepoint_alloc_size;
401
savepoint_alloc_size+= tmp;
402
hton->slot= total_ha++;
403
hton2plugin[hton->slot]=plugin;
410
hton->state= SHOW_OPTION_DISABLED;
415
This is entirely for legacy. We will create a new "disk based" hton and a
416
"memory" hton which will be configurable longterm. We should be able to
417
remove partition and myisammrg.
419
switch (hton->db_type) {
439
assert(total_ha < MAX_HA);
441
Check if there is a transaction-capable storage engine besides the
442
binary log (which is considered a transaction-capable storage engine in
445
opt_using_transactions= total_ha>(uint32_t)opt_bin_log;
446
savepoint_alloc_size+= sizeof(SAVEPOINT);
455
This should be eventually based on the graceful shutdown flag.
456
So if flag is equal to HA_PANIC_CLOSE, the deallocate
459
if (ha_finish_errors())
465
static bool dropdb_handlerton(THD *unused1 __attribute__((unused)),
469
handlerton *hton= plugin_data(plugin, handlerton *);
470
if (hton->state == SHOW_OPTION_YES && hton->drop_database)
471
hton->drop_database(hton, (char *)path);
476
void ha_drop_database(char* path)
478
plugin_foreach(NULL, dropdb_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, path);
482
static bool closecon_handlerton(THD *thd, plugin_ref plugin,
483
void *unused __attribute__((unused)))
485
handlerton *hton= plugin_data(plugin, handlerton *);
487
there's no need to rollback here as all transactions must
488
be rolled back already
490
if (hton->state == SHOW_OPTION_YES && hton->close_connection &&
491
thd_get_ha_data(thd, hton))
492
hton->close_connection(hton, thd);
499
don't bother to rollback here, it's done already
501
void ha_close_connection(THD* thd)
503
plugin_foreach(thd, closecon_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, 0);
506
/* ========================================================================
507
======================= TRANSACTIONS ===================================*/
510
Transaction handling in the server
511
==================================
513
In each client connection, MySQL maintains two transactional
515
- a statement transaction,
516
- a standard, also called normal transaction.
520
"Statement transaction" is a non-standard term that comes
521
from the times when MySQL supported BerkeleyDB storage engine.
523
First of all, it should be said that in BerkeleyDB auto-commit
524
mode auto-commits operations that are atomic to the storage
525
engine itself, such as a write of a record, and are too
526
high-granular to be atomic from the application perspective
527
(MySQL). One SQL statement could involve many BerkeleyDB
528
auto-committed operations and thus BerkeleyDB auto-commit was of
531
Secondly, instead of SQL standard savepoints, BerkeleyDB
532
provided the concept of "nested transactions". In a nutshell,
533
transactions could be arbitrarily nested, but when the parent
534
transaction was committed or aborted, all its child (nested)
535
transactions were committed or aborted as well.
536
Commit of a nested transaction, in turn, made its changes
537
visible, but not durable: it destroyed the nested transaction,
538
all its changes would become available to the parent and
539
currently active nested transactions of this parent.
541
So the mechanism of nested transactions was employed to
542
provide "all or nothing" guarantee of SQL statements
543
required by the standard.
544
A nested transaction would be created at start of each SQL
545
statement, and destroyed (committed or aborted) at statement
546
end. Such nested transaction was internally referred to as
547
a "statement transaction" and gave birth to the term.
549
<Historical note ends>
551
Since then a statement transaction is started for each statement
552
that accesses transactional tables or uses the binary log. If
553
the statement succeeds, the statement transaction is committed.
554
If the statement fails, the transaction is rolled back. Commits
555
of statement transactions are not durable -- each such
556
transaction is nested in the normal transaction, and if the
557
normal transaction is rolled back, the effects of all enclosed
558
statement transactions are undone as well. Technically,
559
a statement transaction can be viewed as a savepoint which is
560
maintained automatically in order to make effects of one
563
The normal transaction is started by the user and is ended
564
usually upon a user request as well. The normal transaction
565
encloses transactions of all statements issued between
566
its beginning and its end.
567
In autocommit mode, the normal transaction is equivalent
568
to the statement transaction.
570
Since MySQL supports PSEA (pluggable storage engine
571
architecture), more than one transactional engine can be
572
active at a time. Hence transactions, from the server
573
point of view, are always distributed. In particular,
574
transactional state is maintained independently for each
575
engine. In order to commit a transaction the two phase
576
commit protocol is employed.
578
Not all statements are executed in context of a transaction.
579
Administrative and status information statements do not modify
580
engine data, and thus do not start a statement transaction and
581
also have no effect on the normal transaction. Examples of such
582
statements are SHOW STATUS and RESET SLAVE.
584
Similarly DDL statements are not transactional,
585
and therefore a transaction is [almost] never started for a DDL
586
statement. The difference between a DDL statement and a purely
587
administrative statement though is that a DDL statement always
588
commits the current transaction before proceeding, if there is
591
At last, SQL statements that work with non-transactional
592
engines also have no effect on the transaction state of the
593
connection. Even though they are written to the binary log,
594
and the binary log is, overall, transactional, the writes
595
are done in "write-through" mode, directly to the binlog
596
file, followed by an OS cache sync, in other words,
597
bypassing the binlog undo log (translog).
598
They do not commit the current normal transaction.
599
A failure of a statement that uses non-transactional tables
600
would cause a rollback of the statement transaction, but
601
in case there no non-transactional tables are used,
602
no statement transaction is started.
607
The server stores its transaction-related data in
608
thd->transaction. This structure has two members of type
609
THD_TRANS. These members correspond to the statement and
610
normal transactions respectively:
612
- thd->transaction.stmt contains a list of engines
613
that are participating in the given statement
614
- thd->transaction.all contains a list of engines that
615
have participated in any of the statement transactions started
616
within the context of the normal transaction.
617
Each element of the list contains a pointer to the storage
618
engine, engine-specific transactional data, and engine-specific
621
In autocommit mode thd->transaction.all is empty.
622
Instead, data of thd->transaction.stmt is
623
used to commit/rollback the normal transaction.
625
The list of registered engines has a few important properties:
626
- no engine is registered in the list twice
627
- engines are present in the list in reverse temporal order --
628
new participants are always added to the beginning of the list.
630
Transaction life cycle
631
----------------------
633
When a new connection is established, thd->transaction
634
members are initialized to an empty state.
635
If a statement uses any tables, all affected engines
636
are registered in the statement engine list. In
637
non-autocommit mode, the same engines are registered in
638
the normal transaction list.
639
At the end of the statement, the server issues a commit
640
or a roll back for all engines in the statement list.
641
At this point transaction flags of an engine, if any, are
642
propagated from the statement list to the list of the normal
644
When commit/rollback is finished, the statement list is
645
cleared. It will be filled in again by the next statement,
646
and emptied again at the next statement's end.
648
The normal transaction is committed in a similar way
649
(by going over all engines in thd->transaction.all list)
650
but at different times:
651
- when a COMMIT SQL statement is issued by the user
652
- implicitly, by the server, at the beginning of a DDL statement
653
or SET AUTOCOMMIT={0|1} statement.
655
The normal transaction can be rolled back as well:
656
- if the user has requested so, by issuing ROLLBACK SQL
658
- if one of the storage engines requested a rollback
659
by setting thd->transaction_rollback_request. This may
660
happen in case, e.g., when the transaction in the engine was
661
chosen as a victim of the internal deadlock resolution algorithm
662
and rolled back internally. When such a situation happens, there
663
is little the server can do and the only option is to rollback
664
transactions in all other participating engines. In this case
665
the rollback is accompanied by an error sent to the user.
667
As follows from the use cases above, the normal transaction
668
is never committed when there is an outstanding statement
669
transaction. In most cases there is no conflict, since
670
commits of the normal transaction are issued by a stand-alone
671
administrative or DDL statement, thus no outstanding statement
672
transaction of the previous statement exists. Besides,
673
all statements that manipulate the normal transaction
674
are prohibited in stored functions and triggers, therefore
675
no conflicting situation can occur in a sub-statement either.
676
The remaining rare cases when the server explicitly has
677
to commit the statement transaction prior to committing the normal
678
one cover error-handling scenarios (see for example
681
When committing a statement or a normal transaction, the server
682
either uses the two-phase commit protocol, or issues a commit
683
in each engine independently. The two-phase commit protocol
685
- all participating engines support two-phase commit (provide
686
handlerton::prepare PSEA API call) and
687
- transactions in at least two engines modify data (i.e. are
690
Note that the two phase commit is used for
691
statement transactions, even though they are not durable anyway.
692
This is done to ensure logical consistency of data in a multiple-
694
For example, imagine that some day MySQL supports unique
695
constraint checks deferred till the end of statement. In such
696
case a commit in one of the engines may yield ER_DUP_KEY,
697
and MySQL should be able to gracefully abort statement
698
transactions of other participants.
700
After the normal transaction has been committed,
701
thd->transaction.all list is cleared.
703
When a connection is closed, the current normal transaction, if
706
Roles and responsibilities
707
--------------------------
709
The server has no way to know that an engine participates in
710
the statement and a transaction has been started
711
in it unless the engine says so. Thus, in order to be
712
a part of a transaction, the engine must "register" itself.
713
This is done by invoking trans_register_ha() server call.
714
Normally the engine registers itself whenever handler::external_lock()
715
is called. trans_register_ha() can be invoked many times: if
716
an engine is already registered, the call does nothing.
717
In case autocommit is not set, the engine must register itself
718
twice -- both in the statement list and in the normal transaction
720
In which list to register is a parameter of trans_register_ha().
722
Note, that although the registration interface in itself is
723
fairly clear, the current usage practice often leads to undesired
724
effects. E.g. since a call to trans_register_ha() in most engines
725
is embedded into implementation of handler::external_lock(), some
726
DDL statements start a transaction (at least from the server
727
point of view) even though they are not expected to. E.g.
728
CREATE TABLE does not start a transaction, since
729
handler::external_lock() is never called during CREATE TABLE. But
730
CREATE TABLE ... SELECT does, since handler::external_lock() is
731
called for the table that is being selected from. This has no
732
practical effects currently, but must be kept in mind
735
Once an engine is registered, the server will do the rest
738
During statement execution, whenever any of data-modifying
739
PSEA API methods is used, e.g. handler::write_row() or
740
handler::update_row(), the read-write flag is raised in the
741
statement transaction for the involved engine.
742
Currently all PSEA calls are "traced", and the data can not be
743
changed in a way other than issuing a PSEA call. Important:
744
unless this invariant is preserved the server will not know that
745
a transaction in a given engine is read-write and will not
746
involve the two-phase commit protocol!
748
At the end of a statement, server call
749
ha_autocommit_or_rollback() is invoked. This call in turn
750
invokes handlerton::prepare() for every involved engine.
751
Prepare is followed by a call to handlerton::commit_one_phase()
752
If a one-phase commit will suffice, handlerton::prepare() is not
753
invoked and the server only calls handlerton::commit_one_phase().
754
At statement commit, the statement-related read-write engine
755
flag is propagated to the corresponding flag in the normal
756
transaction. When the commit is complete, the list of registered
759
Rollback is handled in a similar fashion.
761
Additional notes on DDL and the normal transaction.
762
---------------------------------------------------
764
DDLs and operations with non-transactional engines
765
do not "register" in thd->transaction lists, and thus do not
766
modify the transaction state. Besides, each DDL in
767
MySQL is prefixed with an implicit normal transaction commit
768
(a call to end_active_trans()), and thus leaves nothing
770
However, as it has been pointed out with CREATE TABLE .. SELECT,
771
some DDL statements can start a *new* transaction.
773
Behaviour of the server in this case is currently badly
775
DDL statements use a form of "semantic" logging
776
to maintain atomicity: if CREATE TABLE .. SELECT failed,
777
the newly created table is deleted.
778
In addition, some DDL statements issue interim transaction
779
commits: e.g. ALTER TABLE issues a commit after data is copied
780
from the original table to the internal temporary table. Other
781
statements, e.g. CREATE TABLE ... SELECT do not always commit
783
And finally there is a group of DDL statements such as
784
RENAME/DROP TABLE that doesn't start a new transaction
787
This diversity makes it hard to say what will happen if
788
by chance a stored function is invoked during a DDL --
789
whether any modifications it makes will be committed or not
790
is not clear. Fortunately, SQL grammar of few DDLs allows
791
invocation of a stored function.
793
A consistent behaviour is perhaps to always commit the normal
794
transaction after all DDLs, just like the statement transaction
795
is always committed at the end of all statements.
799
Register a storage engine for a transaction.
801
Every storage engine MUST call this function when it starts
802
a transaction or a statement (that is it must be called both for the
803
"beginning of transaction" and "beginning of statement").
804
Only storage engines registered for the transaction/statement
805
will know when to commit/rollback it.
808
trans_register_ha is idempotent - storage engine may register many
809
times per transaction.
812
void trans_register_ha(THD *thd, bool all, handlerton *ht_arg)
815
Ha_trx_info *ha_info;
819
trans= &thd->transaction.all;
820
thd->server_status|= SERVER_STATUS_IN_TRANS;
823
trans= &thd->transaction.stmt;
825
ha_info= thd->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
827
if (ha_info->is_started())
828
return; /* already registered, return */
830
ha_info->register_ha(trans, ht_arg);
832
trans->no_2pc|=(ht_arg->prepare==0);
833
if (thd->transaction.xid_state.xid.is_null())
834
thd->transaction.xid_state.xid.set(thd->query_id);
843
1 error, transaction was rolled back
845
int ha_prepare(THD *thd)
848
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
849
Ha_trx_info *ha_info= trans->ha_list;
852
for (; ha_info; ha_info= ha_info->next())
855
handlerton *ht= ha_info->ht();
856
status_var_increment(thd->status_var.ha_prepare_count);
859
if ((err= ht->prepare(ht, thd, all)))
861
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
862
ha_rollback_trans(thd, all);
869
push_warning_printf(thd, DRIZZLE_ERROR::WARN_LEVEL_WARN,
870
ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
871
ha_resolve_storage_engine_name(ht));
879
Check if we can skip the two-phase commit.
881
A helper function to evaluate if two-phase commit is mandatory.
882
As a side effect, propagates the read-only/read-write flags
883
of the statement transaction to its enclosing normal transaction.
885
@retval true we must run a two-phase commit. Returned
886
if we have at least two engines with read-write changes.
887
@retval false Don't need two-phase commit. Even if we have two
888
transactional engines, we can run two independent
889
commits if changes in one of the engines are read-only.
894
ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
897
/* The number of storage engines that have actual changes. */
898
unsigned rw_ha_count= 0;
899
Ha_trx_info *ha_info;
901
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
903
if (ha_info->is_trx_read_write())
908
Ha_trx_info *ha_info_all= &thd->ha_data[ha_info->ht()->slot].ha_info[1];
909
assert(ha_info != ha_info_all);
911
Merge read-only/read-write information about statement
912
transaction to its enclosing normal transaction. Do this
913
only if in a real transaction -- that is, if we know
914
that ha_info_all is registered in thd->transaction.all.
915
Since otherwise we only clutter the normal transaction flags.
917
if (ha_info_all->is_started()) /* false if autocommit. */
918
ha_info_all->coalesce_trx_with(ha_info);
920
else if (rw_ha_count > 1)
923
It is a normal transaction, so we don't need to merge read/write
924
information up, and the need for two-phase commit has been
925
already established. Break the loop prematurely.
930
return rw_ha_count > 1;
938
1 transaction was rolled back
940
2 error during commit, data may be inconsistent
943
Since we don't support nested statement transactions in 5.0,
944
we can't commit or rollback stmt transactions while we are inside
945
stored functions or triggers. So we simply do nothing now.
946
TODO: This should be fixed in later ( >= 5.1) releases.
948
int ha_commit_trans(THD *thd, bool all)
950
int error= 0, cookie= 0;
952
'all' means that this is either an explicit commit issued by
953
user, or an implicit commit issued by a DDL.
955
THD_TRANS *trans= all ? &thd->transaction.all : &thd->transaction.stmt;
956
bool is_real_trans= all || thd->transaction.all.ha_list == 0;
957
Ha_trx_info *ha_info= trans->ha_list;
958
my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
961
We must not commit the normal transaction if a statement
962
transaction is pending. Otherwise statement transaction
963
flags will not get propagated to its normal transaction's
966
assert(thd->transaction.stmt.ha_list == NULL ||
967
trans == &thd->transaction.stmt);
969
if (thd->in_sub_stmt)
972
Since we don't support nested statement transactions in 5.0,
973
we can't commit or rollback stmt transactions while we are inside
974
stored functions or triggers. So we simply do nothing now.
975
TODO: This should be fixed in later ( >= 5.1) releases.
980
We assume that all statements which commit or rollback main transaction
981
are prohibited inside of stored functions or triggers. So they should
982
bail out with error even before ha_commit_trans() call. To be 100% safe
983
let us throw error in non-debug builds.
986
my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
993
if (is_real_trans && wait_if_global_read_lock(thd, 0, 0))
995
ha_rollback_trans(thd, all);
1001
&& ! thd->slave_thread
1004
my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
1005
ha_rollback_trans(thd, all);
1010
must_2pc= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
1012
if (!trans->no_2pc && must_2pc)
1014
for (; ha_info && !error; ha_info= ha_info->next())
1017
handlerton *ht= ha_info->ht();
1019
Do not call two-phase commit if this particular
1020
transaction is read-only. This allows for simpler
1021
implementation in engines that are always read-only.
1023
if (! ha_info->is_trx_read_write())
1026
Sic: we know that prepare() is not NULL since otherwise
1027
trans->no_2pc would have been set.
1029
if ((err= ht->prepare(ht, thd, all)))
1031
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1034
status_var_increment(thd->status_var.ha_prepare_count);
1036
if (error || (is_real_trans && xid &&
1037
(error= !(cookie= tc_log->log_xid(thd, xid)))))
1039
ha_rollback_trans(thd, all);
1044
error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
1046
tc_log->unlog(cookie, xid);
1049
start_waiting_global_read_lock(thd);
1056
This function does not care about global read lock. A caller should.
1058
int ha_commit_one_phase(THD *thd, bool all)
1061
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1062
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1063
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1066
for (; ha_info; ha_info= ha_info_next)
1069
handlerton *ht= ha_info->ht();
1070
if ((err= ht->commit(ht, thd, all)))
1072
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1075
status_var_increment(thd->status_var.ha_commit_count);
1076
ha_info_next= ha_info->next();
1077
ha_info->reset(); /* keep it conveniently zero-filled */
1082
thd->transaction.xid_state.xid.null();
1085
thd->variables.tx_isolation=thd->session_tx_isolation;
1086
thd->transaction.cleanup();
1093
int ha_rollback_trans(THD *thd, bool all)
1096
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1097
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1098
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1101
We must not rollback the normal transaction if a statement
1102
transaction is pending.
1104
assert(thd->transaction.stmt.ha_list == NULL ||
1105
trans == &thd->transaction.stmt);
1107
if (thd->in_sub_stmt)
1110
If we are inside stored function or trigger we should not commit or
1111
rollback current statement transaction. See comment in ha_commit_trans()
1112
call for more information.
1117
my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
1122
for (; ha_info; ha_info= ha_info_next)
1125
handlerton *ht= ha_info->ht();
1126
if ((err= ht->rollback(ht, thd, all)))
1128
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1131
status_var_increment(thd->status_var.ha_rollback_count);
1132
ha_info_next= ha_info->next();
1133
ha_info->reset(); /* keep it conveniently zero-filled */
1138
thd->transaction.xid_state.xid.null();
1141
thd->variables.tx_isolation=thd->session_tx_isolation;
1142
thd->transaction.cleanup();
1146
thd->transaction_rollback_request= false;
1149
If a non-transactional table was updated, warn; don't warn if this is a
1150
slave thread (because when a slave thread executes a ROLLBACK, it has
1151
been read from the binary log, so it's 100% sure and normal to produce
1152
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
1153
slave SQL thread, it would not stop the thread but just be printed in
1154
the error log; but we don't want users to wonder why they have this
1155
message in the error log, so we don't send it.
1157
if (is_real_trans && thd->transaction.all.modified_non_trans_table &&
1158
!thd->slave_thread && thd->killed != THD::KILL_CONNECTION)
1159
push_warning(thd, DRIZZLE_ERROR::WARN_LEVEL_WARN,
1160
ER_WARNING_NOT_COMPLETE_ROLLBACK,
1161
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
1166
This is used to commit or rollback a single statement depending on
1170
Note that if the autocommit is on, then the following call inside
1171
InnoDB will commit or rollback the whole transaction (= the statement). The
1172
autocommit mechanism built into InnoDB is based on counting locks, but if
1173
the user has used LOCK TABLES then that mechanism does not know to do the
1176
int ha_autocommit_or_rollback(THD *thd, int error)
1178
if (thd->transaction.stmt.ha_list)
1182
if (ha_commit_trans(thd, 0))
1187
(void) ha_rollback_trans(thd, 0);
1188
if (thd->transaction_rollback_request && !thd->in_sub_stmt)
1189
(void) ha_rollback(thd);
1192
thd->variables.tx_isolation=thd->session_tx_isolation;
1203
static bool xacommit_handlerton(THD *unused1 __attribute__((unused)),
1207
handlerton *hton= plugin_data(plugin, handlerton *);
1208
if (hton->state == SHOW_OPTION_YES && hton->recover)
1210
hton->commit_by_xid(hton, ((struct xahton_st *)arg)->xid);
1211
((struct xahton_st *)arg)->result= 0;
1216
static bool xarollback_handlerton(THD *unused1 __attribute__((unused)),
1220
handlerton *hton= plugin_data(plugin, handlerton *);
1221
if (hton->state == SHOW_OPTION_YES && hton->recover)
1223
hton->rollback_by_xid(hton, ((struct xahton_st *)arg)->xid);
1224
((struct xahton_st *)arg)->result= 0;
1230
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
1232
struct xahton_st xaop;
1236
plugin_foreach(NULL, commit ? xacommit_handlerton : xarollback_handlerton,
1237
MYSQL_STORAGE_ENGINE_PLUGIN, &xaop);
1243
recover() step of xa.
1246
there are three modes of operation:
1247
- automatic recover after a crash
1248
in this case commit_list != 0, tc_heuristic_recover==0
1249
all xids from commit_list are committed, others are rolled back
1250
- manual (heuristic) recover
1251
in this case commit_list==0, tc_heuristic_recover != 0
1252
DBA has explicitly specified that all prepared transactions should
1253
be committed (or rolled back).
1254
- no recovery (MySQL did not detect a crash)
1255
in this case commit_list==0, tc_heuristic_recover == 0
1256
there should be no prepared transactions in this case.
1260
int len, found_foreign_xids, found_my_xids;
1266
static bool xarecover_handlerton(THD *unused __attribute__((unused)),
1270
handlerton *hton= plugin_data(plugin, handlerton *);
1271
struct xarecover_st *info= (struct xarecover_st *) arg;
1274
if (hton->state == SHOW_OPTION_YES && hton->recover)
1276
while ((got= hton->recover(hton, info->list, info->len)) > 0 )
1278
sql_print_information("Found %d prepared transaction(s) in %s",
1279
got, ha_resolve_storage_engine_name(hton));
1280
for (int i=0; i < got; i ++)
1282
my_xid x=info->list[i].get_my_xid();
1283
if (!x) // not "mine" - that is generated by external TM
1285
xid_cache_insert(info->list+i, XA_PREPARED);
1286
info->found_foreign_xids++;
1291
info->found_my_xids++;
1295
if (info->commit_list ?
1296
hash_search(info->commit_list, (uchar *)&x, sizeof(x)) != 0 :
1297
tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT)
1299
hton->commit_by_xid(hton, info->list+i);
1303
hton->rollback_by_xid(hton, info->list+i);
1306
if (got < info->len)
1313
int ha_recover(HASH *commit_list)
1315
struct xarecover_st info;
1316
info.found_foreign_xids= info.found_my_xids= 0;
1317
info.commit_list= commit_list;
1318
info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
1321
/* commit_list and tc_heuristic_recover cannot be set both */
1322
assert(info.commit_list==0 || tc_heuristic_recover==0);
1323
/* if either is set, total_ha_2pc must be set too */
1324
assert(info.dry_run || total_ha_2pc>(uint32_t)opt_bin_log);
1326
if (total_ha_2pc <= (uint32_t)opt_bin_log)
1329
if (info.commit_list)
1330
sql_print_information("Starting crash recovery...");
1333
#ifndef WILL_BE_DELETED_LATER
1336
for now, only InnoDB supports 2pc. It means we can always safely
1337
rollback all pending transactions, without risking inconsistent data
1340
assert(total_ha_2pc == (uint32_t) opt_bin_log+1); // only InnoDB and binlog
1341
tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
1346
for (info.len= MAX_XID_LIST_SIZE ;
1347
info.list==0 && info.len > MIN_XID_LIST_SIZE; info.len/=2)
1349
info.list=(XID *)my_malloc(info.len*sizeof(XID), MYF(0));
1353
sql_print_error(ER(ER_OUTOFMEMORY), info.len*sizeof(XID));
1357
plugin_foreach(NULL, xarecover_handlerton,
1358
MYSQL_STORAGE_ENGINE_PLUGIN, &info);
1360
my_free((uchar*)info.list, MYF(0));
1361
if (info.found_foreign_xids)
1362
sql_print_warning("Found %d prepared XA transactions",
1363
info.found_foreign_xids);
1364
if (info.dry_run && info.found_my_xids)
1366
sql_print_error("Found %d prepared transactions! It means that mysqld was "
1367
"not shut down properly last time and critical recovery "
1368
"information (last binlog or %s file) was manually deleted "
1369
"after a crash. You have to start mysqld with "
1370
"--tc-heuristic-recover switch to commit or rollback "
1371
"pending transactions.",
1372
info.found_my_xids, opt_tc_log_file);
1375
if (info.commit_list)
1376
sql_print_information("Crash recovery finished.");
1381
return the list of XID's to a client, the same way SHOW commands do.
1384
I didn't find in XA specs that an RM cannot return the same XID twice,
1385
so mysql_xa_recover does not filter XID's to ensure uniqueness.
1386
It can be easily fixed later, if necessary.
1388
bool mysql_xa_recover(THD *thd)
1390
List<Item> field_list;
1391
Protocol *protocol= thd->protocol;
1395
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1396
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1397
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1398
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1400
if (protocol->send_fields(&field_list,
1401
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1404
pthread_mutex_lock(&LOCK_xid_cache);
1405
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1407
if (xs->xa_state==XA_PREPARED)
1409
protocol->prepare_for_resend();
1410
protocol->store_int64_t((int64_t)xs->xid.formatID, false);
1411
protocol->store_int64_t((int64_t)xs->xid.gtrid_length, false);
1412
protocol->store_int64_t((int64_t)xs->xid.bqual_length, false);
1413
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1415
if (protocol->write())
1417
pthread_mutex_unlock(&LOCK_xid_cache);
1423
pthread_mutex_unlock(&LOCK_xid_cache);
1430
This function should be called when MySQL sends rows of a SELECT result set
1431
or the EOF mark to the client. It releases a possible adaptive hash index
1432
S-latch held by thd in InnoDB and also releases a possible InnoDB query
1433
FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a thd to
1434
keep them over several calls of the InnoDB handler interface when a join
1435
is executed. But when we let control pass to the client they have
1436
to be released because if the application program uses mysql_use_result(),
1437
it may deadlock on the S-latch if the application on another connection
1438
performs another SQL query. In MySQL-4.1 this is even more important because
1439
there a connection can have several SELECT queries open at the same time.
1441
@param thd the thread handle of the current connection
1446
static bool release_temporary_latches(THD *thd, plugin_ref plugin,
1447
void *unused __attribute__((unused)))
1449
handlerton *hton= plugin_data(plugin, handlerton *);
1451
if (hton->state == SHOW_OPTION_YES && hton->release_temporary_latches)
1452
hton->release_temporary_latches(hton, thd);
1458
int ha_release_temporary_latches(THD *thd)
1460
plugin_foreach(thd, release_temporary_latches, MYSQL_STORAGE_ENGINE_PLUGIN,
1466
int ha_rollback_to_savepoint(THD *thd, SAVEPOINT *sv)
1469
THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
1470
&thd->transaction.all);
1471
Ha_trx_info *ha_info, *ha_info_next;
1475
rolling back to savepoint in all storage engines that were part of the
1476
transaction when the savepoint was set
1478
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
1481
handlerton *ht= ha_info->ht();
1483
assert(ht->savepoint_set != 0);
1484
if ((err= ht->savepoint_rollback(ht, thd,
1485
(uchar *)(sv+1)+ht->savepoint_offset)))
1487
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1490
status_var_increment(thd->status_var.ha_savepoint_rollback_count);
1491
trans->no_2pc|= ht->prepare == 0;
1494
rolling back the transaction in all storage engines that were not part of
1495
the transaction when the savepoint was set
1497
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
1498
ha_info= ha_info_next)
1501
handlerton *ht= ha_info->ht();
1502
if ((err= ht->rollback(ht, thd, !thd->in_sub_stmt)))
1504
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1507
status_var_increment(thd->status_var.ha_rollback_count);
1508
ha_info_next= ha_info->next();
1509
ha_info->reset(); /* keep it conveniently zero-filled */
1511
trans->ha_list= sv->ha_list;
1517
according to the sql standard (ISO/IEC 9075-2:2003)
1518
section "4.33.4 SQL-statements and transaction states",
1519
SAVEPOINT is *not* transaction-initiating SQL-statement
1521
int ha_savepoint(THD *thd, SAVEPOINT *sv)
1524
THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
1525
&thd->transaction.all);
1526
Ha_trx_info *ha_info= trans->ha_list;
1527
for (; ha_info; ha_info= ha_info->next())
1530
handlerton *ht= ha_info->ht();
1532
if (! ht->savepoint_set)
1534
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1538
if ((err= ht->savepoint_set(ht, thd, (uchar *)(sv+1)+ht->savepoint_offset)))
1540
my_error(ER_GET_ERRNO, MYF(0), err);
1543
status_var_increment(thd->status_var.ha_savepoint_count);
1546
Remember the list of registered storage engines. All new
1547
engines are prepended to the beginning of the list.
1549
sv->ha_list= trans->ha_list;
1553
int ha_release_savepoint(THD *thd, SAVEPOINT *sv)
1556
Ha_trx_info *ha_info= sv->ha_list;
1558
for (; ha_info; ha_info= ha_info->next())
1561
handlerton *ht= ha_info->ht();
1562
/* Savepoint life time is enclosed into transaction life time. */
1564
if (!ht->savepoint_release)
1566
if ((err= ht->savepoint_release(ht, thd,
1567
(uchar *)(sv+1) + ht->savepoint_offset)))
1569
my_error(ER_GET_ERRNO, MYF(0), err);
1577
static bool snapshot_handlerton(THD *thd, plugin_ref plugin, void *arg)
1579
handlerton *hton= plugin_data(plugin, handlerton *);
1580
if (hton->state == SHOW_OPTION_YES &&
1581
hton->start_consistent_snapshot)
1583
hton->start_consistent_snapshot(hton, thd);
1584
*((bool *)arg)= false;
1589
int ha_start_consistent_snapshot(THD *thd)
1593
plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn);
1596
Same idea as when one wants to CREATE TABLE in one engine which does not
1600
push_warning(thd, DRIZZLE_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
1601
"This MySQL server does not support any "
1602
"consistent-read capable storage engine");
1607
static bool flush_handlerton(THD *thd __attribute__((unused)),
1609
void *arg __attribute__((unused)))
1611
handlerton *hton= plugin_data(plugin, handlerton *);
1612
if (hton->state == SHOW_OPTION_YES && hton->flush_logs &&
1613
hton->flush_logs(hton))
1619
bool ha_flush_logs(handlerton *db_type)
1621
if (db_type == NULL)
1623
if (plugin_foreach(NULL, flush_handlerton,
1624
MYSQL_STORAGE_ENGINE_PLUGIN, 0))
1629
if (db_type->state != SHOW_OPTION_YES ||
1630
(db_type->flush_logs && db_type->flush_logs(db_type)))
1636
static const char *check_lowercase_names(handler *file, const char *path,
1639
if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1642
/* Ensure that table handler get path in lower case */
1643
if (tmp_path != path)
1644
strmov(tmp_path, path);
1647
we only should turn into lowercase database/table part
1648
so start the process after homedirectory
1650
my_casedn_str(files_charset_info, tmp_path + mysql_data_home_len);
1656
An interceptor to hijack the text of the error message without
1657
setting an error in the thread. We need the text to present it
1658
in the form of a warning to the user.
1661
struct Ha_delete_table_error_handler: public Internal_error_handler
1664
virtual bool handle_error(uint sql_errno,
1665
const char *message,
1666
DRIZZLE_ERROR::enum_warning_level level,
1668
char buff[DRIZZLE_ERRMSG_SIZE];
1673
Ha_delete_table_error_handler::
1674
handle_error(uint sql_errno __attribute__((unused)),
1675
const char *message,
1676
DRIZZLE_ERROR::enum_warning_level level __attribute__((unused)),
1677
THD *thd __attribute__((unused)))
1679
/* Grab the error message */
1680
strmake(buff, message, sizeof(buff)-1);
1686
This should return ENOENT if the file doesn't exist.
1687
The .frm file will be deleted only if we return 0 or ENOENT
1689
int ha_delete_table(THD *thd, handlerton *table_type, const char *path,
1690
const char *db, const char *alias, bool generate_warning)
1693
char tmp_path[FN_REFLEN];
1696
TABLE_SHARE dummy_share;
1698
memset(&dummy_table, 0, sizeof(dummy_table));
1699
memset(&dummy_share, 0, sizeof(dummy_share));
1700
dummy_table.s= &dummy_share;
1702
/* DB_TYPE_UNKNOWN is used in ALTER TABLE when renaming only .frm files */
1703
if (table_type == NULL ||
1704
! (file=get_new_handler((TABLE_SHARE*)0, thd->mem_root, table_type)))
1707
path= check_lowercase_names(file, path, tmp_path);
1708
if ((error= file->ha_delete_table(path)) && generate_warning)
1711
Because file->print_error() use my_error() to generate the error message
1712
we use an internal error handler to intercept it and store the text
1713
in a temporary buffer. Later the message will be presented to user
1716
Ha_delete_table_error_handler ha_delete_table_error_handler;
1718
/* Fill up structures that print_error may need */
1719
dummy_share.path.str= (char*) path;
1720
dummy_share.path.length= strlen(path);
1721
dummy_share.db.str= (char*) db;
1722
dummy_share.db.length= strlen(db);
1723
dummy_share.table_name.str= (char*) alias;
1724
dummy_share.table_name.length= strlen(alias);
1725
dummy_table.alias= alias;
1727
file->change_table_ptr(&dummy_table, &dummy_share);
1729
thd->push_internal_handler(&ha_delete_table_error_handler);
1730
file->print_error(error, 0);
1732
thd->pop_internal_handler();
1735
XXX: should we convert *all* errors to warnings here?
1736
What if the error is fatal?
1738
push_warning(thd, DRIZZLE_ERROR::WARN_LEVEL_ERROR, error,
1739
ha_delete_table_error_handler.buff);
30
#include "drizzled/my_hash.h"
31
#include "drizzled/error.h"
32
#include "drizzled/gettext.h"
33
#include "drizzled/probes.h"
34
#include "drizzled/sql_parse.h"
35
#include "drizzled/optimizer/cost_vector.h"
36
#include "drizzled/session.h"
37
#include "drizzled/sql_base.h"
38
#include "drizzled/transaction_services.h"
39
#include "drizzled/lock.h"
40
#include "drizzled/item/int.h"
41
#include "drizzled/item/empty_string.h"
42
#include "drizzled/field/timestamp.h"
43
#include "drizzled/message/table.pb.h"
44
#include "drizzled/plugin/client.h"
45
#include "drizzled/internal/my_sys.h"
46
#include "drizzled/plugin/event_observer.h"
1745
53
/****************************************************************************
1746
** General handler functions
54
** General Cursor functions
1747
55
****************************************************************************/
1748
handler *handler::clone(MEM_ROOT *mem_root)
1750
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
56
Cursor::Cursor(plugin::StorageEngine &engine_arg,
57
TableShare &share_arg)
58
: table_share(&share_arg), table(0),
59
estimation_rows_to_insert(0), engine(&engine_arg),
61
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
62
ref_length(sizeof(internal::my_off_t)),
65
next_insert_id(0), insert_id_for_cur_row(0)
70
assert(locked == false);
71
/* TODO: assert(inited == NONE); */
76
* @note this is only used in
77
* optimizer::QuickRangeSelect::init_ror_merged_scan(bool reuse_handler) as
78
* of the writing of this comment. -Brian
80
Cursor *Cursor::clone(memory::Root *mem_root)
82
Cursor *new_handler= table->getMutableShare()->db_type()->getCursor(*table->getMutableShare());
1752
Allocate handler->ref here because otherwise ha_open will allocate it
1753
on this->table->mem_root and we will not be able to reclaim that memory
1754
when the clone handler object is destroyed.
85
Allocate Cursor->ref here because otherwise ha_open will allocate it
86
on this->table->mem_root and we will not be able to reclaim that memory
87
when the clone Cursor object is destroyed.
1756
if (!(new_handler->ref= (uchar*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
89
if (!(new_handler->ref= (unsigned char*) mem_root->alloc_root(ALIGN_SIZE(ref_length)*2)))
1758
if (new_handler && !new_handler->ha_open(table,
1759
table->s->normalized_path.str,
92
TableIdentifier identifier(table->getShare()->getSchemaName(),
93
table->getShare()->getTableName(),
94
table->getShare()->getType());
96
if (new_handler && !new_handler->ha_open(identifier,
1761
99
HA_OPEN_IGNORE_IF_LOCKED))
1762
100
return new_handler;
1768
void handler::ha_statistic_increment(ulong SSV::*offset) const
1770
status_var_increment(table->in_use->status_var.*offset);
1773
void **handler::ha_data(THD *thd) const
1775
return thd_ha_data(thd, ht);
1778
THD *handler::ha_thd(void) const
1780
assert(!table || !table->in_use || table->in_use == current_thd);
1781
return (table && table->in_use) ? table->in_use : current_thd;
106
given a buffer with a key value, and a map of keyparts
107
that are present in this value, returns the length of the value
109
uint32_t Cursor::calculate_key_len(uint32_t key_position, key_part_map keypart_map_arg)
111
/* works only with key prefixes */
112
assert(((keypart_map_arg + 1) & keypart_map_arg) == 0);
114
const KeyPartInfo *key_part_found= table->getShare()->getKeyInfo(key_position).key_part;
115
const KeyPartInfo *end_key_part_found= key_part_found + table->getShare()->getKeyInfo(key_position).key_parts;
118
while (key_part_found < end_key_part_found && keypart_map_arg)
120
length+= key_part_found->store_length;
121
keypart_map_arg >>= 1;
127
int Cursor::startIndexScan(uint32_t idx, bool sorted)
130
assert(inited == NONE);
131
if (!(result= doStartIndexScan(idx, sorted)))
137
int Cursor::endIndexScan()
139
assert(inited==INDEX);
142
return(doEndIndexScan());
145
int Cursor::startTableScan(bool scan)
148
assert(inited==NONE || (inited==RND && scan));
149
inited= (result= doStartTableScan(scan)) ? NONE: RND;
154
int Cursor::endTableScan()
158
return(doEndTableScan());
161
int Cursor::ha_index_or_rnd_end()
163
return inited == INDEX ? endIndexScan() : inited == RND ? endTableScan() : 0;
166
void Cursor::ha_start_bulk_insert(ha_rows rows)
168
estimation_rows_to_insert= rows;
169
start_bulk_insert(rows);
172
int Cursor::ha_end_bulk_insert()
174
estimation_rows_to_insert= 0;
175
return end_bulk_insert();
178
void Cursor::change_table_ptr(Table *table_arg, TableShare *share)
184
const key_map *Cursor::keys_to_use_for_scanning()
186
return &key_map_empty;
189
bool Cursor::has_transactions()
191
return (table->getShare()->db_type()->check_flag(HTON_BIT_DOES_TRANSACTIONS));
194
void Cursor::ha_statistic_increment(uint64_t system_status_var::*offset) const
196
(table->in_use->status_var.*offset)++;
199
void **Cursor::ha_data(Session *session) const
201
return session->getEngineData(engine);
204
bool Cursor::is_fatal_error(int error, uint32_t flags)
207
((flags & HA_CHECK_DUP_KEY) &&
208
(error == HA_ERR_FOUND_DUPP_KEY ||
209
error == HA_ERR_FOUND_DUPP_UNIQUE)))
215
/**
  Return the record count stored in this cursor's stats member.

  @return the current value of stats.records
*/
ha_rows Cursor::records()
{
  const ha_rows row_count= stats.records;
  return row_count;
}
216
/**
  Return the sum of the index file length and the data file length
  held in this cursor's stats member.

  @return stats.index_file_length + stats.data_file_length
*/
uint64_t Cursor::tableSize()
{
  const uint64_t total_length= stats.index_file_length + stats.data_file_length;
  return total_length;
}
217
/**
  Return the table's record length plus the value reported by
  table->sizeFields().

  @return table->getRecordLength() + table->sizeFields()
*/
uint64_t Cursor::rowSize()
{
  const uint64_t row_bytes= table->getRecordLength() + table->sizeFields();
  return row_bytes;
}
219
int Cursor::doOpen(const TableIdentifier &identifier, int mode, uint32_t test_if_locked)
221
return open(identifier.getPath().c_str(), mode, test_if_locked);
1785
Open database-handler.
225
Open database-Cursor.
1787
227
Try O_RDONLY if cannot open as O_RDWR
1788
228
Don't wait for locks if not HA_OPEN_WAIT_IF_LOCKED is set
1790
int handler::ha_open(TABLE *table_arg, const char *name, int mode,
230
int Cursor::ha_open(const TableIdentifier &identifier,
1795
237
table= table_arg;
1796
assert(table->s == table_share);
1797
assert(alloc_root_inited(&table->mem_root));
238
assert(table->getShare() == table_share);
1799
if ((error=open(name,mode,test_if_locked)))
240
if ((error= doOpen(identifier, mode, test_if_locked)))
1801
242
if ((error == EACCES || error == EROFS) && mode == O_RDWR &&
1802
(table->db_stat & HA_TRY_READ_ONLY))
243
(table->db_stat & HA_TRY_READ_ONLY))
1804
245
table->db_stat|=HA_READ_ONLY;
1805
error=open(name,O_RDONLY,test_if_locked);
246
error= doOpen(identifier, O_RDONLY,test_if_locked);
1810
my_errno= error; /* Safeguard */
251
errno= error; /* Safeguard */
1814
if (table->s->db_options_in_use & HA_OPTION_READ_ONLY_DATA)
255
if (table->getShare()->db_options_in_use & HA_OPTION_READ_ONLY_DATA)
1815
256
table->db_stat|=HA_READ_ONLY;
1816
257
(void) extra(HA_EXTRA_NO_READCHECK); // Not needed in SQL
1818
/* ref is already allocated for us if we're called from handler::clone() */
1819
if (!ref && !(ref= (uchar*) alloc_root(&table->mem_root,
1820
ALIGN_SIZE(ref_length)*2)))
259
/* ref is already allocated for us if we're called from Cursor::clone() */
260
if (!ref && !(ref= (unsigned char*) table->alloc_root(ALIGN_SIZE(ref_length)*2)))
1823
263
error=HA_ERR_OUT_OF_MEM;
1826
266
dup_ref=ref+ALIGN_SIZE(ref_length);
1827
cached_table_flags= table_flags();
1833
one has to use this method when to find
1834
random position by record as the plain
1835
position() call doesn't work for some
1836
handlers for random position
1839
int handler::rnd_pos_by_record(uchar *record)
1844
if (inited && (error= ha_index_end()))
1846
if ((error= ha_rnd_init(false)))
1849
return(rnd_pos(record, ref));
3677
1152
while ((result == HA_ERR_END_OF_FILE) && !range_res);
3679
1154
*range_info= mrr_cur_range.ptr;
3684
/* **************************************************************************
3685
* DS-MRR implementation
3686
***************************************************************************/
3689
DS-MRR: Initialize and start MRR scan
3691
Initialize and start the MRR scan. Depending on the mode parameter, this
3692
may use default or DS-MRR implementation.
3694
@param h Table handler to be used
3695
@param key Index to be used
3696
@param seq_funcs Interval sequence enumeration functions
3697
@param seq_init_param Interval sequence enumeration parameter
3698
@param n_ranges Number of ranges in the sequence.
3699
@param mode HA_MRR_* modes to use
3700
@param buf INOUT Buffer to use
3702
@retval 0 Ok, Scan started.
3706
int DsMrr_impl::dsmrr_init(handler *h, KEY *key,
3707
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3708
uint n_ranges, uint mode, HANDLER_BUFFER *buf)
3712
Item *pushed_cond= NULL;
3714
keyno= h->active_index;
3716
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3718
use_default_impl= true;
3719
return(h->handler::multi_range_read_init(seq_funcs, seq_init_param,
3720
n_ranges, mode, buf));
3722
rowids_buf= buf->buffer;
3723
//psergey-todo: don't add key_length as it is not needed anymore
3724
rowids_buf += key->key_length + h->ref_length;
3726
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3727
rowids_buf_end= buf->buffer_end;
3729
elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3730
rowids_buf_last= rowids_buf +
3731
((rowids_buf_end - rowids_buf)/ elem_size)*
3733
rowids_buf_end= rowids_buf_last;
3735
/* Create a separate handler object to do rndpos() calls. */
3736
THD *thd= current_thd;
3737
if (!(new_h2= h->clone(thd->mem_root)) ||
3738
new_h2->ha_external_lock(thd, F_RDLCK))
3744
if (keyno == h->pushed_idx_cond_keyno)
3745
pushed_cond= h->pushed_idx_cond;
3746
if (h->ha_index_end())
3753
table->prepare_for_position();
3754
new_h2->extra(HA_EXTRA_KEYREAD);
3756
if (h2->ha_index_init(keyno, false) ||
3757
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3760
use_default_impl= false;
3763
h2->idx_cond_push(keyno, pushed_cond);
3764
if (dsmrr_fill_buffer(new_h2))
3768
If the above call has scanned through all intervals in *seq, then
3769
adjust *buf to indicate that the remaining buffer space will not be used.
3772
buf->end_of_used_area= rowids_buf_last;
3774
if (h->ha_rnd_init(false))
3779
h2->ha_index_or_rnd_end();
3780
h2->ha_external_lock(thd, F_UNLCK);
3787
void DsMrr_impl::dsmrr_close()
3791
h2->ha_external_lock(current_thd, F_UNLCK);
3796
use_default_impl= true;
3801
static int rowid_cmp(void *h, uchar *a, uchar *b)
3803
return ((handler*)h)->cmp_ref(a, b);
3808
DS-MRR: Fill the buffer with rowids and sort it by rowid
3810
{This is an internal function of DiskSweep MRR implementation}
3811
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
3812
buffer. When the buffer is full or scan is completed, sort the buffer by
3815
The function assumes that rowids buffer is empty when it is invoked.
3817
@param h Table handler
3819
@retval 0 OK, the next portion of rowids is in the buffer,
3824
int DsMrr_impl::dsmrr_fill_buffer(handler *unused __attribute__((unused)))
3829
rowids_buf_cur= rowids_buf;
3830
while ((rowids_buf_cur < rowids_buf_end) &&
3831
!(res= h2->handler::multi_range_read_next(&range_info)))
3833
/* Put rowid, or {rowid, range_id} pair into the buffer */
3834
h2->position(table->record[0]);
3835
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
3836
rowids_buf_cur += h->ref_length;
3840
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
3841
rowids_buf_cur += sizeof(void*);
3845
if (res && res != HA_ERR_END_OF_FILE)
3847
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
3849
/* Sort the buffer contents by rowid */
3850
uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3851
uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
3853
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
3855
rowids_buf_last= rowids_buf_cur;
3856
rowids_buf_cur= rowids_buf;
3862
DS-MRR implementation: multi_range_read_next() function
3865
int DsMrr_impl::dsmrr_next(handler *h, char **range_info)
3869
if (use_default_impl)
3870
return h->handler::multi_range_read_next(range_info);
3872
if (rowids_buf_cur == rowids_buf_last)
3876
res= HA_ERR_END_OF_FILE;
3879
res= dsmrr_fill_buffer(h);
3884
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
3885
if (rowids_buf_cur == rowids_buf_last)
3887
res= HA_ERR_END_OF_FILE;
3891
res= h->rnd_pos(table->record[0], rowids_buf_cur);
3892
rowids_buf_cur += h->ref_length;
3895
memcpy(range_info, rowids_buf_cur, sizeof(void*));
3896
rowids_buf_cur += sizeof(void*);
3907
DS-MRR implementation: multi_range_read_info() function
3909
int DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, uint *bufsz,
3910
uint *flags, COST_VECT *cost)
3913
uint def_flags= *flags;
3914
uint def_bufsz= *bufsz;
3916
/* Get cost/flags/mem_usage of default MRR implementation */
3917
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
3921
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3922
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
3924
/* Default implementation is chosen */
3933
DS-MRR Implementation: multi_range_read_info_const() function
3936
ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
3937
void *seq_init_param, uint n_ranges,
3938
uint *bufsz, uint *flags, COST_VECT *cost)
3941
uint def_flags= *flags;
3942
uint def_bufsz= *bufsz;
3943
/* Get cost/flags/mem_usage of default MRR implementation */
3944
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
3945
n_ranges, &def_bufsz,
3947
if (rows == HA_POS_ERROR)
3949
/* Default implementation can't perform MRR scan => we can't either */
3954
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
3955
use the default MRR implementation (we need it for UPDATE/DELETE).
3956
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
3958
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3959
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
3966
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
3973
Check if key has partially-covered columns
3975
We can't use DS-MRR to perform range scans when the ranges are over
3976
partially-covered keys, because we'll not have full key part values
3977
(we'll have their prefixes from the index) and will not be able to check
3978
if we've reached the end of the range.
3980
@param keyno Key to check
3983
Allow use of DS-MRR in cases where the index has partially-covered
3984
components but they are not used for scanning.
3990
bool DsMrr_impl::key_uses_partial_cols(uint keyno)
3992
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
3993
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
3994
for (; kp != kp_end; kp++)
3996
if (!kp->field->part_of_key.is_set(keyno))
4004
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
4006
Make the choice between using Default MRR implementation and DS-MRR.
4007
This function contains common functionality factored out of dsmrr_info()
4008
and dsmrr_info_const(). The function assumes that the default MRR
4009
implementation's applicability requirements are satisfied.
4011
@param keyno Index number
4012
@param rows E(full rows to be retrieved)
4013
@param flags IN MRR flags provided by the MRR user
4014
OUT If DS-MRR is choosen, flags of DS-MRR implementation
4015
else the value is not modified
4016
@param bufsz IN If DS-MRR is choosen, buffer use of DS-MRR implementation
4017
else the value is not modified
4018
@param cost IN Cost of default MRR implementation
4019
OUT If DS-MRR is choosen, cost of DS-MRR scan
4020
else the value is not modified
4022
@retval true Default MRR implementation should be used
4023
@retval false DS-MRR implementation should be used
4026
bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
4027
uint *bufsz, COST_VECT *cost)
4029
COST_VECT dsmrr_cost;
4031
THD *thd= current_thd;
4032
if ((thd->variables.optimizer_use_mrr == 2) ||
4033
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
4034
(keyno == table->s->primary_key &&
4035
h->primary_key_is_clustered()) ||
4036
key_uses_partial_cols(keyno))
4038
/* Use the default implementation */
4039
*flags |= HA_MRR_USE_DEFAULT_IMPL;
4043
uint add_len= table->key_info[keyno].key_length + h->ref_length;
4045
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
4051
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
4052
DS-MRR and Default implementations cost. This allows one to force use of
4053
DS-MRR whenever it is applicable without affecting other cost-based
4056
if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) &&
4057
dsmrr_cost.total_cost() > cost->total_cost())
4060
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
4062
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
4063
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
4069
/* Use the default MRR implementation */
4076
static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost);
4080
Get cost of DS-MRR scan
4082
@param keynr Index to be used
4083
@param rows E(Number of rows to be scanned)
4084
@param flags Scan parameters (HA_MRR_* flags)
4085
@param buffer_size INOUT Buffer size
4086
@param cost OUT The cost
4089
@retval true Error, DS-MRR cannot be used (the buffer is too small
4093
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
4094
uint *buffer_size, COST_VECT *cost)
4096
uint32_t max_buff_entries, elem_size;
4097
ha_rows rows_in_full_step, rows_in_last_step;
4099
double index_read_cost;
4101
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
4102
max_buff_entries = *buffer_size / elem_size;
4104
if (!max_buff_entries)
4105
return true; /* Buffer has not enough space for even 1 rowid */
4107
/* Number of iterations we'll make with full buffer */
4108
n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
4111
Get numbers of rows we'll be processing in
4112
- non-last sweep, with full buffer
4113
- last iteration, with non-full buffer
4115
rows_in_full_step= max_buff_entries;
4116
rows_in_last_step= rows % max_buff_entries;
4118
/* Adjust buffer size if we expect to use only part of the buffer */
4121
get_sort_and_sweep_cost(table, rows, cost);
4122
cost->multiply(n_full_steps);
4127
*buffer_size= max(*buffer_size,
4128
(size_t)(1.2*rows_in_last_step) * elem_size +
4129
h->ref_length + table->key_info[keynr].key_length);
4132
COST_VECT last_step_cost;
4133
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
4134
cost->add(&last_step_cost);
4136
if (n_full_steps != 0)
4137
cost->mem_cost= *buffer_size;
4139
cost->mem_cost= (double)rows_in_last_step * elem_size;
4141
/* Total cost of all index accesses */
4142
index_read_cost= h->index_only_read_time(keynr, (double)rows);
4143
cost->add_io(index_read_cost, 1 /* Random seeks */);
4149
Get cost of one sort-and-sweep step
4152
get_sort_and_sweep_cost()
4153
table Table being accessed
4154
nrows Number of rows to be sorted and retrieved
4158
Get cost of these operations:
4159
- sort an array of #nrows ROWIDs using qsort
4160
- read #nrows records from table in a sweep.
4164
void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
4168
get_sweep_read_cost(table, nrows, false, cost);
4169
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
4170
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
4173
cost->cpu_cost += cmp_op * log2(cmp_op);
4181
Get cost of reading nrows table records in a "disk sweep"
4183
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made
4184
for an ordered sequence of rowids.
4186
We assume hard disk IO. The read is performed as follows:
4188
1. The disk head is moved to the needed cylinder
4189
2. The controller waits for the plate to rotate
4190
3. The data is transferred
4192
Time to do #3 is insignificant compared to #2+#1.
4194
Time to move the disk head is proportional to head travel distance.
4196
Time to wait for the plate to rotate depends on whether the disk head
was moved or not.
4199
If disk head wasn't moved, the wait time is proportional to distance
4200
between the previous block and the block we're reading.
4202
If the head was moved, we don't know how much we'll need to wait for the
4203
plate to rotate. We assume the wait time to be a variate with a mean of
4204
0.5 of full rotation time.
4206
Our cost units are "random disk seeks". The cost of random disk seek is
4207
actually not a constant, it depends on the range of cylinders we're going
4208
to access. We make it constant by introducing a fuzzy concept of "typical
4209
datafile length" (it's fuzzy as it's hard to tell whether it should
4210
include index file, temp.tables etc). Then random seek cost is:
4212
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
4214
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
4216
@param table Table to be accessed
4217
@param nrows Number of rows to retrieve
4218
@param interrupted true <=> Assume that the disk sweep will be
4219
interrupted by other disk IO. false - otherwise.
4220
@param cost OUT The cost.
4223
void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
4227
if (table->file->primary_key_is_clustered())
4229
cost->io_count= table->file->read_time(table->s->primary_key,
4230
(uint) nrows, nrows);
4235
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
4237
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
4238
if (busy_blocks < 1.0)
4241
cost->io_count= busy_blocks;
4245
/* Assume reading is done in one 'sweep' */
4246
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
4247
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);