1
/* -*- mode: c++; c-basic-offset: 2; indent-tabs-mode: nil; -*-
2
* vim:expandtab:shiftwidth=2:tabstop=2:smarttab:
4
* Copyright (C) 2008 Sun Microsystems, Inc.
6
* This program is free software; you can redistribute it and/or modify
7
* it under the terms of the GNU General Public License as published by
8
* the Free Software Foundation; version 2 of the License.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
1
/* Copyright (C) 2000-2006 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
23
19
Handler-calling-functions
30
#include <drizzled/error.h>
31
#include <drizzled/field/epoch.h>
32
#include <drizzled/gettext.h>
33
#include <drizzled/internal/my_sys.h>
34
#include <drizzled/item/empty_string.h>
35
#include <drizzled/item/int.h>
36
#include <drizzled/lock.h>
37
#include <drizzled/message/table.h>
38
#include <drizzled/my_hash.h>
39
#include <drizzled/optimizer/cost_vector.h>
40
#include <drizzled/plugin/client.h>
41
#include <drizzled/plugin/event_observer.h>
42
#include <drizzled/plugin/storage_engine.h>
43
#include <drizzled/probes.h>
44
#include <drizzled/session.h>
45
#include <drizzled/sql_base.h>
46
#include <drizzled/sql_parse.h>
47
#include <drizzled/transaction_services.h>
22
#ifdef USE_PRAGMA_IMPLEMENTATION
23
#pragma implementation // gcc: Class implementation
26
#include "mysql_priv.h"
27
#include "rpl_filter.h"
28
#include <myisampack.h>
32
While we have legacy_db_type, we have this array to
33
check for dups and to find handlerton from legacy_db_type.
34
Remove when legacy_db_type is finally gone
36
st_plugin_int *hton2plugin[MAX_HA];
38
static handlerton *installed_htons[128];
40
#define BITMAP_STACKBUF_SIZE (128/8)
42
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NullS,0}, {NullS,0} };
44
/* number of entries in handlertons[] */
46
/* number of storage engines (from handlertons[]) that support 2pc */
47
uint32_t total_ha_2pc= 0;
48
/* size of savepoint storage area (see ha_init) */
49
uint32_t savepoint_alloc_size= 0;
51
static const LEX_STRING sys_table_aliases[]=
53
{ C_STRING_WITH_LEN("INNOBASE") }, { C_STRING_WITH_LEN("INNODB") },
54
{ C_STRING_WITH_LEN("HEAP") }, { C_STRING_WITH_LEN("MEMORY") },
58
const char *ha_row_type[] = {
59
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
62
const char *tx_isolation_names[] =
63
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
65
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
66
tx_isolation_names, NULL};
68
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
69
uint known_extensions_id= 0;
73
static plugin_ref ha_default_plugin(THD *thd)
75
if (thd->variables.table_plugin)
76
return thd->variables.table_plugin;
77
return my_plugin_lock(thd, &global_system_variables.table_plugin);
82
Return the default storage engine handlerton for thread
84
@param ha_default_handlerton(thd)
85
@param thd current thread
90
handlerton *ha_default_handlerton(THD *thd)
92
plugin_ref plugin= ha_default_plugin(thd);
94
handlerton *hton= plugin_data(plugin, handlerton*);
101
Return the storage engine handlerton for the supplied name
103
@param thd current thread
104
@param name name of storage engine
107
pointer to storage engine plugin handle
109
plugin_ref ha_resolve_by_name(THD *thd, const LEX_STRING *name)
111
const LEX_STRING *table_alias;
115
/* my_strnncoll is a macro and gcc doesn't do early expansion of macro */
116
if (thd && !my_charset_latin1.coll->strnncoll(&my_charset_latin1,
117
(const uchar *)name->str, name->length,
118
(const uchar *)STRING_WITH_LEN("DEFAULT"), 0))
119
return ha_default_plugin(thd);
121
if ((plugin= my_plugin_lock_by_name(thd, name, MYSQL_STORAGE_ENGINE_PLUGIN)))
123
handlerton *hton= plugin_data(plugin, handlerton *);
124
if (!(hton->flags & HTON_NOT_USER_SELECTABLE))
128
unlocking plugin immediately after locking is relatively low cost.
130
plugin_unlock(thd, plugin);
134
We check for the historical aliases.
136
for (table_alias= sys_table_aliases; table_alias->str; table_alias+= 2)
138
if (!my_strnncoll(&my_charset_latin1,
139
(const uchar *)name->str, name->length,
140
(const uchar *)table_alias->str, table_alias->length))
142
name= table_alias + 1;
151
plugin_ref ha_lock_engine(THD *thd, handlerton *hton)
155
st_plugin_int **plugin= hton2plugin + hton->slot;
157
return my_plugin_lock(thd, &plugin);
163
handlerton *ha_resolve_by_legacy_type(THD *thd, enum legacy_db_type db_type)
167
case DB_TYPE_DEFAULT:
168
return ha_default_handlerton(thd);
170
if (db_type > DB_TYPE_UNKNOWN && db_type < DB_TYPE_DEFAULT &&
171
(plugin= ha_lock_engine(thd, installed_htons[db_type])))
172
return plugin_data(plugin, handlerton*);
174
case DB_TYPE_UNKNOWN:
181
Use another database handler if the requested database handler is not compiled in.
183
handlerton *ha_checktype(THD *thd, enum legacy_db_type database_type,
184
bool no_substitute, bool report_error)
186
handlerton *hton= ha_resolve_by_legacy_type(thd, database_type);
187
if (ha_storage_engine_is_enabled(hton))
194
const char *engine_name= ha_resolve_storage_engine_name(hton);
195
my_error(ER_FEATURE_DISABLED,MYF(0),engine_name,engine_name);
200
switch (database_type) {
202
return ha_resolve_by_legacy_type(thd, DB_TYPE_HASH);
207
return ha_default_handlerton(thd);
211
handler *get_new_handler(TABLE_SHARE *share, MEM_ROOT *alloc,
216
if (db_type && db_type->state == SHOW_OPTION_YES && db_type->create)
218
if ((file= db_type->create(db_type, share, alloc)))
223
Try the default table type
224
Here the call to current_thd() is ok as we call this function a lot of
225
times but we enter this branch very seldom.
227
return(get_new_handler(share, alloc, ha_default_handlerton(current_thd)));
232
Register handler error messages for use with my_error().
240
int ha_init_errors(void)
242
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
243
const char **errmsgs;
245
/* Allocate a pointer array for the error message strings. */
246
/* Zerofill it to avoid uninitialized gaps. */
247
if (! (errmsgs= (const char**) my_malloc(HA_ERR_ERRORS * sizeof(char*),
248
MYF(MY_WME | MY_ZEROFILL))))
251
/* Set the dedicated error messages. */
252
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
253
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
254
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
255
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
256
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
257
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
258
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
259
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
260
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
261
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
262
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
263
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
264
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
265
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
266
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
267
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
268
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
269
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
270
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
271
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
272
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
273
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
274
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
275
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
276
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
277
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
278
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
279
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
280
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
281
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
282
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
283
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
284
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
285
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
286
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
287
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
288
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
289
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
290
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
291
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
292
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
294
/* Register the error messages for use with my_error(). */
295
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
300
Unregister handler error messages.
307
static int ha_finish_errors(void)
309
const char **errmsgs;
311
/* Unregister the error messages and get back the pointer array so it can be freed. */
312
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
314
my_free((uchar*) errmsgs, MYF(0));
319
int ha_finalize_handlerton(st_plugin_int *plugin)
321
handlerton *hton= (handlerton *)plugin->data;
326
case SHOW_OPTION_DISABLED:
328
case SHOW_OPTION_YES:
329
if (installed_htons[hton->db_type] == hton)
330
installed_htons[hton->db_type]= NULL;
335
hton->panic(hton, HA_PANIC_CLOSE);
337
my_free((uchar*)hton, MYF(0));
343
int ha_initialize_handlerton(st_plugin_int *plugin)
347
hton= (handlerton *)my_malloc(sizeof(handlerton),
348
MYF(MY_WME | MY_ZEROFILL));
350
FIXME: the MY_ZEROFILL flag above doesn't zero all the bytes.
352
This was detected after adding get_backup_engine member to handlerton
353
structure. Apparently get_backup_engine was not NULL even though it was
356
bzero(hton, sizeof(hton));
357
/* Historical Requirement */
358
plugin->data= hton; // shortcut for the future
359
if (plugin->plugin->init)
361
if (plugin->plugin->init(hton))
363
sql_print_error("Plugin '%s' init function returned error.",
370
the switch below and hton->state should be removed when
371
command-line options for plugins will be implemented
373
switch (hton->state) {
376
case SHOW_OPTION_YES:
379
/* now check the db_type for conflict */
380
if (hton->db_type <= DB_TYPE_UNKNOWN ||
381
hton->db_type >= DB_TYPE_DEFAULT ||
382
installed_htons[hton->db_type])
384
int idx= (int) DB_TYPE_FIRST_DYNAMIC;
386
while (idx < (int) DB_TYPE_DEFAULT && installed_htons[idx])
389
if (idx == (int) DB_TYPE_DEFAULT)
391
sql_print_warning("Too many storage engines!");
394
if (hton->db_type != DB_TYPE_UNKNOWN)
395
sql_print_warning("Storage engine '%s' has conflicting typecode. "
396
"Assigning value %d.", plugin->plugin->name, idx);
397
hton->db_type= (enum legacy_db_type) idx;
399
installed_htons[hton->db_type]= hton;
400
tmp= hton->savepoint_offset;
401
hton->savepoint_offset= savepoint_alloc_size;
402
savepoint_alloc_size+= tmp;
403
hton->slot= total_ha++;
404
hton2plugin[hton->slot]=plugin;
411
hton->state= SHOW_OPTION_DISABLED;
416
This is entirely for legacy. We will create a new "disk based" hton and a
417
"memory" hton which will be configurable longterm. We should be able to
418
remove partition and myisammrg.
420
switch (hton->db_type) {
440
assert(total_ha < MAX_HA);
442
Check if there is a transaction-capable storage engine besides the
443
binary log (which is considered a transaction-capable storage engine in
446
opt_using_transactions= total_ha>(uint32_t)opt_bin_log;
447
savepoint_alloc_size+= sizeof(SAVEPOINT);
456
This should eventually be based on the graceful shutdown flag.
457
So if flag is equal to HA_PANIC_CLOSE, the deallocate
460
if (ha_finish_errors())
466
static bool dropdb_handlerton(THD *unused1 __attribute__((__unused__)),
470
handlerton *hton= plugin_data(plugin, handlerton *);
471
if (hton->state == SHOW_OPTION_YES && hton->drop_database)
472
hton->drop_database(hton, (char *)path);
477
void ha_drop_database(char* path)
479
plugin_foreach(NULL, dropdb_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, path);
483
static bool closecon_handlerton(THD *thd, plugin_ref plugin,
484
void *unused __attribute__((__unused__)))
486
handlerton *hton= plugin_data(plugin, handlerton *);
488
there's no need to rollback here as all transactions must
489
be rolled back already
491
if (hton->state == SHOW_OPTION_YES && hton->close_connection &&
492
thd_get_ha_data(thd, hton))
493
hton->close_connection(hton, thd);
500
don't bother to rollback here, it's done already
502
void ha_close_connection(THD* thd)
504
plugin_foreach(thd, closecon_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, 0);
507
/* ========================================================================
508
======================= TRANSACTIONS ===================================*/
511
Transaction handling in the server
512
==================================
514
In each client connection, MySQL maintains two transactional
516
- a statement transaction,
517
- a standard, also called normal transaction.
521
"Statement transaction" is a non-standard term that comes
522
from the times when MySQL supported BerkeleyDB storage engine.
524
First of all, it should be said that in BerkeleyDB auto-commit
525
mode auto-commits operations that are atomic to the storage
526
engine itself, such as a write of a record, and are too
527
high-granular to be atomic from the application perspective
528
(MySQL). One SQL statement could involve many BerkeleyDB
529
auto-committed operations and thus BerkeleyDB auto-commit was of
532
Secondly, instead of SQL standard savepoints, BerkeleyDB
533
provided the concept of "nested transactions". In a nutshell,
534
transactions could be arbitrarily nested, but when the parent
535
transaction was committed or aborted, all its child (nested)
536
transactions were committed or aborted as well.
537
Commit of a nested transaction, in turn, made its changes
538
visible, but not durable: it destroyed the nested transaction,
539
all its changes would become available to the parent and
540
currently active nested transactions of this parent.
542
So the mechanism of nested transactions was employed to
543
provide "all or nothing" guarantee of SQL statements
544
required by the standard.
545
A nested transaction would be created at start of each SQL
546
statement, and destroyed (committed or aborted) at statement
547
end. Such nested transaction was internally referred to as
548
a "statement transaction" and gave birth to the term.
550
<Historical note ends>
552
Since then a statement transaction is started for each statement
553
that accesses transactional tables or uses the binary log. If
554
the statement succeeds, the statement transaction is committed.
555
If the statement fails, the transaction is rolled back. Commits
556
of statement transactions are not durable -- each such
557
transaction is nested in the normal transaction, and if the
558
normal transaction is rolled back, the effects of all enclosed
559
statement transactions are undone as well. Technically,
560
a statement transaction can be viewed as a savepoint which is
561
maintained automatically in order to make effects of one
564
The normal transaction is started by the user and is ended
565
usually upon a user request as well. The normal transaction
566
encloses transactions of all statements issued between
567
its beginning and its end.
568
In autocommit mode, the normal transaction is equivalent
569
to the statement transaction.
571
Since MySQL supports PSEA (pluggable storage engine
572
architecture), more than one transactional engine can be
573
active at a time. Hence transactions, from the server
574
point of view, are always distributed. In particular,
575
transactional state is maintained independently for each
576
engine. In order to commit a transaction the two phase
577
commit protocol is employed.
579
Not all statements are executed in context of a transaction.
580
Administrative and status information statements do not modify
581
engine data, and thus do not start a statement transaction and
582
also have no effect on the normal transaction. Examples of such
583
statements are SHOW STATUS and RESET SLAVE.
585
Similarly DDL statements are not transactional,
586
and therefore a transaction is [almost] never started for a DDL
587
statement. The difference between a DDL statement and a purely
588
administrative statement though is that a DDL statement always
589
commits the current transaction before proceeding, if there is
592
At last, SQL statements that work with non-transactional
593
engines also have no effect on the transaction state of the
594
connection. Even though they are written to the binary log,
595
and the binary log is, overall, transactional, the writes
596
are done in "write-through" mode, directly to the binlog
597
file, followed by an OS cache sync, in other words,
598
bypassing the binlog undo log (translog).
599
They do not commit the current normal transaction.
600
A failure of a statement that uses non-transactional tables
601
would cause a rollback of the statement transaction, but
602
in case only non-transactional tables are used,
603
no statement transaction is started.
608
The server stores its transaction-related data in
609
thd->transaction. This structure has two members of type
610
THD_TRANS. These members correspond to the statement and
611
normal transactions respectively:
613
- thd->transaction.stmt contains a list of engines
614
that are participating in the given statement
615
- thd->transaction.all contains a list of engines that
616
have participated in any of the statement transactions started
617
within the context of the normal transaction.
618
Each element of the list contains a pointer to the storage
619
engine, engine-specific transactional data, and engine-specific
622
In autocommit mode thd->transaction.all is empty.
623
Instead, data of thd->transaction.stmt is
624
used to commit/rollback the normal transaction.
626
The list of registered engines has a few important properties:
627
- no engine is registered in the list twice
628
- engines are present in the list in reverse temporal order --
629
new participants are always added to the beginning of the list.
631
Transaction life cycle
632
----------------------
634
When a new connection is established, thd->transaction
635
members are initialized to an empty state.
636
If a statement uses any tables, all affected engines
637
are registered in the statement engine list. In
638
non-autocommit mode, the same engines are registered in
639
the normal transaction list.
640
At the end of the statement, the server issues a commit
641
or a roll back for all engines in the statement list.
642
At this point transaction flags of an engine, if any, are
643
propagated from the statement list to the list of the normal
645
When commit/rollback is finished, the statement list is
646
cleared. It will be filled in again by the next statement,
647
and emptied again at the next statement's end.
649
The normal transaction is committed in a similar way
650
(by going over all engines in thd->transaction.all list)
651
but at different times:
652
- when a COMMIT SQL statement is issued by the user
653
- implicitly, by the server, at the beginning of a DDL statement
654
or SET AUTOCOMMIT={0|1} statement.
656
The normal transaction can be rolled back as well:
657
- if the user has requested so, by issuing ROLLBACK SQL
659
- if one of the storage engines requested a rollback
660
by setting thd->transaction_rollback_request. This may
661
happen in case, e.g., when the transaction in the engine was
662
chosen as a victim of the internal deadlock resolution algorithm
663
and rolled back internally. When such a situation happens, there
664
is little the server can do and the only option is to rollback
665
transactions in all other participating engines. In this case
666
the rollback is accompanied by an error sent to the user.
668
As follows from the use cases above, the normal transaction
669
is never committed when there is an outstanding statement
670
transaction. In most cases there is no conflict, since
671
commits of the normal transaction are issued by a stand-alone
672
administrative or DDL statement, thus no outstanding statement
673
transaction of the previous statement exists. Besides,
674
all statements that manipulate the normal transaction
675
are prohibited in stored functions and triggers, therefore
676
no conflicting situation can occur in a sub-statement either.
677
The remaining rare cases when the server explicitly has
678
to commit the statement transaction prior to committing the normal
679
one cover error-handling scenarios (see for example
682
When committing a statement or a normal transaction, the server
683
either uses the two-phase commit protocol, or issues a commit
684
in each engine independently. The two-phase commit protocol
686
- all participating engines support two-phase commit (provide
687
handlerton::prepare PSEA API call) and
688
- transactions in at least two engines modify data (i.e. are
691
Note that the two phase commit is used for
692
statement transactions, even though they are not durable anyway.
693
This is done to ensure logical consistency of data in a multiple-
695
For example, imagine that some day MySQL supports unique
696
constraint checks deferred till the end of statement. In such
697
case a commit in one of the engines may yield ER_DUP_KEY,
698
and MySQL should be able to gracefully abort statement
699
transactions of other participants.
701
After the normal transaction has been committed,
702
thd->transaction.all list is cleared.
704
When a connection is closed, the current normal transaction, if
707
Roles and responsibilities
708
--------------------------
710
The server has no way to know that an engine participates in
711
the statement and a transaction has been started
712
in it unless the engine says so. Thus, in order to be
713
a part of a transaction, the engine must "register" itself.
714
This is done by invoking trans_register_ha() server call.
715
Normally the engine registers itself whenever handler::external_lock()
716
is called. trans_register_ha() can be invoked many times: if
717
an engine is already registered, the call does nothing.
718
In case autocommit is not set, the engine must register itself
719
twice -- both in the statement list and in the normal transaction
721
In which list to register is a parameter of trans_register_ha().
723
Note, that although the registration interface in itself is
724
fairly clear, the current usage practice often leads to undesired
725
effects. E.g. since a call to trans_register_ha() in most engines
726
is embedded into implementation of handler::external_lock(), some
727
DDL statements start a transaction (at least from the server
728
point of view) even though they are not expected to. E.g.
729
CREATE TABLE does not start a transaction, since
730
handler::external_lock() is never called during CREATE TABLE. But
731
CREATE TABLE ... SELECT does, since handler::external_lock() is
732
called for the table that is being selected from. This has no
733
practical effects currently, but must be kept in mind
736
Once an engine is registered, the server will do the rest
739
During statement execution, whenever any of data-modifying
740
PSEA API methods is used, e.g. handler::write_row() or
741
handler::update_row(), the read-write flag is raised in the
742
statement transaction for the involved engine.
743
Currently all PSEA calls are "traced", and the data cannot be
744
changed in a way other than issuing a PSEA call. Important:
745
unless this invariant is preserved the server will not know that
746
a transaction in a given engine is read-write and will not
747
involve the two-phase commit protocol!
749
At the end of a statement, server call
750
ha_autocommit_or_rollback() is invoked. This call in turn
751
invokes handlerton::prepare() for every involved engine.
752
Prepare is followed by a call to handlerton::commit_one_phase()
753
If a one-phase commit will suffice, handlerton::prepare() is not
754
invoked and the server only calls handlerton::commit_one_phase().
755
At statement commit, the statement-related read-write engine
756
flag is propagated to the corresponding flag in the normal
757
transaction. When the commit is complete, the list of registered
760
Rollback is handled in a similar fashion.
762
Additional notes on DDL and the normal transaction.
763
---------------------------------------------------
765
DDLs and operations with non-transactional engines
766
do not "register" in thd->transaction lists, and thus do not
767
modify the transaction state. Besides, each DDL in
768
MySQL is prefixed with an implicit normal transaction commit
769
(a call to end_active_trans()), and thus leaves nothing
771
However, as it has been pointed out with CREATE TABLE .. SELECT,
772
some DDL statements can start a *new* transaction.
774
Behaviour of the server in this case is currently badly
776
DDL statements use a form of "semantic" logging
777
to maintain atomicity: if CREATE TABLE .. SELECT failed,
778
the newly created table is deleted.
779
In addition, some DDL statements issue interim transaction
780
commits: e.g. ALTER TABLE issues a commit after data is copied
781
from the original table to the internal temporary table. Other
782
statements, e.g. CREATE TABLE ... SELECT do not always commit
784
And finally there is a group of DDL statements such as
785
RENAME/DROP TABLE that doesn't start a new transaction
788
This diversity makes it hard to say what will happen if
789
by chance a stored function is invoked during a DDL --
790
whether any modifications it makes will be committed or not
791
is not clear. Fortunately, the SQL grammar of only a few DDLs allows
792
invocation of a stored function.
794
A consistent behaviour is perhaps to always commit the normal
795
transaction after all DDLs, just like the statement transaction
796
is always committed at the end of all statements.
800
Register a storage engine for a transaction.
802
Every storage engine MUST call this function when it starts
803
a transaction or a statement (that is it must be called both for the
804
"beginning of transaction" and "beginning of statement").
805
Only storage engines registered for the transaction/statement
806
will know when to commit/rollback it.
809
trans_register_ha is idempotent - storage engine may register many
810
times per transaction.
813
void trans_register_ha(THD *thd, bool all, handlerton *ht_arg)
816
Ha_trx_info *ha_info;
820
trans= &thd->transaction.all;
821
thd->server_status|= SERVER_STATUS_IN_TRANS;
824
trans= &thd->transaction.stmt;
826
ha_info= thd->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
828
if (ha_info->is_started())
829
return; /* already registered, return */
831
ha_info->register_ha(trans, ht_arg);
833
trans->no_2pc|=(ht_arg->prepare==0);
834
if (thd->transaction.xid_state.xid.is_null())
835
thd->transaction.xid_state.xid.set(thd->query_id);
844
1 error, transaction was rolled back
846
int ha_prepare(THD *thd)
849
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
850
Ha_trx_info *ha_info= trans->ha_list;
853
for (; ha_info; ha_info= ha_info->next())
856
handlerton *ht= ha_info->ht();
857
status_var_increment(thd->status_var.ha_prepare_count);
860
if ((err= ht->prepare(ht, thd, all)))
862
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
863
ha_rollback_trans(thd, all);
870
push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
871
ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
872
ha_resolve_storage_engine_name(ht));
880
Check if we can skip the two-phase commit.
882
A helper function to evaluate if two-phase commit is mandatory.
883
As a side effect, propagates the read-only/read-write flags
884
of the statement transaction to its enclosing normal transaction.
886
@retval true we must run a two-phase commit. Returned
887
if we have at least two engines with read-write changes.
888
@retval false Don't need two-phase commit. Even if we have two
889
transactional engines, we can run two independent
890
commits if changes in one of the engines are read-only.
895
ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
898
/* The number of storage engines that have actual changes. */
899
unsigned rw_ha_count= 0;
900
Ha_trx_info *ha_info;
902
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
904
if (ha_info->is_trx_read_write())
909
Ha_trx_info *ha_info_all= &thd->ha_data[ha_info->ht()->slot].ha_info[1];
910
assert(ha_info != ha_info_all);
912
Merge read-only/read-write information about statement
913
transaction to its enclosing normal transaction. Do this
914
only if in a real transaction -- that is, if we know
915
that ha_info_all is registered in thd->transaction.all.
916
Since otherwise we only clutter the normal transaction flags.
918
if (ha_info_all->is_started()) /* false if autocommit. */
919
ha_info_all->coalesce_trx_with(ha_info);
921
else if (rw_ha_count > 1)
924
It is a normal transaction, so we don't need to merge read/write
925
information up, and the need for two-phase commit has been
926
already established. Break the loop prematurely.
931
return rw_ha_count > 1;
939
1 transaction was rolled back
941
2 error during commit, data may be inconsistent
944
Since we don't support nested statement transactions in 5.0,
945
we can't commit or rollback stmt transactions while we are inside
946
stored functions or triggers. So we simply do nothing now.
947
TODO: This should be fixed in later ( >= 5.1) releases.
949
int ha_commit_trans(THD *thd, bool all)
951
int error= 0, cookie= 0;
953
'all' means that this is either an explicit commit issued by
954
user, or an implicit commit issued by a DDL.
956
THD_TRANS *trans= all ? &thd->transaction.all : &thd->transaction.stmt;
957
bool is_real_trans= all || thd->transaction.all.ha_list == 0;
958
Ha_trx_info *ha_info= trans->ha_list;
959
my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
962
We must not commit the normal transaction if a statement
963
transaction is pending. Otherwise statement transaction
964
flags will not get propagated to its normal transaction's
967
assert(thd->transaction.stmt.ha_list == NULL ||
968
trans == &thd->transaction.stmt);
970
if (thd->in_sub_stmt)
973
Since we don't support nested statement transactions in 5.0,
974
we can't commit or rollback stmt transactions while we are inside
975
stored functions or triggers. So we simply do nothing now.
976
TODO: This should be fixed in later ( >= 5.1) releases.
981
We assume that all statements which commit or rollback main transaction
982
are prohibited inside of stored functions or triggers. So they should
983
bail out with error even before ha_commit_trans() call. To be 100% safe
984
let us throw error in non-debug builds.
987
my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
994
if (is_real_trans && wait_if_global_read_lock(thd, 0, 0))
996
ha_rollback_trans(thd, all);
1002
&& ! thd->slave_thread
1005
my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
1006
ha_rollback_trans(thd, all);
1011
must_2pc= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
1013
if (!trans->no_2pc && must_2pc)
1015
for (; ha_info && !error; ha_info= ha_info->next())
1018
handlerton *ht= ha_info->ht();
1020
Do not call two-phase commit if this particular
1021
transaction is read-only. This allows for simpler
1022
implementation in engines that are always read-only.
1024
if (! ha_info->is_trx_read_write())
1027
Sic: we know that prepare() is not NULL since otherwise
1028
trans->no_2pc would have been set.
1030
if ((err= ht->prepare(ht, thd, all)))
1032
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1035
status_var_increment(thd->status_var.ha_prepare_count);
1037
if (error || (is_real_trans && xid &&
1038
(error= !(cookie= tc_log->log_xid(thd, xid)))))
1040
ha_rollback_trans(thd, all);
1045
error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
1047
tc_log->unlog(cookie, xid);
1050
start_waiting_global_read_lock(thd);
1057
This function does not care about global read lock. A caller should.
1059
int ha_commit_one_phase(THD *thd, bool all)
1062
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1063
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1064
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1067
for (; ha_info; ha_info= ha_info_next)
1070
handlerton *ht= ha_info->ht();
1071
if ((err= ht->commit(ht, thd, all)))
1073
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1076
status_var_increment(thd->status_var.ha_commit_count);
1077
ha_info_next= ha_info->next();
1078
ha_info->reset(); /* keep it conveniently zero-filled */
1083
thd->transaction.xid_state.xid.null();
1086
thd->variables.tx_isolation=thd->session_tx_isolation;
1087
thd->transaction.cleanup();
1094
int ha_rollback_trans(THD *thd, bool all)
1097
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1098
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1099
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1102
We must not rollback the normal transaction if a statement
1103
transaction is pending.
1105
assert(thd->transaction.stmt.ha_list == NULL ||
1106
trans == &thd->transaction.stmt);
1108
if (thd->in_sub_stmt)
1111
If we are inside stored function or trigger we should not commit or
1112
rollback current statement transaction. See comment in ha_commit_trans()
1113
call for more information.
1118
my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
1123
for (; ha_info; ha_info= ha_info_next)
1126
handlerton *ht= ha_info->ht();
1127
if ((err= ht->rollback(ht, thd, all)))
1129
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1132
status_var_increment(thd->status_var.ha_rollback_count);
1133
ha_info_next= ha_info->next();
1134
ha_info->reset(); /* keep it conveniently zero-filled */
1139
thd->transaction.xid_state.xid.null();
1142
thd->variables.tx_isolation=thd->session_tx_isolation;
1143
thd->transaction.cleanup();
1147
thd->transaction_rollback_request= false;
1150
If a non-transactional table was updated, warn; don't warn if this is a
1151
slave thread (because when a slave thread executes a ROLLBACK, it has
1152
been read from the binary log, so it's 100% sure and normal to produce
1153
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
1154
slave SQL thread, it would not stop the thread but just be printed in
1155
the error log; but we don't want users to wonder why they have this
1156
message in the error log, so we don't send it.
1158
if (is_real_trans && thd->transaction.all.modified_non_trans_table &&
1159
!thd->slave_thread && thd->killed != THD::KILL_CONNECTION)
1160
push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
1161
ER_WARNING_NOT_COMPLETE_ROLLBACK,
1162
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
1167
This is used to commit or rollback a single statement depending on
1171
Note that if the autocommit is on, then the following call inside
1172
InnoDB will commit or rollback the whole transaction (= the statement). The
1173
autocommit mechanism built into InnoDB is based on counting locks, but if
1174
the user has used LOCK TABLES then that mechanism does not know to do the
1177
int ha_autocommit_or_rollback(THD *thd, int error)
1179
if (thd->transaction.stmt.ha_list)
1183
if (ha_commit_trans(thd, 0))
1188
(void) ha_rollback_trans(thd, 0);
1189
if (thd->transaction_rollback_request && !thd->in_sub_stmt)
1190
(void) ha_rollback(thd);
1193
thd->variables.tx_isolation=thd->session_tx_isolation;
1204
static bool xacommit_handlerton(THD *unused1 __attribute__((__unused__)),
1208
handlerton *hton= plugin_data(plugin, handlerton *);
1209
if (hton->state == SHOW_OPTION_YES && hton->recover)
1211
hton->commit_by_xid(hton, ((struct xahton_st *)arg)->xid);
1212
((struct xahton_st *)arg)->result= 0;
1217
static bool xarollback_handlerton(THD *unused1 __attribute__((__unused__)),
1221
handlerton *hton= plugin_data(plugin, handlerton *);
1222
if (hton->state == SHOW_OPTION_YES && hton->recover)
1224
hton->rollback_by_xid(hton, ((struct xahton_st *)arg)->xid);
1225
((struct xahton_st *)arg)->result= 0;
1231
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
1233
struct xahton_st xaop;
1237
plugin_foreach(NULL, commit ? xacommit_handlerton : xarollback_handlerton,
1238
MYSQL_STORAGE_ENGINE_PLUGIN, &xaop);
1244
recover() step of xa.
1247
there are three modes of operation:
1248
- automatic recover after a crash
1249
in this case commit_list != 0, tc_heuristic_recover==0
1250
all xids from commit_list are committed, others are rolled back
1251
- manual (heuristic) recover
1252
in this case commit_list==0, tc_heuristic_recover != 0
1253
DBA has explicitly specified that all prepared transactions should
1254
be committed (or rolled back).
1255
- no recovery (MySQL did not detect a crash)
1256
in this case commit_list==0, tc_heuristic_recover == 0
1257
there should be no prepared transactions in this case.
1261
int len, found_foreign_xids, found_my_xids;
1267
static bool xarecover_handlerton(THD *unused __attribute__((__unused__)),
1271
handlerton *hton= plugin_data(plugin, handlerton *);
1272
struct xarecover_st *info= (struct xarecover_st *) arg;
1275
if (hton->state == SHOW_OPTION_YES && hton->recover)
1277
while ((got= hton->recover(hton, info->list, info->len)) > 0 )
1279
sql_print_information("Found %d prepared transaction(s) in %s",
1280
got, ha_resolve_storage_engine_name(hton));
1281
for (int i=0; i < got; i ++)
1283
my_xid x=info->list[i].get_my_xid();
1284
if (!x) // not "mine" - that is generated by external TM
1286
xid_cache_insert(info->list+i, XA_PREPARED);
1287
info->found_foreign_xids++;
1292
info->found_my_xids++;
1296
if (info->commit_list ?
1297
hash_search(info->commit_list, (uchar *)&x, sizeof(x)) != 0 :
1298
tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT)
1300
hton->commit_by_xid(hton, info->list+i);
1304
hton->rollback_by_xid(hton, info->list+i);
1307
if (got < info->len)
1314
int ha_recover(HASH *commit_list)
1316
struct xarecover_st info;
1317
info.found_foreign_xids= info.found_my_xids= 0;
1318
info.commit_list= commit_list;
1319
info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
1322
/* commit_list and tc_heuristic_recover cannot be set both */
1323
assert(info.commit_list==0 || tc_heuristic_recover==0);
1324
/* if either is set, total_ha_2pc must be set too */
1325
assert(info.dry_run || total_ha_2pc>(uint32_t)opt_bin_log);
1327
if (total_ha_2pc <= (uint32_t)opt_bin_log)
1330
if (info.commit_list)
1331
sql_print_information("Starting crash recovery...");
1334
#ifndef WILL_BE_DELETED_LATER
1337
for now, only InnoDB supports 2pc. It means we can always safely
1338
rollback all pending transactions, without risking inconsistent data
1341
assert(total_ha_2pc == (uint32_t) opt_bin_log+1); // only InnoDB and binlog
1342
tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
1347
for (info.len= MAX_XID_LIST_SIZE ;
1348
info.list==0 && info.len > MIN_XID_LIST_SIZE; info.len/=2)
1350
info.list=(XID *)my_malloc(info.len*sizeof(XID), MYF(0));
1354
sql_print_error(ER(ER_OUTOFMEMORY), info.len*sizeof(XID));
1358
plugin_foreach(NULL, xarecover_handlerton,
1359
MYSQL_STORAGE_ENGINE_PLUGIN, &info);
1361
my_free((uchar*)info.list, MYF(0));
1362
if (info.found_foreign_xids)
1363
sql_print_warning("Found %d prepared XA transactions",
1364
info.found_foreign_xids);
1365
if (info.dry_run && info.found_my_xids)
1367
sql_print_error("Found %d prepared transactions! It means that mysqld was "
1368
"not shut down properly last time and critical recovery "
1369
"information (last binlog or %s file) was manually deleted "
1370
"after a crash. You have to start mysqld with "
1371
"--tc-heuristic-recover switch to commit or rollback "
1372
"pending transactions.",
1373
info.found_my_xids, opt_tc_log_file);
1376
if (info.commit_list)
1377
sql_print_information("Crash recovery finished.");
1382
return the list of XID's to a client, the same way SHOW commands do.
1385
I didn't find in XA specs that an RM cannot return the same XID twice,
1386
so mysql_xa_recover does not filter XID's to ensure uniqueness.
1387
It can be easily fixed later, if necessary.
1389
bool mysql_xa_recover(THD *thd)
1391
List<Item> field_list;
1392
Protocol *protocol= thd->protocol;
1396
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1397
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1398
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1399
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1401
if (protocol->send_fields(&field_list,
1402
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1405
pthread_mutex_lock(&LOCK_xid_cache);
1406
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1408
if (xs->xa_state==XA_PREPARED)
1410
protocol->prepare_for_resend();
1411
protocol->store_int64_t((int64_t)xs->xid.formatID, false);
1412
protocol->store_int64_t((int64_t)xs->xid.gtrid_length, false);
1413
protocol->store_int64_t((int64_t)xs->xid.bqual_length, false);
1414
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1416
if (protocol->write())
1418
pthread_mutex_unlock(&LOCK_xid_cache);
1424
pthread_mutex_unlock(&LOCK_xid_cache);
1431
This function should be called when MySQL sends rows of a SELECT result set
1432
or the EOF mark to the client. It releases a possible adaptive hash index
1433
S-latch held by thd in InnoDB and also releases a possible InnoDB query
1434
FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a thd to
1435
keep them over several calls of the InnoDB handler interface when a join
1436
is executed. But when we let control pass to the client they have
1437
to be released because if the application program uses mysql_use_result(),
1438
it may deadlock on the S-latch if the application on another connection
1439
performs another SQL query. In MySQL-4.1 this is even more important because
1440
there a connection can have several SELECT queries open at the same time.
1442
@param thd the thread handle of the current connection
1447
static bool release_temporary_latches(THD *thd, plugin_ref plugin,
1448
void *unused __attribute__((__unused__)))
1450
handlerton *hton= plugin_data(plugin, handlerton *);
1452
if (hton->state == SHOW_OPTION_YES && hton->release_temporary_latches)
1453
hton->release_temporary_latches(hton, thd);
1459
int ha_release_temporary_latches(THD *thd)
1461
plugin_foreach(thd, release_temporary_latches, MYSQL_STORAGE_ENGINE_PLUGIN,
1467
int ha_rollback_to_savepoint(THD *thd, SAVEPOINT *sv)
1470
THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
1471
&thd->transaction.all);
1472
Ha_trx_info *ha_info, *ha_info_next;
1476
rolling back to savepoint in all storage engines that were part of the
1477
transaction when the savepoint was set
1479
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
1482
handlerton *ht= ha_info->ht();
1484
assert(ht->savepoint_set != 0);
1485
if ((err= ht->savepoint_rollback(ht, thd,
1486
(uchar *)(sv+1)+ht->savepoint_offset)))
1488
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1491
status_var_increment(thd->status_var.ha_savepoint_rollback_count);
1492
trans->no_2pc|= ht->prepare == 0;
1495
rolling back the transaction in all storage engines that were not part of
1496
the transaction when the savepoint was set
1498
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
1499
ha_info= ha_info_next)
1502
handlerton *ht= ha_info->ht();
1503
if ((err= ht->rollback(ht, thd, !thd->in_sub_stmt)))
1505
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1508
status_var_increment(thd->status_var.ha_rollback_count);
1509
ha_info_next= ha_info->next();
1510
ha_info->reset(); /* keep it conveniently zero-filled */
1512
trans->ha_list= sv->ha_list;
1518
according to the sql standard (ISO/IEC 9075-2:2003)
1519
section "4.33.4 SQL-statements and transaction states",
1520
SAVEPOINT is *not* transaction-initiating SQL-statement
1522
int ha_savepoint(THD *thd, SAVEPOINT *sv)
1525
THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
1526
&thd->transaction.all);
1527
Ha_trx_info *ha_info= trans->ha_list;
1528
for (; ha_info; ha_info= ha_info->next())
1531
handlerton *ht= ha_info->ht();
1533
if (! ht->savepoint_set)
1535
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1539
if ((err= ht->savepoint_set(ht, thd, (uchar *)(sv+1)+ht->savepoint_offset)))
1541
my_error(ER_GET_ERRNO, MYF(0), err);
1544
status_var_increment(thd->status_var.ha_savepoint_count);
1547
Remember the list of registered storage engines. All new
1548
engines are prepended to the beginning of the list.
1550
sv->ha_list= trans->ha_list;
1554
int ha_release_savepoint(THD *thd, SAVEPOINT *sv)
1557
Ha_trx_info *ha_info= sv->ha_list;
1559
for (; ha_info; ha_info= ha_info->next())
1562
handlerton *ht= ha_info->ht();
1563
/* Savepoint life time is enclosed into transaction life time. */
1565
if (!ht->savepoint_release)
1567
if ((err= ht->savepoint_release(ht, thd,
1568
(uchar *)(sv+1) + ht->savepoint_offset)))
1570
my_error(ER_GET_ERRNO, MYF(0), err);
1578
static bool snapshot_handlerton(THD *thd, plugin_ref plugin, void *arg)
1580
handlerton *hton= plugin_data(plugin, handlerton *);
1581
if (hton->state == SHOW_OPTION_YES &&
1582
hton->start_consistent_snapshot)
1584
hton->start_consistent_snapshot(hton, thd);
1585
*((bool *)arg)= false;
1590
int ha_start_consistent_snapshot(THD *thd)
1594
plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn);
1597
Same idea as when one wants to CREATE TABLE in one engine which does not
1601
push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
1602
"This MySQL server does not support any "
1603
"consistent-read capable storage engine");
1608
static bool flush_handlerton(THD *thd __attribute__((__unused__)),
1610
void *arg __attribute__((__unused__)))
1612
handlerton *hton= plugin_data(plugin, handlerton *);
1613
if (hton->state == SHOW_OPTION_YES && hton->flush_logs &&
1614
hton->flush_logs(hton))
1620
bool ha_flush_logs(handlerton *db_type)
1622
if (db_type == NULL)
1624
if (plugin_foreach(NULL, flush_handlerton,
1625
MYSQL_STORAGE_ENGINE_PLUGIN, 0))
1630
if (db_type->state != SHOW_OPTION_YES ||
1631
(db_type->flush_logs && db_type->flush_logs(db_type)))
1637
static const char *check_lowercase_names(handler *file, const char *path,
1640
if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1643
/* Ensure that the table handler gets the path in lower case */
1644
if (tmp_path != path)
1645
strmov(tmp_path, path);
1648
we should only turn the database/table part into lowercase,
1649
so start the process after the home directory
1651
my_casedn_str(files_charset_info, tmp_path + mysql_data_home_len);
1657
An interceptor to hijack the text of the error message without
1658
setting an error in the thread. We need the text to present it
1659
in the form of a warning to the user.
1662
struct Ha_delete_table_error_handler: public Internal_error_handler
1665
virtual bool handle_error(uint sql_errno,
1666
const char *message,
1667
MYSQL_ERROR::enum_warning_level level,
1669
char buff[MYSQL_ERRMSG_SIZE];
1674
Ha_delete_table_error_handler::
1675
handle_error(uint sql_errno __attribute__((__unused__)),
1676
const char *message,
1677
MYSQL_ERROR::enum_warning_level level __attribute__((__unused__)),
1678
THD *thd __attribute__((__unused__)))
1680
/* Grab the error message */
1681
strmake(buff, message, sizeof(buff)-1);
1687
This should return ENOENT if the file doesn't exist.
1688
The .frm file will be deleted only if we return 0 or ENOENT
1690
int ha_delete_table(THD *thd, handlerton *table_type, const char *path,
1691
const char *db, const char *alias, bool generate_warning)
1694
char tmp_path[FN_REFLEN];
1697
TABLE_SHARE dummy_share;
1699
bzero((char*) &dummy_table, sizeof(dummy_table));
1700
bzero((char*) &dummy_share, sizeof(dummy_share));
1701
dummy_table.s= &dummy_share;
1703
/* DB_TYPE_UNKNOWN is used in ALTER TABLE when renaming only .frm files */
1704
if (table_type == NULL ||
1705
! (file=get_new_handler((TABLE_SHARE*)0, thd->mem_root, table_type)))
1708
path= check_lowercase_names(file, path, tmp_path);
1709
if ((error= file->ha_delete_table(path)) && generate_warning)
1712
Because file->print_error() uses my_error() to generate the error message
1713
we use an internal error handler to intercept it and store the text
1714
in a temporary buffer. Later the message will be presented to user
1717
Ha_delete_table_error_handler ha_delete_table_error_handler;
1719
/* Fill up structures that print_error may need */
1720
dummy_share.path.str= (char*) path;
1721
dummy_share.path.length= strlen(path);
1722
dummy_share.db.str= (char*) db;
1723
dummy_share.db.length= strlen(db);
1724
dummy_share.table_name.str= (char*) alias;
1725
dummy_share.table_name.length= strlen(alias);
1726
dummy_table.alias= alias;
1728
file->change_table_ptr(&dummy_table, &dummy_share);
1730
thd->push_internal_handler(&ha_delete_table_error_handler);
1731
file->print_error(error, 0);
1733
thd->pop_internal_handler();
1736
XXX: should we convert *all* errors to warnings here?
1737
What if the error is fatal?
1739
push_warning(thd, MYSQL_ERROR::WARN_LEVEL_ERROR, error,
1740
ha_delete_table_error_handler.buff);
54
1746
/****************************************************************************
55
** General Cursor functions
1747
** General handler functions
56
1748
****************************************************************************/
57
Cursor::Cursor(plugin::StorageEngine &engine_arg,
61
estimation_rows_to_insert(0),
63
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
64
ref_length(sizeof(internal::my_off_t)),
67
next_insert_id(0), insert_id_for_cur_row(0)
72
assert(locked == false);
73
/* TODO: assert(inited == NONE); */
78
* @note this is only used in
79
* optimizer::QuickRangeSelect::init_ror_merged_scan(bool reuse_handler) as
80
* of the writing of this comment. -Brian
82
Cursor *Cursor::clone(memory::Root *mem_root)
84
Cursor *new_handler= getTable()->getMutableShare()->db_type()->getCursor(*getTable());
1749
handler *handler::clone(MEM_ROOT *mem_root)
1751
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
87
Allocate Cursor->ref here because otherwise ha_open will allocate it
88
on this->table->mem_root and we will not be able to reclaim that memory
89
when the clone Cursor object is destroyed.
1753
Allocate handler->ref here because otherwise ha_open will allocate it
1754
on this->table->mem_root and we will not be able to reclaim that memory
1755
when the clone handler object is destroyed.
91
if (!(new_handler->ref= (unsigned char*) mem_root->alloc_root(ALIGN_SIZE(ref_length)*2)))
1757
if (!(new_handler->ref= (uchar*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
94
identifier::Table identifier(getTable()->getShare()->getSchemaName(),
95
getTable()->getShare()->getTableName(),
96
getTable()->getShare()->getType());
98
if (new_handler && !new_handler->ha_open(identifier,
99
getTable()->getDBStat(),
1759
if (new_handler && !new_handler->ha_open(table,
1760
table->s->normalized_path.str,
100
1762
HA_OPEN_IGNORE_IF_LOCKED))
101
1763
return new_handler;
107
given a buffer with a key value, and a map of keyparts
108
that are present in this value, returns the length of the value
110
uint32_t Cursor::calculate_key_len(uint32_t key_position, key_part_map keypart_map_arg)
112
/* works only with key prefixes */
113
assert(((keypart_map_arg + 1) & keypart_map_arg) == 0);
115
const KeyPartInfo *key_part_found= getTable()->getShare()->getKeyInfo(key_position).key_part;
116
const KeyPartInfo *end_key_part_found= key_part_found + getTable()->getShare()->getKeyInfo(key_position).key_parts;
119
while (key_part_found < end_key_part_found && keypart_map_arg)
121
length+= key_part_found->store_length;
122
keypart_map_arg >>= 1;
128
int Cursor::startIndexScan(uint32_t idx, bool sorted)
131
assert(inited == NONE);
132
if (!(result= doStartIndexScan(idx, sorted)))
138
int Cursor::endIndexScan()
140
assert(inited==INDEX);
143
return(doEndIndexScan());
146
int Cursor::startTableScan(bool scan)
149
assert(inited==NONE || (inited==RND && scan));
150
inited= (result= doStartTableScan(scan)) ? NONE: RND;
155
int Cursor::endTableScan()
159
return(doEndTableScan());
162
int Cursor::ha_index_or_rnd_end()
164
return inited == INDEX ? endIndexScan() : inited == RND ? endTableScan() : 0;
167
void Cursor::ha_start_bulk_insert(ha_rows rows)
169
estimation_rows_to_insert= rows;
170
start_bulk_insert(rows);
173
int Cursor::ha_end_bulk_insert()
175
estimation_rows_to_insert= 0;
176
return end_bulk_insert();
179
const key_map *Cursor::keys_to_use_for_scanning()
181
return &key_map_empty;
184
bool Cursor::has_transactions()
186
return (getTable()->getShare()->db_type()->check_flag(HTON_BIT_DOES_TRANSACTIONS));
189
void Cursor::ha_statistic_increment(uint64_t system_status_var::*offset) const
191
(getTable()->in_use->status_var.*offset)++;
194
void **Cursor::ha_data(Session *session) const
196
return session->getEngineData(getEngine());
199
bool Cursor::is_fatal_error(int error, uint32_t flags)
202
((flags & HA_CHECK_DUP_KEY) &&
203
(error == HA_ERR_FOUND_DUPP_KEY ||
204
error == HA_ERR_FOUND_DUPP_UNIQUE)))
210
/* Report the row count held in this cursor's stats structure. */
ha_rows Cursor::records()
{
  return stats.records;
}
211
/* Sum of the index file length and the data file length taken from stats. */
uint64_t Cursor::tableSize()
{
  return stats.index_file_length + stats.data_file_length;
}
212
/*
  Size of one row: the table's record length plus whatever
  getTable()->sizeFields() reports.
*/
uint64_t Cursor::rowSize()
{
  return getTable()->getRecordLength() + getTable()->sizeFields();
}
214
int Cursor::doOpen(const identifier::Table &identifier, int mode, uint32_t test_if_locked)
216
return open(identifier.getPath().c_str(), mode, test_if_locked);
1769
void handler::ha_statistic_increment(ulong SSV::*offset) const
1771
status_var_increment(table->in_use->status_var.*offset);
1774
void **handler::ha_data(THD *thd) const
1776
return thd_ha_data(thd, ht);
1779
THD *handler::ha_thd(void) const
1781
assert(!table || !table->in_use || table->in_use == current_thd);
1782
return (table && table->in_use) ? table->in_use : current_thd;
220
Open database-Cursor.
1786
Open database-handler.
222
1788
Try O_RDONLY if cannot open as O_RDWR
223
1789
Don't wait for locks unless HA_OPEN_WAIT_IF_LOCKED is set
225
int Cursor::ha_open(const identifier::Table &identifier,
1791
int handler::ha_open(TABLE *table_arg, const char *name, int mode,
231
if ((error= doOpen(identifier, mode, test_if_locked)))
1797
assert(table->s == table_share);
1798
assert(alloc_root_inited(&table->mem_root));
1800
if ((error=open(name,mode,test_if_locked)))
233
1802
if ((error == EACCES || error == EROFS) && mode == O_RDWR &&
234
(getTable()->db_stat & HA_TRY_READ_ONLY))
1803
(table->db_stat & HA_TRY_READ_ONLY))
236
getTable()->db_stat|=HA_READ_ONLY;
237
error= doOpen(identifier, O_RDONLY,test_if_locked);
1805
table->db_stat|=HA_READ_ONLY;
1806
error=open(name,O_RDONLY,test_if_locked);
242
errno= error; /* Safeguard */
1811
my_errno= error; /* Safeguard */
246
if (getTable()->getShare()->db_options_in_use & HA_OPTION_READ_ONLY_DATA)
247
getTable()->db_stat|=HA_READ_ONLY;
1815
if (table->s->db_options_in_use & HA_OPTION_READ_ONLY_DATA)
1816
table->db_stat|=HA_READ_ONLY;
248
1817
(void) extra(HA_EXTRA_NO_READCHECK); // Not needed in SQL
250
/* ref is already allocated for us if we're called from Cursor::clone() */
251
if (!ref && !(ref= (unsigned char*) getTable()->alloc_root(ALIGN_SIZE(ref_length)*2)))
1819
/* ref is already allocated for us if we're called from handler::clone() */
1820
if (!ref && !(ref= (uchar*) alloc_root(&table->mem_root,
1821
ALIGN_SIZE(ref_length)*2)))
254
1824
error=HA_ERR_OUT_OF_MEM;
257
1827
dup_ref=ref+ALIGN_SIZE(ref_length);
1828
cached_table_flags= table_flags();
1834
One has to use this method to find a
1835
random position by record as the plain
1836
position() call doesn't work for some
1837
handlers for random position
1840
int handler::rnd_pos_by_record(uchar *record)
1845
if (inited && (error= ha_index_end()))
1847
if ((error= ha_rnd_init(false)))
1850
return(rnd_pos(record, ref));
1149
3678
while ((result == HA_ERR_END_OF_FILE) && !range_res);
1151
3680
*range_info= mrr_cur_range.ptr;
3685
/* **************************************************************************
3686
* DS-MRR implementation
3687
***************************************************************************/
3690
DS-MRR: Initialize and start MRR scan
3692
Initialize and start the MRR scan. Depending on the mode parameter, this
3693
may use default or DS-MRR implementation.
3695
@param h Table handler to be used
3696
@param key Index to be used
3697
@param seq_funcs Interval sequence enumeration functions
3698
@param seq_init_param Interval sequence enumeration parameter
3699
@param n_ranges Number of ranges in the sequence.
3700
@param mode HA_MRR_* modes to use
3701
@param buf INOUT Buffer to use
3703
@retval 0 Ok, Scan started.
3707
int DsMrr_impl::dsmrr_init(handler *h, KEY *key,
3708
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3709
uint n_ranges, uint mode, HANDLER_BUFFER *buf)
3713
Item *pushed_cond= NULL;
3715
keyno= h->active_index;
3717
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3719
use_default_impl= true;
3720
return(h->handler::multi_range_read_init(seq_funcs, seq_init_param,
3721
n_ranges, mode, buf));
3723
rowids_buf= buf->buffer;
3724
//psergey-todo: don't add key_length as it is not needed anymore
3725
rowids_buf += key->key_length + h->ref_length;
3727
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3728
rowids_buf_end= buf->buffer_end;
3730
elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3731
rowids_buf_last= rowids_buf +
3732
((rowids_buf_end - rowids_buf)/ elem_size)*
3734
rowids_buf_end= rowids_buf_last;
3736
/* Create a separate handler object to do rndpos() calls. */
3737
THD *thd= current_thd;
3738
if (!(new_h2= h->clone(thd->mem_root)) ||
3739
new_h2->ha_external_lock(thd, F_RDLCK))
3745
if (keyno == h->pushed_idx_cond_keyno)
3746
pushed_cond= h->pushed_idx_cond;
3747
if (h->ha_index_end())
3754
table->prepare_for_position();
3755
new_h2->extra(HA_EXTRA_KEYREAD);
3757
if (h2->ha_index_init(keyno, false) ||
3758
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3761
use_default_impl= false;
3764
h2->idx_cond_push(keyno, pushed_cond);
3765
if (dsmrr_fill_buffer(new_h2))
3769
If the above call has scanned through all intervals in *seq, then
3770
adjust *buf to indicate that the remaining buffer space will not be used.
3773
buf->end_of_used_area= rowids_buf_last;
3775
if (h->ha_rnd_init(false))
3780
h2->ha_index_or_rnd_end();
3781
h2->ha_external_lock(thd, F_UNLCK);
3788
void DsMrr_impl::dsmrr_close()
3792
h2->ha_external_lock(current_thd, F_UNLCK);
3797
use_default_impl= true;
3802
static int rowid_cmp(void *h, uchar *a, uchar *b)
3804
return ((handler*)h)->cmp_ref(a, b);
3809
DS-MRR: Fill the buffer with rowids and sort it by rowid
3811
{This is an internal function of DiskSweep MRR implementation}
3812
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
3813
buffer. When the buffer is full or scan is completed, sort the buffer by
3816
The function assumes that rowids buffer is empty when it is invoked.
3818
@param h Table handler
3820
@retval 0 OK, the next portion of rowids is in the buffer,
3825
int DsMrr_impl::dsmrr_fill_buffer(handler *unused __attribute__((__unused__)))
3830
rowids_buf_cur= rowids_buf;
3831
while ((rowids_buf_cur < rowids_buf_end) &&
3832
!(res= h2->handler::multi_range_read_next(&range_info)))
3834
/* Put rowid, or {rowid, range_id} pair into the buffer */
3835
h2->position(table->record[0]);
3836
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
3837
rowids_buf_cur += h->ref_length;
3841
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
3842
rowids_buf_cur += sizeof(void*);
3846
if (res && res != HA_ERR_END_OF_FILE)
3848
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
3850
/* Sort the buffer contents by rowid */
3851
uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3852
uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
3854
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
3856
rowids_buf_last= rowids_buf_cur;
3857
rowids_buf_cur= rowids_buf;
3863
DS-MRR implementation: multi_range_read_next() function
3866
int DsMrr_impl::dsmrr_next(handler *h, char **range_info)
3870
if (use_default_impl)
3871
return h->handler::multi_range_read_next(range_info);
3873
if (rowids_buf_cur == rowids_buf_last)
3877
res= HA_ERR_END_OF_FILE;
3880
res= dsmrr_fill_buffer(h);
3885
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
3886
if (rowids_buf_cur == rowids_buf_last)
3888
res= HA_ERR_END_OF_FILE;
3892
res= h->rnd_pos(table->record[0], rowids_buf_cur);
3893
rowids_buf_cur += h->ref_length;
3896
memcpy(range_info, rowids_buf_cur, sizeof(void*));
3897
rowids_buf_cur += sizeof(void*);
3908
DS-MRR implementation: multi_range_read_info() function
3910
int DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, uint *bufsz,
3911
uint *flags, COST_VECT *cost)
3914
uint def_flags= *flags;
3915
uint def_bufsz= *bufsz;
3917
/* Get cost/flags/mem_usage of default MRR implementation */
3918
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
3922
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3923
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
3925
/* Default implementation is chosen */
3934
DS-MRR Implementation: multi_range_read_info_const() function
3937
ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
3938
void *seq_init_param, uint n_ranges,
3939
uint *bufsz, uint *flags, COST_VECT *cost)
3942
uint def_flags= *flags;
3943
uint def_bufsz= *bufsz;
3944
/* Get cost/flags/mem_usage of default MRR implementation */
3945
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
3946
n_ranges, &def_bufsz,
3948
if (rows == HA_POS_ERROR)
3950
/* Default implementation can't perform MRR scan => we can't either */
3955
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
3956
use the default MRR implementation (we need it for UPDATE/DELETE).
3957
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
3959
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3960
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
3967
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
3974
Check if key has partially-covered columns
3976
We can't use DS-MRR to perform range scans when the ranges are over
3977
partially-covered keys, because we'll not have full key part values
3978
(we'll have their prefixes from the index) and will not be able to check
3979
if we've reached the end of the range.
3981
@param keyno Key to check
3984
Allow use of DS-MRR in cases where the index has partially-covered
3985
components but they are not used for scanning.
3991
/*
  Scan the key parts of index keyno looking for a part whose field does not
  have keyno set in its part_of_key bitmap.

  NOTE(review): the return statements are not visible here; presumably the
  function returns true on the first such part and false otherwise - confirm.
*/
bool DsMrr_impl::key_uses_partial_cols(uint keyno)
3993
/* kp walks the key part array; kp_end is one past its last element */
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
3994
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
3995
for (; kp != kp_end; kp++)
3997
/* The field behind this key part is not flagged as part of index keyno */
if (!kp->field->part_of_key.is_set(keyno))
4005
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
4007
Make the choice between using Default MRR implementation and DS-MRR.
4008
This function contains common functionality factored out of dsmrr_info()
4009
and dsmrr_info_const(). The function assumes that the default MRR
4010
implementation's applicability requirements are satisfied.
4012
@param keyno Index number
4013
@param rows E(full rows to be retrieved)
4014
@param flags IN MRR flags provided by the MRR user
4015
OUT If DS-MRR is chosen, flags of DS-MRR implementation
4016
else the value is not modified
4017
@param bufsz IN If DS-MRR is chosen, buffer use of DS-MRR implementation
4018
else the value is not modified
4019
@param cost IN Cost of default MRR implementation
4020
OUT If DS-MRR is chosen, cost of DS-MRR scan
4021
else the value is not modified
4023
@retval true Default MRR implementation should be used
4024
@retval false DS-MRR implementation should be used
4027
/*
  Decide between the default MRR implementation and DS-MRR for index keyno.

  Visible flow:
    1. Rule DS-MRR out up front (and set HA_MRR_USE_DEFAULT_IMPL in *flags)
       for any of the conditions tested in the first if statement.
    2. Cost the DS-MRR scan with get_disk_sweep_mrr_cost() into dsmrr_cost.
    3. Pick DS-MRR when it is forced through optimizer_use_mrr or when its
       total cost does not exceed the cost passed in by the caller; in that
       case HA_MRR_USE_DEFAULT_IMPL and HA_MRR_SORTED are cleared in *flags.
*/
bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
4028
uint *bufsz, COST_VECT *cost)
4030
COST_VECT dsmrr_cost;
4032
THD *thd= current_thd;
4033
/*
  DS-MRR is skipped without costing it when optimizer_use_mrr is 2
  (presumably the disabled setting - TODO confirm), when the caller asked for
  an index-only or sorted scan, when keyno is a clustered primary key, or
  when the key has partially-covered columns.
*/
if ((thd->variables.optimizer_use_mrr == 2) ||
4034
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
4035
(keyno == table->s->primary_key &&
4036
h->primary_key_is_clustered()) ||
4037
key_uses_partial_cols(keyno))
4039
/* Use the default implementation */
4040
*flags |= HA_MRR_USE_DEFAULT_IMPL;
4044
/*
  Combined length of one key image and one rowid.
  NOTE(review): no use of add_len is visible in the lines below - confirm
  where it is consumed.
*/
uint add_len= table->key_info[keyno].key_length + h->ref_length;
4046
/* A true return from the cost function means DS-MRR cannot be used */
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
4052
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
4053
DS-MRR and Default implementations cost. This allows one to force use of
4054
DS-MRR whenever it is applicable without affecting other cost-based
4057
/* force_dsmrr is assigned inside the condition: true when optimizer_use_mrr is 1 */
if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) &&
4058
dsmrr_cost.total_cost() > cost->total_cost())
4061
/* DS-MRR wins when forced, or when it is no more expensive than the default */
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
4063
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
4064
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
4070
/* Use the default MRR implementation */
4077
/*
  Forward declaration: get_disk_sweep_mrr_cost() below calls this helper
  before its definition appears further down in the file.
*/
static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost);
4081
Get cost of DS-MRR scan
4083
@param keynr Index to be used
4084
@param rows E(Number of rows to be scanned)
4085
@param flags Scan parameters (HA_MRR_* flags)
4086
@param buffer_size INOUT Buffer size
4087
@param cost OUT The cost
4090
@retval true Error, DS-MRR cannot be used (the buffer is too small
4094
/*
  Estimate the cost of a DS-MRR scan of rows rows over index keynr, using a
  rowid buffer of *buffer_size bytes.

  Visible flow:
    - Derive the size of one buffer element and how many elements fit in the
      buffer; return true when not even one fits.
    - Split the scan into n_full_steps passes with a full buffer plus a last
      pass of (rows % max_buff_entries) rows.
    - Accumulate sort-and-sweep costs from get_sort_and_sweep_cost() into
      cost, set cost->mem_cost, and finally add the index-only read time for
      all rows as IO with an average cost of 1 (a random seek).

  *buffer_size may be rewritten by the max() assignment in the body.

  NOTE(review): rows_in_full_step is assigned but never read in the visible
  code; the full-step cost is computed by calling get_sort_and_sweep_cost()
  with the total rows and then multiplying by n_full_steps. Confirm whether
  rows_in_full_step was meant to be passed instead.

  NOTE(review): the buffer-size adjustment uses max(), which can only keep or
  grow *buffer_size, while the comment above it talks about using only part
  of the buffer. Confirm whether min() was intended.

  NOTE(review): cost->mem_cost is assigned twice in the visible lines, once
  guarded by n_full_steps != 0; presumably the second assignment belongs to
  an else branch - confirm.
*/
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
4095
uint *buffer_size, COST_VECT *cost)
4097
uint32_t max_buff_entries, elem_size;
4098
ha_rows rows_in_full_step, rows_in_last_step;
4100
double index_read_cost;
4102
/*
  One buffer element is a rowid of h->ref_length bytes plus, unless
  HA_MRR_NO_ASSOCIATION is set, one pointer-sized slot.
*/
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
4103
max_buff_entries = *buffer_size / elem_size;
4105
/* This early exit also keeps the division and modulo below from using a zero divisor */
if (!max_buff_entries)
4106
return true; /* Buffer has not enough space for even 1 rowid */
4108
/* Number of iterations we'll make with full buffer */
4109
n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
4112
Get numbers of rows we'll be processing in
4113
- non-last sweep, with full buffer
4114
- last iteration, with non-full buffer
4116
rows_in_full_step= max_buff_entries;
4117
rows_in_last_step= rows % max_buff_entries;
4119
/* Adjust buffer size if we expect to use only part of the buffer */
4122
get_sort_and_sweep_cost(table, rows, cost);
4123
cost->multiply(n_full_steps);
4128
*buffer_size= max(*buffer_size,
4129
(size_t)(1.2*rows_in_last_step) * elem_size +
4130
h->ref_length + table->key_info[keynr].key_length);
4133
COST_VECT last_step_cost;
4134
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
4135
cost->add(&last_step_cost);
4137
if (n_full_steps != 0)
4138
cost->mem_cost= *buffer_size;
4140
cost->mem_cost= (double)rows_in_last_step * elem_size;
4142
/* Total cost of all index accesses */
4143
index_read_cost= h->index_only_read_time(keynr, (double)rows);
4144
cost->add_io(index_read_cost, 1 /* Random seeks */);
4150
Get cost of one sort-and-sweep step
4153
get_sort_and_sweep_cost()
4154
table Table being accessed
4155
nrows Number of rows to be sorted and retrieved
4159
Get cost of these operations:
4160
- sort an array of #nrows ROWIDs using qsort
4161
- read #nrows records from table in a sweep.
4165
/*
  Cost of one DS-MRR step: a non-interrupted sweep read of nrows rows
  (get_sweep_read_cost() with interrupted == false) plus a CPU term for
  sorting the rowids, which is added to cost->cpu_cost.

  NOTE(review): the comment in the body describes n * log2(n) * cost(cmp),
  but the code evaluates x * log2(x) with x = nrows / TIME_FOR_COMPARE_ROWID,
  so the scaling factor is also applied inside the logarithm. Confirm which
  form is intended.

  NOTE(review): with nrows == 0, x is 0 and 0 * log2(0) is NaN; for x below 1
  the added term is negative. get_disk_sweep_mrr_cost() can pass a remainder
  of 0 rows. Confirm that a guard outside the visible lines covers this.
*/
void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
4169
get_sweep_read_cost(table, nrows, false, cost);
4170
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
4171
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
4174
cost->cpu_cost += cmp_op * log2(cmp_op);
4182
Get cost of reading nrows table records in a "disk sweep"
4184
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls made
4185
for an ordered sequence of rowids.
4187
We assume hard disk IO. The read is performed as follows:
4189
1. The disk head is moved to the needed cylinder
4190
2. The controller waits for the plate to rotate
4191
3. The data is transferred
4193
Time to do #3 is insignificant compared to #2+#1.
4195
Time to move the disk head is proportional to head travel distance.
4197
Time to wait for the plate to rotate depends on whether the disk head
4200
If disk head wasn't moved, the wait time is proportional to distance
4201
between the previous block and the block we're reading.
4203
If the head was moved, we don't know how much we'll need to wait for the
4204
plate to rotate. We assume the wait time to be a variate with a mean of
4205
0.5 of full rotation time.
4207
Our cost units are "random disk seeks". The cost of random disk seek is
4208
actually not a constant; it depends on the range of cylinders we're going
4209
to access. We make it constant by introducing a fuzzy concept of "typical
4210
datafile length" (it's fuzzy as it's hard to tell whether it should
4211
include index file, temp.tables etc). Then random seek cost is:
4213
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
4215
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
4217
@param table Table to be accessed
4218
@param nrows Number of rows to retrieve
4219
@param interrupted true <=> Assume that the disk sweep will be
4220
interrupted by other disk IO. false - otherwise.
4221
@param cost OUT The cost.
4224
/*
  Fill in the IO part of cost for reading nrows rows from table in one sweep.

  Visible flow:
    - Clustered primary key: io_count is taken from the handler's read_time()
      for the primary key.
    - Otherwise: the data file is treated as a number of IO_SIZE blocks, the
      expression n_blocks * (1 - (1 - 1/n_blocks)^nrows) gives the expected
      number of distinct blocks hit when each row falls into a uniformly
      random block, and that value is stored in io_count. avg_io_cost is then
      derived from DISK_SEEK_BASE_COST and DISK_SEEK_PROP_COST.

  NOTE(review): the declarations that receive the two block expressions and
  the statement guarded by the busy_blocks test are not visible here;
  presumably they are n_blocks, busy_blocks and a clamp to 1.0 - confirm.
*/
void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
4228
if (table->file->primary_key_is_clustered())
4230
/* NOTE(review): the (uint) cast narrows nrows for the second argument - confirm large row counts are acceptable */
cost->io_count= table->file->read_time(table->s->primary_key,
4231
(uint) nrows, nrows);
4236
/* Data file length expressed in IO_SIZE blocks, rounded up */
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
4238
/* NOTE(review): divides by n_blocks; confirm data_file_length cannot be 0 on this path */
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
4239
if (busy_blocks < 1.0)
4242
cost->io_count= busy_blocks;
4246
/* Assume reading is done in one 'sweep' */
4247
/* The proportional seek term scales with the average gap between touched blocks */
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
4248
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);