1
/* -*- mode: c++; c-basic-offset: 2; indent-tabs-mode: nil; -*-
2
* vim:expandtab:shiftwidth=2:tabstop=2:smarttab:
4
* Copyright (C) 2008 Sun Microsystems, Inc.
6
* This program is free software; you can redistribute it and/or modify
7
* it under the terms of the GNU General Public License as published by
8
* the Free Software Foundation; version 2 of the License.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
1
/* Copyright (C) 2000-2006 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
23
19
Handler-calling-functions
30
#include "drizzled/error.h"
31
#include "drizzled/field/epoch.h"
32
#include "drizzled/gettext.h"
33
#include "drizzled/internal/my_sys.h"
34
#include "drizzled/item/empty_string.h"
35
#include "drizzled/item/int.h"
36
#include "drizzled/lock.h"
37
#include "drizzled/message/table.h"
38
#include "drizzled/my_hash.h"
39
#include "drizzled/optimizer/cost_vector.h"
40
#include "drizzled/plugin/client.h"
41
#include "drizzled/plugin/event_observer.h"
42
#include "drizzled/plugin/storage_engine.h"
43
#include "drizzled/probes.h"
44
#include "drizzled/session.h"
45
#include "drizzled/sql_base.h"
46
#include "drizzled/sql_parse.h"
47
#include "drizzled/transaction_services.h"
22
#ifdef USE_PRAGMA_IMPLEMENTATION
23
#pragma implementation // gcc: Class implementation
26
#include <drizzled/server_includes.h>
27
#include "rpl_filter.h"
28
#include <drizzled/drizzled_error_messages.h>
31
While we have legacy_db_type, we have this array to
32
check for dups and to find handlerton from legacy_db_type.
33
Remove when legacy_db_type is finally gone
35
st_plugin_int *hton2plugin[MAX_HA];
37
static handlerton *installed_htons[128];
39
#define BITMAP_STACKBUF_SIZE (128/8)
41
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NullS,0}, {NullS,0} };
43
/* number of entries in handlertons[] */
45
/* number of storage engines (from handlertons[]) that support 2pc */
46
uint32_t total_ha_2pc= 0;
47
/* size of savepoint storage area (see ha_init) */
48
uint32_t savepoint_alloc_size= 0;
50
static const LEX_STRING sys_table_aliases[]=
52
{ C_STRING_WITH_LEN("INNOBASE") }, { C_STRING_WITH_LEN("INNODB") },
53
{ C_STRING_WITH_LEN("HEAP") }, { C_STRING_WITH_LEN("MEMORY") },
57
const char *ha_row_type[] = {
58
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
61
const char *tx_isolation_names[] =
62
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
64
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
65
tx_isolation_names, NULL};
67
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
68
uint known_extensions_id= 0;
72
static plugin_ref ha_default_plugin(THD *thd)
74
if (thd->variables.table_plugin)
75
return thd->variables.table_plugin;
76
return my_plugin_lock(thd, &global_system_variables.table_plugin);
81
Return the default storage engine handlerton for thread
83
@param ha_default_handlerton(thd)
84
@param thd current thread
89
handlerton *ha_default_handlerton(THD *thd)
91
plugin_ref plugin= ha_default_plugin(thd);
93
handlerton *hton= plugin_data(plugin, handlerton*);
100
Return the storage engine handlerton for the supplied name
102
@param thd current thread
103
@param name name of storage engine
106
pointer to storage engine plugin handle
108
plugin_ref ha_resolve_by_name(THD *thd, const LEX_STRING *name)
110
const LEX_STRING *table_alias;
114
/* my_strnncoll is a macro and gcc doesn't do early expansion of macro */
115
if (thd && !my_charset_utf8_general_ci.coll->strnncoll(&my_charset_utf8_general_ci,
116
(const uchar *)name->str, name->length,
117
(const uchar *)STRING_WITH_LEN("DEFAULT"), 0))
118
return ha_default_plugin(thd);
120
if ((plugin= my_plugin_lock_by_name(thd, name, DRIZZLE_STORAGE_ENGINE_PLUGIN)))
122
handlerton *hton= plugin_data(plugin, handlerton *);
123
if (!(hton->flags & HTON_NOT_USER_SELECTABLE))
127
unlocking plugin immediately after locking is relatively low cost.
129
plugin_unlock(thd, plugin);
133
We check for the historical aliases.
135
for (table_alias= sys_table_aliases; table_alias->str; table_alias+= 2)
137
if (!my_strnncoll(&my_charset_utf8_general_ci,
138
(const uchar *)name->str, name->length,
139
(const uchar *)table_alias->str, table_alias->length))
141
name= table_alias + 1;
150
plugin_ref ha_lock_engine(THD *thd, handlerton *hton)
154
st_plugin_int **plugin= hton2plugin + hton->slot;
156
return my_plugin_lock(thd, &plugin);
162
handlerton *ha_resolve_by_legacy_type(THD *thd, enum legacy_db_type db_type)
166
case DB_TYPE_DEFAULT:
167
return ha_default_handlerton(thd);
169
if (db_type > DB_TYPE_UNKNOWN && db_type < DB_TYPE_DEFAULT &&
170
(plugin= ha_lock_engine(thd, installed_htons[db_type])))
171
return plugin_data(plugin, handlerton*);
173
case DB_TYPE_UNKNOWN:
180
Use another database handler if the requested database handler is not compiled in.
182
handlerton *ha_checktype(THD *thd, enum legacy_db_type database_type,
183
bool no_substitute, bool report_error)
185
handlerton *hton= ha_resolve_by_legacy_type(thd, database_type);
186
if (ha_storage_engine_is_enabled(hton))
193
const char *engine_name= ha_resolve_storage_engine_name(hton);
194
my_error(ER_FEATURE_DISABLED,MYF(0),engine_name,engine_name);
199
switch (database_type) {
201
return ha_resolve_by_legacy_type(thd, DB_TYPE_HASH);
206
return ha_default_handlerton(thd);
210
handler *get_new_handler(TABLE_SHARE *share, MEM_ROOT *alloc,
215
if (db_type && db_type->state == SHOW_OPTION_YES && db_type->create)
217
if ((file= db_type->create(db_type, share, alloc)))
222
Try the default table type
223
Here the call to current_thd() is ok as we call this function a lot of
224
times but we enter this branch very seldom.
226
return(get_new_handler(share, alloc, ha_default_handlerton(current_thd)));
231
Register handler error messages for use with my_error().
239
int ha_init_errors(void)
241
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
242
const char **errmsgs;
244
/* Allocate a pointer array for the error message strings. */
245
/* Zerofill it to avoid uninitialized gaps. */
246
if (! (errmsgs= (const char**) my_malloc(HA_ERR_ERRORS * sizeof(char*),
247
MYF(MY_WME | MY_ZEROFILL))))
250
/* Set the dedicated error messages. */
251
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
252
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
253
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
254
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
255
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
256
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
257
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
258
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
259
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
260
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
261
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
262
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
263
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
264
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
265
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
266
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
267
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
268
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
269
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
270
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
271
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
272
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
273
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
274
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
275
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
276
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
277
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
278
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
279
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
280
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
281
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
282
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
283
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
284
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
285
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
286
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
287
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
288
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
289
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
290
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
291
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
293
/* Register the error messages for use with my_error(). */
294
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
299
Unregister handler error messages.
306
static int ha_finish_errors(void)
308
const char **errmsgs;
310
/* Unregister the error messages and get back the pointer array so it can be freed. */
311
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
313
my_free((uchar*) errmsgs, MYF(0));
318
int ha_finalize_handlerton(st_plugin_int *plugin)
320
handlerton *hton= (handlerton *)plugin->data;
325
case SHOW_OPTION_DISABLED:
327
case SHOW_OPTION_YES:
328
if (installed_htons[hton->db_type] == hton)
329
installed_htons[hton->db_type]= NULL;
333
if (hton && plugin->plugin->deinit)
334
(void)plugin->plugin->deinit(hton);
336
my_free((uchar*)hton, MYF(0));
342
int ha_initialize_handlerton(st_plugin_int *plugin)
346
hton= (handlerton *)my_malloc(sizeof(handlerton),
347
MYF(MY_WME | MY_ZEROFILL));
349
FIXME: the MY_ZEROFILL flag above doesn't zero all the bytes.
351
This was detected after adding get_backup_engine member to handlerton
352
structure. Apparently get_backup_engine was not NULL even though it was
355
/* hton is a pointer: sizeof(hton) would clear only pointer-size bytes, so use sizeof(*hton) to zero the whole structure. */
memset(hton, 0, sizeof(*hton));
356
/* Historical Requirement */
357
plugin->data= hton; // shortcut for the future
358
if (plugin->plugin->init)
360
if (plugin->plugin->init(hton))
362
sql_print_error(_("Plugin '%s' init function returned error."),
369
the switch below and hton->state should be removed when
370
command-line options for plugins will be implemented
372
switch (hton->state) {
375
case SHOW_OPTION_YES:
378
/* now check the db_type for conflict */
379
if (hton->db_type <= DB_TYPE_UNKNOWN ||
380
hton->db_type >= DB_TYPE_DEFAULT ||
381
installed_htons[hton->db_type])
383
int idx= (int) DB_TYPE_FIRST_DYNAMIC;
385
while (idx < (int) DB_TYPE_DEFAULT && installed_htons[idx])
388
if (idx == (int) DB_TYPE_DEFAULT)
390
sql_print_warning(_("Too many storage engines!"));
393
if (hton->db_type != DB_TYPE_UNKNOWN)
394
sql_print_warning(_("Storage engine '%s' has conflicting typecode. "
395
"Assigning value %d."), plugin->plugin->name, idx);
396
hton->db_type= (enum legacy_db_type) idx;
398
installed_htons[hton->db_type]= hton;
399
tmp= hton->savepoint_offset;
400
hton->savepoint_offset= savepoint_alloc_size;
401
savepoint_alloc_size+= tmp;
402
hton->slot= total_ha++;
403
hton2plugin[hton->slot]=plugin;
410
hton->state= SHOW_OPTION_DISABLED;
415
This is entirely for legacy. We will create a new "disk based" hton and a
416
"memory" hton which will be configurable longterm. We should be able to
417
remove partition and myisammrg.
419
switch (hton->db_type) {
439
assert(total_ha < MAX_HA);
441
Check if there is a transaction-capable storage engine besides the
442
binary log (which is considered a transaction-capable storage engine in
445
opt_using_transactions= total_ha>(uint32_t)opt_bin_log;
446
savepoint_alloc_size+= sizeof(SAVEPOINT);
455
This should eventually be based on the graceful shutdown flag.
456
So if the flag is equal to HA_PANIC_CLOSE, then deallocate
459
if (ha_finish_errors())
465
static bool dropdb_handlerton(THD *unused1 __attribute__((unused)),
469
handlerton *hton= plugin_data(plugin, handlerton *);
470
if (hton->state == SHOW_OPTION_YES && hton->drop_database)
471
hton->drop_database(hton, (char *)path);
476
void ha_drop_database(char* path)
478
plugin_foreach(NULL, dropdb_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, path);
482
static bool closecon_handlerton(THD *thd, plugin_ref plugin,
483
void *unused __attribute__((unused)))
485
handlerton *hton= plugin_data(plugin, handlerton *);
487
there's no need to rollback here as all transactions must
488
be rolled back already
490
if (hton->state == SHOW_OPTION_YES && hton->close_connection &&
491
thd_get_ha_data(thd, hton))
492
hton->close_connection(hton, thd);
499
don't bother to rollback here, it's done already
501
void ha_close_connection(THD* thd)
503
plugin_foreach(thd, closecon_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, 0);
506
/* ========================================================================
507
======================= TRANSACTIONS ===================================*/
510
Transaction handling in the server
511
==================================
513
In each client connection, MySQL maintains two transactional
515
- a statement transaction,
516
- a standard, also called normal transaction.
520
"Statement transaction" is a non-standard term that comes
521
from the times when MySQL supported BerkeleyDB storage engine.
523
First of all, it should be said that in BerkeleyDB auto-commit
524
mode auto-commits operations that are atomic to the storage
525
engine itself, such as a write of a record, and are too
526
high-granular to be atomic from the application perspective
527
(MySQL). One SQL statement could involve many BerkeleyDB
528
auto-committed operations and thus BerkeleyDB auto-commit was of
531
Secondly, instead of SQL standard savepoints, BerkeleyDB
532
provided the concept of "nested transactions". In a nutshell,
533
transactions could be arbitrarily nested, but when the parent
534
transaction was committed or aborted, all its child (nested)
535
transactions were committed or aborted as well.
536
Commit of a nested transaction, in turn, made its changes
537
visible, but not durable: it destroyed the nested transaction,
538
all its changes would become available to the parent and
539
currently active nested transactions of this parent.
541
So the mechanism of nested transactions was employed to
542
provide "all or nothing" guarantee of SQL statements
543
required by the standard.
544
A nested transaction would be created at start of each SQL
545
statement, and destroyed (committed or aborted) at statement
546
end. Such nested transaction was internally referred to as
547
a "statement transaction" and gave birth to the term.
549
<Historical note ends>
551
Since then a statement transaction is started for each statement
552
that accesses transactional tables or uses the binary log. If
553
the statement succeeds, the statement transaction is committed.
554
If the statement fails, the transaction is rolled back. Commits
555
of statement transactions are not durable -- each such
556
transaction is nested in the normal transaction, and if the
557
normal transaction is rolled back, the effects of all enclosed
558
statement transactions are undone as well. Technically,
559
a statement transaction can be viewed as a savepoint which is
560
maintained automatically in order to make effects of one
563
The normal transaction is started by the user and is ended
564
usually upon a user request as well. The normal transaction
565
encloses transactions of all statements issued between
566
its beginning and its end.
567
In autocommit mode, the normal transaction is equivalent
568
to the statement transaction.
570
Since MySQL supports PSEA (pluggable storage engine
571
architecture), more than one transactional engine can be
572
active at a time. Hence transactions, from the server
573
point of view, are always distributed. In particular,
574
transactional state is maintained independently for each
575
engine. In order to commit a transaction the two phase
576
commit protocol is employed.
578
Not all statements are executed in context of a transaction.
579
Administrative and status information statements do not modify
580
engine data, and thus do not start a statement transaction and
581
also have no effect on the normal transaction. Examples of such
582
statements are SHOW STATUS and RESET SLAVE.
584
Similarly DDL statements are not transactional,
585
and therefore a transaction is [almost] never started for a DDL
586
statement. The difference between a DDL statement and a purely
587
administrative statement though is that a DDL statement always
588
commits the current transaction before proceeding, if there is
591
At last, SQL statements that work with non-transactional
592
engines also have no effect on the transaction state of the
593
connection. Even though they are written to the binary log,
594
and the binary log is, overall, transactional, the writes
595
are done in "write-through" mode, directly to the binlog
596
file, followed by an OS cache sync, in other words,
597
bypassing the binlog undo log (translog).
598
They do not commit the current normal transaction.
599
A failure of a statement that uses non-transactional tables
600
would cause a rollback of the statement transaction, but
601
in case only non-transactional tables are used,
602
no statement transaction is started.
607
The server stores its transaction-related data in
608
thd->transaction. This structure has two members of type
609
THD_TRANS. These members correspond to the statement and
610
normal transactions respectively:
612
- thd->transaction.stmt contains a list of engines
613
that are participating in the given statement
614
- thd->transaction.all contains a list of engines that
615
have participated in any of the statement transactions started
616
within the context of the normal transaction.
617
Each element of the list contains a pointer to the storage
618
engine, engine-specific transactional data, and engine-specific
621
In autocommit mode thd->transaction.all is empty.
622
Instead, data of thd->transaction.stmt is
623
used to commit/rollback the normal transaction.
625
The list of registered engines has a few important properties:
626
- no engine is registered in the list twice
627
- engines are present in the list in reverse temporal order --
628
new participants are always added to the beginning of the list.
630
Transaction life cycle
631
----------------------
633
When a new connection is established, thd->transaction
634
members are initialized to an empty state.
635
If a statement uses any tables, all affected engines
636
are registered in the statement engine list. In
637
non-autocommit mode, the same engines are registered in
638
the normal transaction list.
639
At the end of the statement, the server issues a commit
640
or a roll back for all engines in the statement list.
641
At this point transaction flags of an engine, if any, are
642
propagated from the statement list to the list of the normal
644
When commit/rollback is finished, the statement list is
645
cleared. It will be filled in again by the next statement,
646
and emptied again at the next statement's end.
648
The normal transaction is committed in a similar way
649
(by going over all engines in thd->transaction.all list)
650
but at different times:
651
- when a COMMIT SQL statement is issued by the user
652
- implicitly, by the server, at the beginning of a DDL statement
653
or SET AUTOCOMMIT={0|1} statement.
655
The normal transaction can be rolled back as well:
656
- if the user has requested so, by issuing ROLLBACK SQL
658
- if one of the storage engines requested a rollback
659
by setting thd->transaction_rollback_request. This may
660
happen in case, e.g., when the transaction in the engine was
661
chosen as a victim of the internal deadlock resolution algorithm
662
and rolled back internally. When such a situation happens, there
663
is little the server can do and the only option is to rollback
664
transactions in all other participating engines. In this case
665
the rollback is accompanied by an error sent to the user.
667
As follows from the use cases above, the normal transaction
668
is never committed when there is an outstanding statement
669
transaction. In most cases there is no conflict, since
670
commits of the normal transaction are issued by a stand-alone
671
administrative or DDL statement, thus no outstanding statement
672
transaction of the previous statement exists. Besides,
673
all statements that manipulate the normal transaction
674
are prohibited in stored functions and triggers, therefore
675
no conflicting situation can occur in a sub-statement either.
676
The remaining rare cases when the server explicitly has
677
to commit the statement transaction prior to committing the normal
678
one cover error-handling scenarios (see for example
681
When committing a statement or a normal transaction, the server
682
either uses the two-phase commit protocol, or issues a commit
683
in each engine independently. The two-phase commit protocol
685
- all participating engines support two-phase commit (provide
686
handlerton::prepare PSEA API call) and
687
- transactions in at least two engines modify data (i.e. are
690
Note that the two phase commit is used for
691
statement transactions, even though they are not durable anyway.
692
This is done to ensure logical consistency of data in a multiple-
694
For example, imagine that some day MySQL supports unique
695
constraint checks deferred till the end of statement. In such
696
case a commit in one of the engines may yield ER_DUP_KEY,
697
and MySQL should be able to gracefully abort statement
698
transactions of other participants.
700
After the normal transaction has been committed,
701
thd->transaction.all list is cleared.
703
When a connection is closed, the current normal transaction, if
706
Roles and responsibilities
707
--------------------------
709
The server has no way to know that an engine participates in
710
the statement and a transaction has been started
711
in it unless the engine says so. Thus, in order to be
712
a part of a transaction, the engine must "register" itself.
713
This is done by invoking trans_register_ha() server call.
714
Normally the engine registers itself whenever handler::external_lock()
715
is called. trans_register_ha() can be invoked many times: if
716
an engine is already registered, the call does nothing.
717
In case autocommit is not set, the engine must register itself
718
twice -- both in the statement list and in the normal transaction
720
In which list to register is a parameter of trans_register_ha().
722
Note that although the registration interface in itself is
723
fairly clear, the current usage practice often leads to undesired
724
effects. E.g. since a call to trans_register_ha() in most engines
725
is embedded into implementation of handler::external_lock(), some
726
DDL statements start a transaction (at least from the server
727
point of view) even though they are not expected to. E.g.
728
CREATE TABLE does not start a transaction, since
729
handler::external_lock() is never called during CREATE TABLE. But
730
CREATE TABLE ... SELECT does, since handler::external_lock() is
731
called for the table that is being selected from. This has no
732
practical effects currently, but must be kept in mind
735
Once an engine is registered, the server will do the rest
738
During statement execution, whenever any of data-modifying
739
PSEA API methods is used, e.g. handler::write_row() or
740
handler::update_row(), the read-write flag is raised in the
741
statement transaction for the involved engine.
742
Currently all PSEA calls are "traced", and the data cannot be
743
changed in a way other than issuing a PSEA call. Important:
744
unless this invariant is preserved the server will not know that
745
a transaction in a given engine is read-write and will not
746
involve the two-phase commit protocol!
748
At the end of a statement, server call
749
ha_autocommit_or_rollback() is invoked. This call in turn
750
invokes handlerton::prepare() for every involved engine.
751
Prepare is followed by a call to handlerton::commit_one_phase()
752
If a one-phase commit will suffice, handlerton::prepare() is not
753
invoked and the server only calls handlerton::commit_one_phase().
754
At statement commit, the statement-related read-write engine
755
flag is propagated to the corresponding flag in the normal
756
transaction. When the commit is complete, the list of registered
759
Rollback is handled in a similar fashion.
761
Additional notes on DDL and the normal transaction.
762
---------------------------------------------------
764
DDLs and operations with non-transactional engines
765
do not "register" in thd->transaction lists, and thus do not
766
modify the transaction state. Besides, each DDL in
767
MySQL is prefixed with an implicit normal transaction commit
768
(a call to end_active_trans()), and thus leaves nothing
770
However, as it has been pointed out with CREATE TABLE .. SELECT,
771
some DDL statements can start a *new* transaction.
773
Behaviour of the server in this case is currently badly
775
DDL statements use a form of "semantic" logging
776
to maintain atomicity: if CREATE TABLE .. SELECT failed,
777
the newly created table is deleted.
778
In addition, some DDL statements issue interim transaction
779
commits: e.g. ALTER TABLE issues a commit after data is copied
780
from the original table to the internal temporary table. Other
781
statements, e.g. CREATE TABLE ... SELECT do not always commit
783
And finally there is a group of DDL statements such as
784
RENAME/DROP Table that doesn't start a new transaction
787
This diversity makes it hard to say what will happen if
788
by chance a stored function is invoked during a DDL --
789
whether any modifications it makes will be committed or not
790
is not clear. Fortunately, the SQL grammar of only a few DDLs allows
791
invocation of a stored function.
793
A consistent behaviour is perhaps to always commit the normal
794
transaction after all DDLs, just like the statement transaction
795
is always committed at the end of all statements.
799
Register a storage engine for a transaction.
801
Every storage engine MUST call this function when it starts
802
a transaction or a statement (that is it must be called both for the
803
"beginning of transaction" and "beginning of statement").
804
Only storage engines registered for the transaction/statement
805
will know when to commit/rollback it.
808
trans_register_ha is idempotent - storage engine may register many
809
times per transaction.
812
void trans_register_ha(THD *thd, bool all, handlerton *ht_arg)
815
Ha_trx_info *ha_info;
819
trans= &thd->transaction.all;
820
thd->server_status|= SERVER_STATUS_IN_TRANS;
823
trans= &thd->transaction.stmt;
825
ha_info= thd->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
827
if (ha_info->is_started())
828
return; /* already registered, return */
830
ha_info->register_ha(trans, ht_arg);
832
trans->no_2pc|=(ht_arg->prepare==0);
833
if (thd->transaction.xid_state.xid.is_null())
834
thd->transaction.xid_state.xid.set(thd->query_id);
843
1 error, transaction was rolled back
845
int ha_prepare(THD *thd)
848
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
849
Ha_trx_info *ha_info= trans->ha_list;
852
for (; ha_info; ha_info= ha_info->next())
855
handlerton *ht= ha_info->ht();
856
status_var_increment(thd->status_var.ha_prepare_count);
859
if ((err= ht->prepare(ht, thd, all)))
861
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
862
ha_rollback_trans(thd, all);
869
push_warning_printf(thd, DRIZZLE_ERROR::WARN_LEVEL_WARN,
870
ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
871
ha_resolve_storage_engine_name(ht));
879
Check if we can skip the two-phase commit.
881
A helper function to evaluate if two-phase commit is mandatory.
882
As a side effect, propagates the read-only/read-write flags
883
of the statement transaction to its enclosing normal transaction.
885
@retval true we must run a two-phase commit. Returned
886
if we have at least two engines with read-write changes.
887
@retval false Don't need two-phase commit. Even if we have two
888
transactional engines, we can run two independent
889
commits if changes in one of the engines are read-only.
894
ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
897
/* The number of storage engines that have actual changes. */
898
unsigned rw_ha_count= 0;
899
Ha_trx_info *ha_info;
901
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
903
if (ha_info->is_trx_read_write())
908
Ha_trx_info *ha_info_all= &thd->ha_data[ha_info->ht()->slot].ha_info[1];
909
assert(ha_info != ha_info_all);
911
Merge read-only/read-write information about statement
912
transaction to its enclosing normal transaction. Do this
913
only if in a real transaction -- that is, if we know
914
that ha_info_all is registered in thd->transaction.all.
915
Since otherwise we only clutter the normal transaction flags.
917
if (ha_info_all->is_started()) /* false if autocommit. */
918
ha_info_all->coalesce_trx_with(ha_info);
920
else if (rw_ha_count > 1)
923
It is a normal transaction, so we don't need to merge read/write
924
information up, and the need for two-phase commit has been
925
already established. Break the loop prematurely.
930
return rw_ha_count > 1;
938
1 transaction was rolled back
940
2 error during commit, data may be inconsistent
943
Since we don't support nested statement transactions in 5.0,
944
we can't commit or rollback stmt transactions while we are inside
945
stored functions or triggers. So we simply do nothing now.
946
TODO: This should be fixed in later ( >= 5.1) releases.
948
int ha_commit_trans(THD *thd, bool all)
950
int error= 0, cookie= 0;
952
'all' means that this is either an explicit commit issued by
953
user, or an implicit commit issued by a DDL.
955
THD_TRANS *trans= all ? &thd->transaction.all : &thd->transaction.stmt;
956
bool is_real_trans= all || thd->transaction.all.ha_list == 0;
957
Ha_trx_info *ha_info= trans->ha_list;
958
my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
961
We must not commit the normal transaction if a statement
962
transaction is pending. Otherwise statement transaction
963
flags will not get propagated to its normal transaction's
966
assert(thd->transaction.stmt.ha_list == NULL ||
967
trans == &thd->transaction.stmt);
969
if (thd->in_sub_stmt)
972
Since we don't support nested statement transactions in 5.0,
973
we can't commit or rollback stmt transactions while we are inside
974
stored functions or triggers. So we simply do nothing now.
975
TODO: This should be fixed in later ( >= 5.1) releases.
980
We assume that all statements which commit or rollback main transaction
981
are prohibited inside of stored functions or triggers. So they should
982
bail out with error even before ha_commit_trans() call. To be 100% safe
983
let us throw error in non-debug builds.
986
my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
993
if (is_real_trans && wait_if_global_read_lock(thd, 0, 0))
995
ha_rollback_trans(thd, all);
1001
&& ! thd->slave_thread
1004
my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
1005
ha_rollback_trans(thd, all);
1010
must_2pc= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
1012
if (!trans->no_2pc && must_2pc)
1014
for (; ha_info && !error; ha_info= ha_info->next())
1017
handlerton *ht= ha_info->ht();
1019
Do not call two-phase commit if this particular
1020
transaction is read-only. This allows for simpler
1021
implementation in engines that are always read-only.
1023
if (! ha_info->is_trx_read_write())
1026
Sic: we know that prepare() is not NULL since otherwise
1027
trans->no_2pc would have been set.
1029
if ((err= ht->prepare(ht, thd, all)))
1031
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1034
status_var_increment(thd->status_var.ha_prepare_count);
1036
if (error || (is_real_trans && xid &&
1037
(error= !(cookie= tc_log->log_xid(thd, xid)))))
1039
ha_rollback_trans(thd, all);
1044
error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
1046
tc_log->unlog(cookie, xid);
1049
start_waiting_global_read_lock(thd);
1056
This function does not care about global read lock. A caller should.
1058
int ha_commit_one_phase(THD *thd, bool all)
1061
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1062
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1063
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1066
for (; ha_info; ha_info= ha_info_next)
1069
handlerton *ht= ha_info->ht();
1070
if ((err= ht->commit(ht, thd, all)))
1072
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1075
status_var_increment(thd->status_var.ha_commit_count);
1076
ha_info_next= ha_info->next();
1077
ha_info->reset(); /* keep it conveniently zero-filled */
1082
thd->transaction.xid_state.xid.null();
1085
thd->variables.tx_isolation=thd->session_tx_isolation;
1086
thd->transaction.cleanup();
1093
int ha_rollback_trans(THD *thd, bool all)
1096
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1097
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1098
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1101
We must not rollback the normal transaction if a statement
1102
transaction is pending.
1104
assert(thd->transaction.stmt.ha_list == NULL ||
1105
trans == &thd->transaction.stmt);
1107
if (thd->in_sub_stmt)
1110
If we are inside stored function or trigger we should not commit or
1111
rollback current statement transaction. See comment in ha_commit_trans()
1112
call for more information.
1117
my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
1122
for (; ha_info; ha_info= ha_info_next)
1125
handlerton *ht= ha_info->ht();
1126
if ((err= ht->rollback(ht, thd, all)))
1128
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1131
status_var_increment(thd->status_var.ha_rollback_count);
1132
ha_info_next= ha_info->next();
1133
ha_info->reset(); /* keep it conveniently zero-filled */
1138
thd->transaction.xid_state.xid.null();
1141
thd->variables.tx_isolation=thd->session_tx_isolation;
1142
thd->transaction.cleanup();
1146
thd->transaction_rollback_request= false;
1149
If a non-transactional table was updated, warn; don't warn if this is a
1150
slave thread (because when a slave thread executes a ROLLBACK, it has
1151
been read from the binary log, so it's 100% sure and normal to produce
1152
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
1153
slave SQL thread, it would not stop the thread but just be printed in
1154
the error log; but we don't want users to wonder why they have this
1155
message in the error log, so we don't send it.
1157
if (is_real_trans && thd->transaction.all.modified_non_trans_table &&
1158
!thd->slave_thread && thd->killed != THD::KILL_CONNECTION)
1159
push_warning(thd, DRIZZLE_ERROR::WARN_LEVEL_WARN,
1160
ER_WARNING_NOT_COMPLETE_ROLLBACK,
1161
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
1166
This is used to commit or rollback a single statement depending on
1170
Note that if the autocommit is on, then the following call inside
1171
InnoDB will commit or rollback the whole transaction (= the statement). The
1172
autocommit mechanism built into InnoDB is based on counting locks, but if
1173
the user has used LOCK TABLES then that mechanism does not know to do the
1176
int ha_autocommit_or_rollback(THD *thd, int error)
1178
if (thd->transaction.stmt.ha_list)
1182
if (ha_commit_trans(thd, 0))
1187
(void) ha_rollback_trans(thd, 0);
1188
if (thd->transaction_rollback_request && !thd->in_sub_stmt)
1189
(void) ha_rollback(thd);
1192
thd->variables.tx_isolation=thd->session_tx_isolation;
1203
static bool xacommit_handlerton(THD *unused1 __attribute__((unused)),
1207
handlerton *hton= plugin_data(plugin, handlerton *);
1208
if (hton->state == SHOW_OPTION_YES && hton->recover)
1210
hton->commit_by_xid(hton, ((struct xahton_st *)arg)->xid);
1211
((struct xahton_st *)arg)->result= 0;
1216
static bool xarollback_handlerton(THD *unused1 __attribute__((unused)),
1220
handlerton *hton= plugin_data(plugin, handlerton *);
1221
if (hton->state == SHOW_OPTION_YES && hton->recover)
1223
hton->rollback_by_xid(hton, ((struct xahton_st *)arg)->xid);
1224
((struct xahton_st *)arg)->result= 0;
1230
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
1232
struct xahton_st xaop;
1236
plugin_foreach(NULL, commit ? xacommit_handlerton : xarollback_handlerton,
1237
DRIZZLE_STORAGE_ENGINE_PLUGIN, &xaop);
1243
recover() step of xa.
1246
there are three modes of operation:
1247
- automatic recover after a crash
1248
in this case commit_list != 0, tc_heuristic_recover==0
1249
all xids from commit_list are committed, others are rolled back
1250
- manual (heuristic) recover
1251
in this case commit_list==0, tc_heuristic_recover != 0
1252
DBA has explicitly specified that all prepared transactions should
1253
be committed (or rolled back).
1254
- no recovery (MySQL did not detect a crash)
1255
in this case commit_list==0, tc_heuristic_recover == 0
1256
there should be no prepared transactions in this case.
1260
int len, found_foreign_xids, found_my_xids;
1266
static bool xarecover_handlerton(THD *unused __attribute__((unused)),
1270
handlerton *hton= plugin_data(plugin, handlerton *);
1271
struct xarecover_st *info= (struct xarecover_st *) arg;
1274
if (hton->state == SHOW_OPTION_YES && hton->recover)
1276
while ((got= hton->recover(hton, info->list, info->len)) > 0 )
1278
sql_print_information(_("Found %d prepared transaction(s) in %s"),
1279
got, ha_resolve_storage_engine_name(hton));
1280
for (int i=0; i < got; i ++)
1282
my_xid x=info->list[i].get_my_xid();
1283
if (!x) // not "mine" - that is generated by external TM
1285
xid_cache_insert(info->list+i, XA_PREPARED);
1286
info->found_foreign_xids++;
1291
info->found_my_xids++;
1295
if (info->commit_list ?
1296
hash_search(info->commit_list, (uchar *)&x, sizeof(x)) != 0 :
1297
tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT)
1299
hton->commit_by_xid(hton, info->list+i);
1303
hton->rollback_by_xid(hton, info->list+i);
1306
if (got < info->len)
1313
int ha_recover(HASH *commit_list)
1315
struct xarecover_st info;
1316
info.found_foreign_xids= info.found_my_xids= 0;
1317
info.commit_list= commit_list;
1318
info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
1321
/* commit_list and tc_heuristic_recover cannot be set both */
1322
assert(info.commit_list==0 || tc_heuristic_recover==0);
1323
/* if either is set, total_ha_2pc must be set too */
1324
assert(info.dry_run || total_ha_2pc>(uint32_t)opt_bin_log);
1326
if (total_ha_2pc <= (uint32_t)opt_bin_log)
1329
if (info.commit_list)
1330
sql_print_information(_("Starting crash recovery..."));
1333
#ifndef WILL_BE_DELETED_LATER
1336
for now, only InnoDB supports 2pc. It means we can always safely
1337
rollback all pending transactions, without risking inconsistent data
1340
assert(total_ha_2pc == (uint32_t) opt_bin_log+1); // only InnoDB and binlog
1341
tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
1346
for (info.len= MAX_XID_LIST_SIZE ;
1347
info.list==0 && info.len > MIN_XID_LIST_SIZE; info.len/=2)
1349
info.list=(XID *)my_malloc(info.len*sizeof(XID), MYF(0));
1353
sql_print_error(ER(ER_OUTOFMEMORY), info.len*sizeof(XID));
1357
plugin_foreach(NULL, xarecover_handlerton,
1358
DRIZZLE_STORAGE_ENGINE_PLUGIN, &info);
1360
my_free((uchar*)info.list, MYF(0));
1361
if (info.found_foreign_xids)
1362
sql_print_warning(_("Found %d prepared XA transactions"),
1363
info.found_foreign_xids);
1364
if (info.dry_run && info.found_my_xids)
1366
sql_print_error(_("Found %d prepared transactions! It means that drizzled "
1367
"was not shut down properly last time and critical "
1368
"recovery information (last binlog or %s file) was "
1369
"manually deleted after a crash. You have to start "
1370
"drizzled with the --tc-heuristic-recover switch to "
1371
"commit or rollback pending transactions."),
1372
info.found_my_xids, opt_tc_log_file);
1375
if (info.commit_list)
1376
sql_print_information(_("Crash recovery finished."));
1381
return the list of XID's to a client, the same way SHOW commands do.
1384
I didn't find in XA specs that an RM cannot return the same XID twice,
1385
so mysql_xa_recover does not filter XID's to ensure uniqueness.
1386
It can be easily fixed later, if necessary.
1388
bool mysql_xa_recover(THD *thd)
1390
List<Item> field_list;
1391
Protocol *protocol= thd->protocol;
1395
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1396
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1397
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1398
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1400
if (protocol->send_fields(&field_list,
1401
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1404
pthread_mutex_lock(&LOCK_xid_cache);
1405
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1407
if (xs->xa_state==XA_PREPARED)
1409
protocol->prepare_for_resend();
1410
protocol->store_int64_t((int64_t)xs->xid.formatID, false);
1411
protocol->store_int64_t((int64_t)xs->xid.gtrid_length, false);
1412
protocol->store_int64_t((int64_t)xs->xid.bqual_length, false);
1413
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1415
if (protocol->write())
1417
pthread_mutex_unlock(&LOCK_xid_cache);
1423
pthread_mutex_unlock(&LOCK_xid_cache);
1430
This function should be called when MySQL sends rows of a SELECT result set
1431
or the EOF mark to the client. It releases a possible adaptive hash index
1432
S-latch held by thd in InnoDB and also releases a possible InnoDB query
1433
FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a thd to
1434
keep them over several calls of the InnoDB handler interface when a join
1435
is executed. But when we let control pass to the client they have
1436
to be released because if the application program uses mysql_use_result(),
1437
it may deadlock on the S-latch if the application on another connection
1438
performs another SQL query. In MySQL-4.1 this is even more important because
1439
there a connection can have several SELECT queries open at the same time.
1441
@param thd the thread handle of the current connection
1446
static bool release_temporary_latches(THD *thd, plugin_ref plugin,
1447
void *unused __attribute__((unused)))
1449
handlerton *hton= plugin_data(plugin, handlerton *);
1451
if (hton->state == SHOW_OPTION_YES && hton->release_temporary_latches)
1452
hton->release_temporary_latches(hton, thd);
1458
int ha_release_temporary_latches(THD *thd)
1460
plugin_foreach(thd, release_temporary_latches, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1466
int ha_rollback_to_savepoint(THD *thd, SAVEPOINT *sv)
1469
THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
1470
&thd->transaction.all);
1471
Ha_trx_info *ha_info, *ha_info_next;
1475
rolling back to savepoint in all storage engines that were part of the
1476
transaction when the savepoint was set
1478
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
1481
handlerton *ht= ha_info->ht();
1483
assert(ht->savepoint_set != 0);
1484
if ((err= ht->savepoint_rollback(ht, thd,
1485
(uchar *)(sv+1)+ht->savepoint_offset)))
1487
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1490
status_var_increment(thd->status_var.ha_savepoint_rollback_count);
1491
trans->no_2pc|= ht->prepare == 0;
1494
rolling back the transaction in all storage engines that were not part of
1495
the transaction when the savepoint was set
1497
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
1498
ha_info= ha_info_next)
1501
handlerton *ht= ha_info->ht();
1502
if ((err= ht->rollback(ht, thd, !thd->in_sub_stmt)))
1504
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1507
status_var_increment(thd->status_var.ha_rollback_count);
1508
ha_info_next= ha_info->next();
1509
ha_info->reset(); /* keep it conveniently zero-filled */
1511
trans->ha_list= sv->ha_list;
1517
according to the sql standard (ISO/IEC 9075-2:2003)
1518
section "4.33.4 SQL-statements and transaction states",
1519
SAVEPOINT is *not* transaction-initiating SQL-statement
1521
int ha_savepoint(THD *thd, SAVEPOINT *sv)
1524
THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
1525
&thd->transaction.all);
1526
Ha_trx_info *ha_info= trans->ha_list;
1527
for (; ha_info; ha_info= ha_info->next())
1530
handlerton *ht= ha_info->ht();
1532
if (! ht->savepoint_set)
1534
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1538
if ((err= ht->savepoint_set(ht, thd, (uchar *)(sv+1)+ht->savepoint_offset)))
1540
my_error(ER_GET_ERRNO, MYF(0), err);
1543
status_var_increment(thd->status_var.ha_savepoint_count);
1546
Remember the list of registered storage engines. All new
1547
engines are prepended to the beginning of the list.
1549
sv->ha_list= trans->ha_list;
1553
int ha_release_savepoint(THD *thd, SAVEPOINT *sv)
1556
Ha_trx_info *ha_info= sv->ha_list;
1558
for (; ha_info; ha_info= ha_info->next())
1561
handlerton *ht= ha_info->ht();
1562
/* Savepoint life time is enclosed into transaction life time. */
1564
if (!ht->savepoint_release)
1566
if ((err= ht->savepoint_release(ht, thd,
1567
(uchar *)(sv+1) + ht->savepoint_offset)))
1569
my_error(ER_GET_ERRNO, MYF(0), err);
1577
static bool snapshot_handlerton(THD *thd, plugin_ref plugin, void *arg)
1579
handlerton *hton= plugin_data(plugin, handlerton *);
1580
if (hton->state == SHOW_OPTION_YES &&
1581
hton->start_consistent_snapshot)
1583
hton->start_consistent_snapshot(hton, thd);
1584
*((bool *)arg)= false;
1589
int ha_start_consistent_snapshot(THD *thd)
1593
plugin_foreach(thd, snapshot_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, &warn);
1596
Same idea as when one wants to CREATE TABLE in one engine which does not
1600
push_warning(thd, DRIZZLE_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
1601
"This MySQL server does not support any "
1602
"consistent-read capable storage engine");
1607
static bool flush_handlerton(THD *thd __attribute__((unused)),
1609
void *arg __attribute__((unused)))
1611
handlerton *hton= plugin_data(plugin, handlerton *);
1612
if (hton->state == SHOW_OPTION_YES && hton->flush_logs &&
1613
hton->flush_logs(hton))
1619
bool ha_flush_logs(handlerton *db_type)
1621
if (db_type == NULL)
1623
if (plugin_foreach(NULL, flush_handlerton,
1624
DRIZZLE_STORAGE_ENGINE_PLUGIN, 0))
1629
if (db_type->state != SHOW_OPTION_YES ||
1630
(db_type->flush_logs && db_type->flush_logs(db_type)))
1636
static const char *check_lowercase_names(handler *file, const char *path,
1639
if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1642
/* Ensure that the table handler gets the path in lower case */
1643
if (tmp_path != path)
1644
stpcpy(tmp_path, path);
1647
we only should turn into lowercase database/table part
1648
so start the process after homedirectory
1650
my_casedn_str(files_charset_info, tmp_path + mysql_data_home_len);
1656
An interceptor to hijack the text of the error message without
1657
setting an error in the thread. We need the text to present it
1658
in the form of a warning to the user.
1661
struct Ha_delete_table_error_handler: public Internal_error_handler
1664
virtual bool handle_error(uint sql_errno,
1665
const char *message,
1666
DRIZZLE_ERROR::enum_warning_level level,
1668
char buff[DRIZZLE_ERRMSG_SIZE];
1673
Ha_delete_table_error_handler::
1674
handle_error(uint sql_errno __attribute__((unused)),
1675
const char *message,
1676
DRIZZLE_ERROR::enum_warning_level level __attribute__((unused)),
1677
THD *thd __attribute__((unused)))
1679
/* Grab the error message */
1680
strmake(buff, message, sizeof(buff)-1);
1686
This should return ENOENT if the file doesn't exist.
1687
The .frm file will be deleted only if we return 0 or ENOENT
1689
int ha_delete_table(THD *thd, handlerton *table_type, const char *path,
1690
const char *db, const char *alias, bool generate_warning)
1693
char tmp_path[FN_REFLEN];
1696
TABLE_SHARE dummy_share;
1698
memset(&dummy_table, 0, sizeof(dummy_table));
1699
memset(&dummy_share, 0, sizeof(dummy_share));
1700
dummy_table.s= &dummy_share;
1702
/* DB_TYPE_UNKNOWN is used in ALTER Table when renaming only .frm files */
1703
if (table_type == NULL ||
1704
! (file=get_new_handler((TABLE_SHARE*)0, thd->mem_root, table_type)))
1707
path= check_lowercase_names(file, path, tmp_path);
1708
if ((error= file->ha_delete_table(path)) && generate_warning)
1711
Because file->print_error() uses my_error() to generate the error message
1712
we use an internal error handler to intercept it and store the text
1713
in a temporary buffer. Later the message will be presented to user
1716
Ha_delete_table_error_handler ha_delete_table_error_handler;
1718
/* Fill up structures that print_error may need */
1719
dummy_share.path.str= (char*) path;
1720
dummy_share.path.length= strlen(path);
1721
dummy_share.db.str= (char*) db;
1722
dummy_share.db.length= strlen(db);
1723
dummy_share.table_name.str= (char*) alias;
1724
dummy_share.table_name.length= strlen(alias);
1725
dummy_table.alias= alias;
1727
file->change_table_ptr(&dummy_table, &dummy_share);
1729
thd->push_internal_handler(&ha_delete_table_error_handler);
1730
file->print_error(error, 0);
1732
thd->pop_internal_handler();
1735
XXX: should we convert *all* errors to warnings here?
1736
What if the error is fatal?
1738
push_warning(thd, DRIZZLE_ERROR::WARN_LEVEL_ERROR, error,
1739
ha_delete_table_error_handler.buff);
54
1745
/****************************************************************************
55
** General Cursor functions
1746
** General handler functions
56
1747
****************************************************************************/
57
Cursor::Cursor(plugin::StorageEngine &engine_arg,
61
estimation_rows_to_insert(0),
63
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
64
ref_length(sizeof(internal::my_off_t)),
67
next_insert_id(0), insert_id_for_cur_row(0)
72
assert(locked == false);
73
/* TODO: assert(inited == NONE); */
78
* @note this is only used in
79
* optimizer::QuickRangeSelect::init_ror_merged_scan(bool reuse_handler) as
80
* of the writing of this comment. -Brian
82
Cursor *Cursor::clone(memory::Root *mem_root)
84
Cursor *new_handler= getTable()->getMutableShare()->db_type()->getCursor(*getTable());
1748
handler *handler::clone(MEM_ROOT *mem_root)
1750
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
87
Allocate Cursor->ref here because otherwise ha_open will allocate it
88
on this->table->mem_root and we will not be able to reclaim that memory
89
when the clone Cursor object is destroyed.
1752
Allocate handler->ref here because otherwise ha_open will allocate it
1753
on this->table->mem_root and we will not be able to reclaim that memory
1754
when the clone handler object is destroyed.
91
if (!(new_handler->ref= (unsigned char*) mem_root->alloc_root(ALIGN_SIZE(ref_length)*2)))
1756
if (!(new_handler->ref= (uchar*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
94
identifier::Table identifier(getTable()->getShare()->getSchemaName(),
95
getTable()->getShare()->getTableName(),
96
getTable()->getShare()->getType());
98
if (new_handler && !new_handler->ha_open(identifier,
99
getTable()->getDBStat(),
1758
if (new_handler && !new_handler->ha_open(table,
1759
table->s->normalized_path.str,
100
1761
HA_OPEN_IGNORE_IF_LOCKED))
101
1762
return new_handler;
107
given a buffer with a key value, and a map of keyparts
108
that are present in this value, returns the length of the value
110
uint32_t Cursor::calculate_key_len(uint32_t key_position, key_part_map keypart_map_arg)
112
/* works only with key prefixes */
113
assert(((keypart_map_arg + 1) & keypart_map_arg) == 0);
115
const KeyPartInfo *key_part_found= getTable()->getShare()->getKeyInfo(key_position).key_part;
116
const KeyPartInfo *end_key_part_found= key_part_found + getTable()->getShare()->getKeyInfo(key_position).key_parts;
119
while (key_part_found < end_key_part_found && keypart_map_arg)
121
length+= key_part_found->store_length;
122
keypart_map_arg >>= 1;
128
int Cursor::startIndexScan(uint32_t idx, bool sorted)
131
assert(inited == NONE);
132
if (!(result= doStartIndexScan(idx, sorted)))
138
int Cursor::endIndexScan()
140
assert(inited==INDEX);
143
return(doEndIndexScan());
146
int Cursor::startTableScan(bool scan)
149
assert(inited==NONE || (inited==RND && scan));
150
inited= (result= doStartTableScan(scan)) ? NONE: RND;
155
int Cursor::endTableScan()
159
return(doEndTableScan());
162
int Cursor::ha_index_or_rnd_end()
164
return inited == INDEX ? endIndexScan() : inited == RND ? endTableScan() : 0;
167
void Cursor::ha_start_bulk_insert(ha_rows rows)
169
estimation_rows_to_insert= rows;
170
start_bulk_insert(rows);
173
int Cursor::ha_end_bulk_insert()
175
estimation_rows_to_insert= 0;
176
return end_bulk_insert();
179
const key_map *Cursor::keys_to_use_for_scanning()
181
return &key_map_empty;
184
bool Cursor::has_transactions()
186
return (getTable()->getShare()->db_type()->check_flag(HTON_BIT_DOES_TRANSACTIONS));
189
void Cursor::ha_statistic_increment(uint64_t system_status_var::*offset) const
191
(getTable()->in_use->status_var.*offset)++;
194
void **Cursor::ha_data(Session *session) const
196
return session->getEngineData(getEngine());
199
bool Cursor::is_fatal_error(int error, uint32_t flags)
202
((flags & HA_CHECK_DUP_KEY) &&
203
(error == HA_ERR_FOUND_DUPP_KEY ||
204
error == HA_ERR_FOUND_DUPP_UNIQUE)))
210
/** Return the record count currently held in this cursor's stats. */
ha_rows Cursor::records()
{
  return stats.records;
}
211
/** Return the sum of the index file length and data file length held in stats. */
uint64_t Cursor::tableSize()
{
  const uint64_t total_length= stats.index_file_length + stats.data_file_length;
  return total_length;
}
212
/** Return the table's record length plus the value of its sizeFields(). */
uint64_t Cursor::rowSize()
{
  const uint64_t row_size= getTable()->getRecordLength() + getTable()->sizeFields();
  return row_size;
}
214
int Cursor::doOpen(const identifier::Table &identifier, int mode, uint32_t test_if_locked)
216
return open(identifier.getPath().c_str(), mode, test_if_locked);
1768
void handler::ha_statistic_increment(ulong SSV::*offset) const
1770
status_var_increment(table->in_use->status_var.*offset);
1773
void **handler::ha_data(THD *thd) const
1775
return thd_ha_data(thd, ht);
1778
THD *handler::ha_thd(void) const
1780
assert(!table || !table->in_use || table->in_use == current_thd);
1781
return (table && table->in_use) ? table->in_use : current_thd;
220
Open database-Cursor.
1785
Open database-handler.
222
1787
Try O_RDONLY if cannot open as O_RDWR
223
1788
Don't wait for locks if not HA_OPEN_WAIT_IF_LOCKED is set
225
int Cursor::ha_open(const identifier::Table &identifier,
1790
int handler::ha_open(Table *table_arg, const char *name, int mode,
231
if ((error= doOpen(identifier, mode, test_if_locked)))
1796
assert(table->s == table_share);
1797
assert(alloc_root_inited(&table->mem_root));
1799
if ((error=open(name,mode,test_if_locked)))
233
1801
if ((error == EACCES || error == EROFS) && mode == O_RDWR &&
234
(getTable()->db_stat & HA_TRY_READ_ONLY))
1802
(table->db_stat & HA_TRY_READ_ONLY))
236
getTable()->db_stat|=HA_READ_ONLY;
237
error= doOpen(identifier, O_RDONLY,test_if_locked);
1804
table->db_stat|=HA_READ_ONLY;
1805
error=open(name,O_RDONLY,test_if_locked);
242
errno= error; /* Safeguard */
1810
my_errno= error; /* Safeguard */
246
if (getTable()->getShare()->db_options_in_use & HA_OPTION_READ_ONLY_DATA)
247
getTable()->db_stat|=HA_READ_ONLY;
1814
if (table->s->db_options_in_use & HA_OPTION_READ_ONLY_DATA)
1815
table->db_stat|=HA_READ_ONLY;
248
1816
(void) extra(HA_EXTRA_NO_READCHECK); // Not needed in SQL
250
/* ref is already allocated for us if we're called from Cursor::clone() */
251
if (!ref && !(ref= (unsigned char*) getTable()->alloc_root(ALIGN_SIZE(ref_length)*2)))
1818
/* ref is already allocated for us if we're called from handler::clone() */
1819
if (!ref && !(ref= (uchar*) alloc_root(&table->mem_root,
1820
ALIGN_SIZE(ref_length)*2)))
254
1823
error=HA_ERR_OUT_OF_MEM;
257
1826
dup_ref=ref+ALIGN_SIZE(ref_length);
1827
cached_table_flags= table_flags();
1833
one has to use this method to find a
1834
random position by record as the plain
1835
position() call doesn't work for some
1836
handlers for random position
1839
int handler::rnd_pos_by_record(uchar *record)
1844
if (inited && (error= ha_index_end()))
1846
if ((error= ha_rnd_init(false)))
1849
return(rnd_pos(record, ref));
1149
3676
while ((result == HA_ERR_END_OF_FILE) && !range_res);
1151
3678
*range_info= mrr_cur_range.ptr;
3683
/* **************************************************************************
3684
* DS-MRR implementation
3685
***************************************************************************/
3688
DS-MRR: Initialize and start MRR scan
3690
Initialize and start the MRR scan. Depending on the mode parameter, this
3691
may use default or DS-MRR implementation.
3693
@param h Table handler to be used
3694
@param key Index to be used
3695
@param seq_funcs Interval sequence enumeration functions
3696
@param seq_init_param Interval sequence enumeration parameter
3697
@param n_ranges Number of ranges in the sequence.
3698
@param mode HA_MRR_* modes to use
3699
@param buf INOUT Buffer to use
3701
@retval 0 Ok, Scan started.
3705
int DsMrr_impl::dsmrr_init(handler *h, KEY *key,
3706
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3707
uint n_ranges, uint mode, HANDLER_BUFFER *buf)
3711
Item *pushed_cond= NULL;
3713
keyno= h->active_index;
3715
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3717
use_default_impl= true;
3718
return(h->handler::multi_range_read_init(seq_funcs, seq_init_param,
3719
n_ranges, mode, buf));
3721
rowids_buf= buf->buffer;
3722
//psergey-todo: don't add key_length as it is not needed anymore
3723
rowids_buf += key->key_length + h->ref_length;
3725
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3726
rowids_buf_end= buf->buffer_end;
3728
elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3729
rowids_buf_last= rowids_buf +
3730
((rowids_buf_end - rowids_buf)/ elem_size)*
3732
rowids_buf_end= rowids_buf_last;
3734
/* Create a separate handler object to do rndpos() calls. */
3735
THD *thd= current_thd;
3736
if (!(new_h2= h->clone(thd->mem_root)) ||
3737
new_h2->ha_external_lock(thd, F_RDLCK))
3743
if (keyno == h->pushed_idx_cond_keyno)
3744
pushed_cond= h->pushed_idx_cond;
3745
if (h->ha_index_end())
3752
table->prepare_for_position();
3753
new_h2->extra(HA_EXTRA_KEYREAD);
3755
if (h2->ha_index_init(keyno, false) ||
3756
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3759
use_default_impl= false;
3762
h2->idx_cond_push(keyno, pushed_cond);
3763
if (dsmrr_fill_buffer(new_h2))
3767
If the above call has scanned through all intervals in *seq, then
3768
adjust *buf to indicate that the remaining buffer space will not be used.
3771
buf->end_of_used_area= rowids_buf_last;
3773
if (h->ha_rnd_init(false))
3778
h2->ha_index_or_rnd_end();
3779
h2->ha_external_lock(thd, F_UNLCK);
3786
void DsMrr_impl::dsmrr_close()
3790
h2->ha_external_lock(current_thd, F_UNLCK);
3795
use_default_impl= true;
3800
static int rowid_cmp(void *h, uchar *a, uchar *b)
3802
return ((handler*)h)->cmp_ref(a, b);
3807
DS-MRR: Fill the buffer with rowids and sort it by rowid
3809
{This is an internal function of DiskSweep MRR implementation}
3810
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
3811
buffer. When the buffer is full or scan is completed, sort the buffer by
3814
The function assumes that rowids buffer is empty when it is invoked.
3816
@param h Table handler
3818
@retval 0 OK, the next portion of rowids is in the buffer,
3823
int DsMrr_impl::dsmrr_fill_buffer(handler *unused __attribute__((unused)))
3828
rowids_buf_cur= rowids_buf;
3829
while ((rowids_buf_cur < rowids_buf_end) &&
3830
!(res= h2->handler::multi_range_read_next(&range_info)))
3832
/* Put rowid, or {rowid, range_id} pair into the buffer */
3833
h2->position(table->record[0]);
3834
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
3835
rowids_buf_cur += h->ref_length;
3839
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
3840
rowids_buf_cur += sizeof(void*);
3844
if (res && res != HA_ERR_END_OF_FILE)
3846
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
3848
/* Sort the buffer contents by rowid */
3849
uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3850
uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
3852
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
3854
rowids_buf_last= rowids_buf_cur;
3855
rowids_buf_cur= rowids_buf;
3861
DS-MRR implementation: multi_range_read_next() function
3864
int DsMrr_impl::dsmrr_next(handler *h, char **range_info)
3868
if (use_default_impl)
3869
return h->handler::multi_range_read_next(range_info);
3871
if (rowids_buf_cur == rowids_buf_last)
3875
res= HA_ERR_END_OF_FILE;
3878
res= dsmrr_fill_buffer(h);
3883
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
3884
if (rowids_buf_cur == rowids_buf_last)
3886
res= HA_ERR_END_OF_FILE;
3890
res= h->rnd_pos(table->record[0], rowids_buf_cur);
3891
rowids_buf_cur += h->ref_length;
3894
memcpy(range_info, rowids_buf_cur, sizeof(void*));
3895
rowids_buf_cur += sizeof(void*);
3906
DS-MRR implementation: multi_range_read_info() function
3908
int DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, uint *bufsz,
3909
uint *flags, COST_VECT *cost)
3912
uint def_flags= *flags;
3913
uint def_bufsz= *bufsz;
3915
/* Get cost/flags/mem_usage of default MRR implementation */
3916
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
3920
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3921
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
3923
/* Default implementation is chosen */
3932
DS-MRR Implementation: multi_range_read_info_const() function
3935
ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
3936
void *seq_init_param, uint n_ranges,
3937
uint *bufsz, uint *flags, COST_VECT *cost)
3940
uint def_flags= *flags;
3941
uint def_bufsz= *bufsz;
3942
/* Get cost/flags/mem_usage of default MRR implementation */
3943
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
3944
n_ranges, &def_bufsz,
3946
if (rows == HA_POS_ERROR)
3948
/* Default implementation can't perform MRR scan => we can't either */
3953
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
3954
use the default MRR implementation (we need it for UPDATE/DELETE).
3955
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
3957
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3958
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
3965
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
3972
Check if key has partially-covered columns
3974
We can't use DS-MRR to perform range scans when the ranges are over
3975
partially-covered keys, because we'll not have full key part values
3976
(we'll have their prefixes from the index) and will not be able to check
3977
if we've reached the end of the range.
3979
@param keyno Key to check
3982
Allow use of DS-MRR in cases where the index has partially-covered
3983
components but they are not used for scanning.
3989
/*
  Walk every key part of index keyno and test whether the field behind that
  key part has keyno set in its part_of_key bitmap.

  NOTE(review): the return statements are not visible in this excerpt (the
  embedded listing numbers skip after 3995); judging by the function name, a
  key part whose field lacks the bit presumably makes the result true -
  confirm against the full source.
*/
bool DsMrr_impl::key_uses_partial_cols(uint keyno)
3991
/* [kp, kp_end) spans the key_parts entries of the index */
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
3992
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
3993
for (; kp != kp_end; kp++)
3995
if (!kp->field->part_of_key.is_set(keyno))
4003
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
4005
Make the choice between using Default MRR implementation and DS-MRR.
4006
This function contains common functionality factored out of dsmrr_info()
4007
and dsmrr_info_const(). The function assumes that the default MRR
4008
implementation's applicability requirements are satisfied.
4010
@param keyno Index number
4011
@param rows E(full rows to be retrieved)
4012
@param flags IN MRR flags provided by the MRR user
4013
OUT If DS-MRR is chosen, flags of DS-MRR implementation
4014
else the value is not modified
4015
@param bufsz INOUT If DS-MRR is chosen, buffer use of DS-MRR implementation
4016
else the value is not modified
4017
@param cost IN Cost of default MRR implementation
4018
OUT If DS-MRR is chosen, cost of DS-MRR scan
4019
else the value is not modified
4021
@retval true Default MRR implementation should be used
4022
@retval false DS-MRR implementation should be used
4025
/*
  Decide between the default MRR implementation and DS-MRR for a scan on
  index keyno. The visible logic first rules DS-MRR out for a set of scan
  properties, then asks get_disk_sweep_mrr_cost() for the DS-MRR cost and
  compares it with the default-implementation cost passed in through cost.

  NOTE(review): the embedded listing numbers are not contiguous (for example
  4038 -> 4042, 4044 -> 4050 and 4062 -> 4068), so the declaration of
  force_dsmrr, the return statements and several branch bodies are not
  visible in this excerpt.
*/
bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
4026
uint *bufsz, COST_VECT *cost)
4028
COST_VECT dsmrr_cost;
4030
THD *thd= current_thd;
/*
  DS-MRR is not considered when optimizer_use_mrr is 2, when the caller asks
  for an index-only or sorted scan, when keyno is a clustered primary key,
  or when the key has partially-covered columns.
*/
4031
if ((thd->variables.optimizer_use_mrr == 2) ||
4032
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
4033
(keyno == table->s->primary_key &&
4034
h->primary_key_is_clustered()) ||
4035
key_uses_partial_cols(keyno))
4037
/* Use the default implementation */
4038
*flags |= HA_MRR_USE_DEFAULT_IMPL;
4042
/* Key length plus h->ref_length; the use of add_len is not visible in this excerpt */
uint add_len= table->key_info[keyno].key_length + h->ref_length;
4044
/* A true result means DS-MRR cannot be used with the given buffer size */
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
4050
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
4051
DS-MRR and Default implementations cost. This allows one to force use of
4052
DS-MRR whenever it is applicable without affecting other cost-based
4055
/* force_dsmrr is set when optimizer_use_mrr is 1 */
if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) &&
4056
dsmrr_cost.total_cost() > cost->total_cost())
4059
/* DS-MRR wins when forced or when it is not more expensive than the default */
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
4061
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
4062
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
4068
/* Use the default MRR implementation */
4075
/*
  Forward declaration: get_disk_sweep_mrr_cost() calls this helper before its
  definition appears further down in the file.
*/
static void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost);
4079
Get cost of DS-MRR scan
4081
@param keynr Index to be used
4082
@param rows E(Number of rows to be scanned)
4083
@param flags Scan parameters (HA_MRR_* flags)
4084
@param buffer_size INOUT Buffer size
4085
@param cost OUT The cost
4088
@retval true Error, DS-MRR cannot be used (the buffer is too small for even 1 rowid)
4092
/*
  Estimate the cost of a DS-MRR scan of `rows` rows through index keynr with
  a rowid buffer of *buffer_size bytes. The scan is modelled as a number of
  sort-and-sweep steps with a full buffer plus one final step with a
  partially filled buffer, and the cost of the index reads is added on top.
  Returns true when the buffer cannot hold even a single element.

  NOTE(review): the embedded listing numbers are not contiguous (for example
  4117 -> 4120, 4121 -> 4126 and 4136 -> 4138), so the declaration of
  n_full_steps, the if/else structure around the cost and buffer-size
  statements, and the final return are not visible in this excerpt.

  NOTE(review): rows_in_full_step is assigned, but in the visible lines the
  full-step cost is computed with get_sort_and_sweep_cost(table, rows, cost)
  and then multiplied by n_full_steps. Confirm whether rows_in_full_step was
  the intended row count for that call.
*/
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
4093
uint *buffer_size, COST_VECT *cost)
4095
uint32_t max_buff_entries, elem_size;
4096
ha_rows rows_in_full_step, rows_in_last_step;
4098
double index_read_cost;
4100
/*
  One buffer element is h->ref_length bytes plus a pointer-sized slot; the
  slot is omitted when HA_MRR_NO_ASSOCIATION is set (assumes test() yields
  0 or 1 - TODO confirm).
*/
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
4101
max_buff_entries = *buffer_size / elem_size;
4103
if (!max_buff_entries)
4104
return true; /* Buffer has not enough space for even 1 rowid */
4106
/* Number of iterations we'll make with full buffer */
4107
n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
4110
Get numbers of rows we'll be processing in
4111
- non-last sweep, with full buffer
4112
- last iteration, with non-full buffer
4114
rows_in_full_step= max_buff_entries;
4115
rows_in_last_step= rows % max_buff_entries;
4117
/* Adjust buffer size if we expect to use only part of the buffer */
4120
get_sort_and_sweep_cost(table, rows, cost);
4121
cost->multiply(n_full_steps);
4126
*buffer_size= max((ulong)*buffer_size,
4127
(size_t)(1.2*rows_in_last_step) * elem_size +
4128
h->ref_length + table->key_info[keynr].key_length);
4131
COST_VECT last_step_cost;
4132
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
4133
cost->add(&last_step_cost);
4135
if (n_full_steps != 0)
4136
cost->mem_cost= *buffer_size;
4138
cost->mem_cost= (double)rows_in_last_step * elem_size;
4140
/* Total cost of all index accesses */
4141
index_read_cost= h->index_only_read_time(keynr, (double)rows);
4142
cost->add_io(index_read_cost, 1 /* Random seeks */);
4148
Get cost of one sort-and-sweep step
4151
get_sort_and_sweep_cost()
4152
table Table being accessed
4153
nrows Number of rows to be sorted and retrieved
4157
Get cost of these operations:
4158
- sort an array of #nrows ROWIDs using qsort
4159
- read #nrows records from table in a sweep.
4163
/*
  Cost of one sort-and-sweep step over nrows rowids: the sweep read cost from
  get_sweep_read_cost() (called with interrupted == false) plus a CPU term
  for sorting the rowids.

  NOTE(review): the embedded listing numbers skip (4163 -> 4167 and
  4169 -> 4172), so any guard on nrows and any adjustment of cmp_op before
  the log2() call are not visible in this excerpt; check the full source
  before relying on the behaviour for very small nrows.
*/
void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost)
4167
get_sweep_read_cost(table, nrows, false, cost);
4168
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
4169
/* cmp_op scales the row count by 1 / TIME_FOR_COMPARE_ROWID */
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
4172
cost->cpu_cost += cmp_op * log2(cmp_op);
4180
Get cost of reading nrows table records in a "disk sweep"
4182
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made
4183
for an ordered sequence of rowids.
4185
We assume hard disk IO. The read is performed as follows:
4187
1. The disk head is moved to the needed cylinder
4188
2. The controller waits for the plate to rotate
4189
3. The data is transferred
4191
Time to do #3 is insignificant compared to #2+#1.
4193
Time to move the disk head is proportional to head travel distance.
4195
Time to wait for the plate to rotate depends on whether the disk head was moved or not.
4198
If disk head wasn't moved, the wait time is proportional to distance
4199
between the previous block and the block we're reading.
4201
If the head was moved, we don't know how much we'll need to wait for the
4202
plate to rotate. We assume the wait time to be a variate with a mean of
4203
0.5 of full rotation time.
4205
Our cost units are "random disk seeks". The cost of random disk seek is
4206
actually not a constant, it depends on the range of cylinders we're going
4207
to access. We make it constant by introducing a fuzzy concept of "typical
4208
datafile length" (it's fuzzy as it's hard to tell whether it should
4209
include index file, temp.tables etc). Then random seek cost is:
4211
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
4213
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
4215
@param table Table to be accessed
4216
@param nrows Number of rows to retrieve
4217
@param interrupted true <=> Assume that the disk sweep will be
4218
interrupted by other disk IO. false - otherwise.
4219
@param cost OUT The cost.
4222
void get_sweep_read_cost(Table *table, ha_rows nrows, bool interrupted,
4226
if (table->file->primary_key_is_clustered())
4228
cost->io_count= table->file->read_time(table->s->primary_key,
4229
(uint) nrows, nrows);
4234
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
4236
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
4237
if (busy_blocks < 1.0)
4240
cost->io_count= busy_blocks;
4244
/* Assume reading is done in one 'sweep' */
4245
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
4246
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);