1
/* -*- mode: c++; c-basic-offset: 2; indent-tabs-mode: nil; -*-
2
* vim:expandtab:shiftwidth=2:tabstop=2:smarttab:
4
* Copyright (C) 2008 Sun Microsystems, Inc.
6
* This program is free software; you can redistribute it and/or modify
7
* it under the terms of the GNU General Public License as published by
8
* the Free Software Foundation; version 2 of the License.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
1
/* Copyright (C) 2000-2006 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
23
19
Handler-calling-functions
30
#include <drizzled/error.h>
31
#include <drizzled/field/epoch.h>
32
#include <drizzled/gettext.h>
33
#include <drizzled/internal/my_sys.h>
34
#include <drizzled/item/empty_string.h>
35
#include <drizzled/item/int.h>
36
#include <drizzled/lock.h>
37
#include <drizzled/message/table.h>
38
#include <drizzled/my_hash.h>
39
#include <drizzled/optimizer/cost_vector.h>
40
#include <drizzled/plugin/client.h>
41
#include <drizzled/plugin/event_observer.h>
42
#include <drizzled/plugin/storage_engine.h>
43
#include <drizzled/probes.h>
44
#include <drizzled/session.h>
45
#include <drizzled/sql_base.h>
46
#include <drizzled/sql_parse.h>
47
#include <drizzled/transaction_services.h>
22
#include <drizzled/server_includes.h>
23
#include "rpl_filter.h"
24
#include <drizzled/drizzled_error_messages.h>
27
While we have legacy_db_type, we have this array to
28
check for dups and to find handlerton from legacy_db_type.
29
Remove when legacy_db_type is finally gone
31
st_plugin_int *hton2plugin[MAX_HA];
33
static handlerton *installed_htons[128];
35
#define BITMAP_STACKBUF_SIZE (128/8)
37
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NULL,0}, {NULL,0} };
39
/* number of entries in handlertons[] */
41
/* number of storage engines (from handlertons[]) that support 2pc */
42
uint32_t total_ha_2pc= 0;
43
/* size of savepoint storage area (see ha_init) */
44
uint32_t savepoint_alloc_size= 0;
46
static const LEX_STRING sys_table_aliases[]=
48
{ C_STRING_WITH_LEN("INNOBASE") }, { C_STRING_WITH_LEN("INNODB") },
49
{ C_STRING_WITH_LEN("HEAP") }, { C_STRING_WITH_LEN("MEMORY") },
53
const char *ha_row_type[] = {
54
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
57
const char *tx_isolation_names[] =
58
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
60
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
61
tx_isolation_names, NULL};
63
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
64
uint32_t known_extensions_id= 0;
68
static plugin_ref ha_default_plugin(THD *thd)
70
if (thd->variables.table_plugin)
71
return thd->variables.table_plugin;
72
return my_plugin_lock(thd, &global_system_variables.table_plugin);
77
Return the default storage engine handlerton for thread
79
@param ha_default_handlerton(thd)
80
@param thd current thread
85
handlerton *ha_default_handlerton(THD *thd)
87
plugin_ref plugin= ha_default_plugin(thd);
89
handlerton *hton= plugin_data(plugin, handlerton*);
96
Return the storage engine handlerton for the supplied name
98
@param thd current thread
99
@param name name of storage engine
102
pointer to storage engine plugin handle
104
plugin_ref ha_resolve_by_name(THD *thd, const LEX_STRING *name)
106
const LEX_STRING *table_alias;
110
/* my_strnncoll is a macro and gcc doesn't do early expansion of macro */
111
if (thd && !my_charset_utf8_general_ci.coll->strnncoll(&my_charset_utf8_general_ci,
112
(const unsigned char *)name->str, name->length,
113
(const unsigned char *)STRING_WITH_LEN("DEFAULT"), 0))
114
return ha_default_plugin(thd);
116
if ((plugin= my_plugin_lock_by_name(thd, name, DRIZZLE_STORAGE_ENGINE_PLUGIN)))
118
handlerton *hton= plugin_data(plugin, handlerton *);
119
if (!(hton->flags & HTON_NOT_USER_SELECTABLE))
123
unlocking plugin immediately after locking is relatively low cost.
125
plugin_unlock(thd, plugin);
129
We check for the historical aliases.
131
for (table_alias= sys_table_aliases; table_alias->str; table_alias+= 2)
133
if (!my_strnncoll(&my_charset_utf8_general_ci,
134
(const unsigned char *)name->str, name->length,
135
(const unsigned char *)table_alias->str, table_alias->length))
137
name= table_alias + 1;
146
plugin_ref ha_lock_engine(THD *thd, handlerton *hton)
150
st_plugin_int **plugin= hton2plugin + hton->slot;
152
return my_plugin_lock(thd, &plugin);
158
handlerton *ha_resolve_by_legacy_type(THD *thd, enum legacy_db_type db_type)
162
case DB_TYPE_DEFAULT:
163
return ha_default_handlerton(thd);
165
if (db_type > DB_TYPE_UNKNOWN && db_type < DB_TYPE_DEFAULT &&
166
(plugin= ha_lock_engine(thd, installed_htons[db_type])))
167
return plugin_data(plugin, handlerton*);
169
case DB_TYPE_UNKNOWN:
176
Use another database handler if the requested database handler is not compiled in.
178
handlerton *ha_checktype(THD *thd, enum legacy_db_type database_type,
179
bool no_substitute, bool report_error)
181
handlerton *hton= ha_resolve_by_legacy_type(thd, database_type);
182
if (ha_storage_engine_is_enabled(hton))
189
const char *engine_name= ha_resolve_storage_engine_name(hton);
190
my_error(ER_FEATURE_DISABLED,MYF(0),engine_name,engine_name);
195
return ha_default_handlerton(thd);
199
handler *get_new_handler(TABLE_SHARE *share, MEM_ROOT *alloc,
204
if (db_type && db_type->state == SHOW_OPTION_YES && db_type->create)
206
if ((file= db_type->create(db_type, share, alloc)))
211
Try the default table type
212
Here the call to current_thd() is ok as we call this function a lot of
213
times but we enter this branch very seldom.
215
return(get_new_handler(share, alloc, ha_default_handlerton(current_thd)));
220
Register handler error messages for use with my_error().
228
int ha_init_errors(void)
230
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
231
const char **errmsgs;
233
/* Allocate a pointer array for the error message strings. */
234
/* Zerofill it to avoid uninitialized gaps. */
235
if (! (errmsgs= (const char**) my_malloc(HA_ERR_ERRORS * sizeof(char*),
236
MYF(MY_WME | MY_ZEROFILL))))
239
/* Set the dedicated error messages. */
240
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND));
241
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY));
242
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable");
243
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function");
244
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE));
245
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE));
246
SETMSG(HA_ERR_OUT_OF_MEM, "Table handler out of memory");
247
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'");
248
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported");
249
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE));
250
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update");
251
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted");
252
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL));
253
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'");
254
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last");
255
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA));
256
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row");
257
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option");
258
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE));
259
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset");
260
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE));
261
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR));
262
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE));
263
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT));
264
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL));
265
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION));
266
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK));
267
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN));
268
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2));
269
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2));
270
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name");
271
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size");
272
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'");
273
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR));
274
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine");
275
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED));
276
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key");
277
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE));
278
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY));
279
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED));
280
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE));
282
/* Register the error messages for use with my_error(). */
283
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
288
Unregister handler error messages.
295
static int ha_finish_errors(void)
297
const char **errmsgs;
299
/* Unregister the error messages and get back the pointer array so it can be freed. */
300
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
302
free((unsigned char*) errmsgs);
307
int ha_finalize_handlerton(st_plugin_int *plugin)
309
handlerton *hton= (handlerton *)plugin->data;
314
case SHOW_OPTION_DISABLED:
316
case SHOW_OPTION_YES:
317
if (installed_htons[hton->db_type] == hton)
318
installed_htons[hton->db_type]= NULL;
322
if (hton && plugin->plugin->deinit)
323
(void)plugin->plugin->deinit(hton);
325
free((unsigned char*)hton);
331
int ha_initialize_handlerton(st_plugin_int *plugin)
335
hton= (handlerton *)my_malloc(sizeof(handlerton),
336
MYF(MY_WME | MY_ZEROFILL));
338
FIXME: the MY_ZEROFILL flag above doesn't zero all the bytes.
340
This was detected after adding get_backup_engine member to handlerton
341
structure. Apparently get_backup_engine was not NULL even though it was
344
/* Zero the whole handlerton structure: sizeof(hton) would only cover the pointer itself. */
memset(hton, 0, sizeof(*hton));
345
/* Historical Requirement */
346
plugin->data= hton; // shortcut for the future
347
if (plugin->plugin->init)
349
if (plugin->plugin->init(hton))
351
sql_print_error(_("Plugin '%s' init function returned error."),
358
the switch below and hton->state should be removed when
359
command-line options for plugins are implemented
361
switch (hton->state) {
364
case SHOW_OPTION_YES:
367
/* now check the db_type for conflict */
368
if (hton->db_type <= DB_TYPE_UNKNOWN ||
369
hton->db_type >= DB_TYPE_DEFAULT ||
370
installed_htons[hton->db_type])
372
int idx= (int) DB_TYPE_FIRST_DYNAMIC;
374
while (idx < (int) DB_TYPE_DEFAULT && installed_htons[idx])
377
if (idx == (int) DB_TYPE_DEFAULT)
379
sql_print_warning(_("Too many storage engines!"));
382
if (hton->db_type != DB_TYPE_UNKNOWN)
383
sql_print_warning(_("Storage engine '%s' has conflicting typecode. "
384
"Assigning value %d."), plugin->plugin->name, idx);
385
hton->db_type= (enum legacy_db_type) idx;
387
installed_htons[hton->db_type]= hton;
388
tmp= hton->savepoint_offset;
389
hton->savepoint_offset= savepoint_alloc_size;
390
savepoint_alloc_size+= tmp;
391
hton->slot= total_ha++;
392
hton2plugin[hton->slot]=plugin;
399
hton->state= SHOW_OPTION_DISABLED;
404
This is entirely for legacy. We will create a new "disk based" hton and a
405
"memory" hton which will be configurable longterm. We should be able to
406
remove partition and myisammrg.
408
if (strcmp(plugin->plugin->name, "MEMORY") == 0)
411
if (strcmp(plugin->plugin->name, "MyISAM") == 0)
423
assert(total_ha < MAX_HA);
425
Check if there is a transaction-capable storage engine besides the
426
binary log (which is considered a transaction-capable storage engine in
429
opt_using_transactions= total_ha>(uint32_t)opt_bin_log;
430
savepoint_alloc_size+= sizeof(SAVEPOINT);
439
This should eventually be based on the graceful shutdown flag.
440
So if flag is equal to HA_PANIC_CLOSE, the deallocate
443
if (ha_finish_errors())
449
static bool dropdb_handlerton(THD *unused1 __attribute__((unused)),
453
handlerton *hton= plugin_data(plugin, handlerton *);
454
if (hton->state == SHOW_OPTION_YES && hton->drop_database)
455
hton->drop_database(hton, (char *)path);
460
void ha_drop_database(char* path)
462
plugin_foreach(NULL, dropdb_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, path);
466
static bool closecon_handlerton(THD *thd, plugin_ref plugin,
467
void *unused __attribute__((unused)))
469
handlerton *hton= plugin_data(plugin, handlerton *);
471
there's no need to rollback here as all transactions must
472
be rolled back already
474
if (hton->state == SHOW_OPTION_YES && hton->close_connection &&
475
thd_get_ha_data(thd, hton))
476
hton->close_connection(hton, thd);
483
don't bother to rollback here, it's done already
485
void ha_close_connection(THD* thd)
487
plugin_foreach(thd, closecon_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, 0);
490
/* ========================================================================
491
======================= TRANSACTIONS ===================================*/
494
Transaction handling in the server
495
==================================
497
In each client connection, MySQL maintains two transactional
499
- a statement transaction,
500
- a standard, also called normal transaction.
504
"Statement transaction" is a non-standard term that comes
505
from the times when MySQL supported BerkeleyDB storage engine.
507
First of all, it should be said that in BerkeleyDB auto-commit
508
mode auto-commits operations that are atomic to the storage
509
engine itself, such as a write of a record, and are too
510
high-granular to be atomic from the application perspective
511
(MySQL). One SQL statement could involve many BerkeleyDB
512
auto-committed operations and thus BerkeleyDB auto-commit was of
515
Secondly, instead of SQL standard savepoints, BerkeleyDB
516
provided the concept of "nested transactions". In a nutshell,
517
transactions could be arbitrarily nested, but when the parent
518
transaction was committed or aborted, all its child (nested)
519
transactions were committed or aborted as well.
520
Commit of a nested transaction, in turn, made its changes
521
visible, but not durable: it destroyed the nested transaction,
522
all its changes would become available to the parent and
523
currently active nested transactions of this parent.
525
So the mechanism of nested transactions was employed to
526
provide "all or nothing" guarantee of SQL statements
527
required by the standard.
528
A nested transaction would be created at start of each SQL
529
statement, and destroyed (committed or aborted) at statement
530
end. Such nested transaction was internally referred to as
531
a "statement transaction" and gave birth to the term.
533
<Historical note ends>
535
Since then a statement transaction is started for each statement
536
that accesses transactional tables or uses the binary log. If
537
the statement succeeds, the statement transaction is committed.
538
If the statement fails, the transaction is rolled back. Commits
539
of statement transactions are not durable -- each such
540
transaction is nested in the normal transaction, and if the
541
normal transaction is rolled back, the effects of all enclosed
542
statement transactions are undone as well. Technically,
543
a statement transaction can be viewed as a savepoint which is
544
maintained automatically in order to make effects of one
547
The normal transaction is started by the user and is ended
548
usually upon a user request as well. The normal transaction
549
encloses transactions of all statements issued between
550
its beginning and its end.
551
In autocommit mode, the normal transaction is equivalent
552
to the statement transaction.
554
Since MySQL supports PSEA (pluggable storage engine
555
architecture), more than one transactional engine can be
556
active at a time. Hence transactions, from the server
557
point of view, are always distributed. In particular,
558
transactional state is maintained independently for each
559
engine. In order to commit a transaction the two phase
560
commit protocol is employed.
562
Not all statements are executed in context of a transaction.
563
Administrative and status information statements do not modify
564
engine data, and thus do not start a statement transaction and
565
also have no effect on the normal transaction. Examples of such
566
statements are SHOW STATUS and RESET SLAVE.
568
Similarly DDL statements are not transactional,
569
and therefore a transaction is [almost] never started for a DDL
570
statement. The difference between a DDL statement and a purely
571
administrative statement though is that a DDL statement always
572
commits the current transaction before proceeding, if there is
575
At last, SQL statements that work with non-transactional
576
engines also have no effect on the transaction state of the
577
connection. Even though they are written to the binary log,
578
and the binary log is, overall, transactional, the writes
579
are done in "write-through" mode, directly to the binlog
580
file, followed by an OS cache sync, in other words,
581
bypassing the binlog undo log (translog).
582
They do not commit the current normal transaction.
583
A failure of a statement that uses non-transactional tables
584
would cause a rollback of the statement transaction, but
585
in case no transactional tables are used,
586
no statement transaction is started.
591
The server stores its transaction-related data in
592
thd->transaction. This structure has two members of type
593
THD_TRANS. These members correspond to the statement and
594
normal transactions respectively:
596
- thd->transaction.stmt contains a list of engines
597
that are participating in the given statement
598
- thd->transaction.all contains a list of engines that
599
have participated in any of the statement transactions started
600
within the context of the normal transaction.
601
Each element of the list contains a pointer to the storage
602
engine, engine-specific transactional data, and engine-specific
605
In autocommit mode thd->transaction.all is empty.
606
Instead, data of thd->transaction.stmt is
607
used to commit/rollback the normal transaction.
609
The list of registered engines has a few important properties:
610
- no engine is registered in the list twice
611
- engines are present in the list in reverse temporal order --
612
new participants are always added to the beginning of the list.
614
Transaction life cycle
615
----------------------
617
When a new connection is established, thd->transaction
618
members are initialized to an empty state.
619
If a statement uses any tables, all affected engines
620
are registered in the statement engine list. In
621
non-autocommit mode, the same engines are registered in
622
the normal transaction list.
623
At the end of the statement, the server issues a commit
624
or a roll back for all engines in the statement list.
625
At this point transaction flags of an engine, if any, are
626
propagated from the statement list to the list of the normal
628
When commit/rollback is finished, the statement list is
629
cleared. It will be filled in again by the next statement,
630
and emptied again at the next statement's end.
632
The normal transaction is committed in a similar way
633
(by going over all engines in thd->transaction.all list)
634
but at different times:
635
- when a COMMIT SQL statement is issued by the user
636
- implicitly, by the server, at the beginning of a DDL statement
637
or SET AUTOCOMMIT={0|1} statement.
639
The normal transaction can be rolled back as well:
640
- if the user has requested so, by issuing ROLLBACK SQL
642
- if one of the storage engines requested a rollback
643
by setting thd->transaction_rollback_request. This may
644
happen in case, e.g., when the transaction in the engine was
645
chosen as a victim of the internal deadlock resolution algorithm
646
and rolled back internally. When such a situation happens, there
647
is little the server can do and the only option is to rollback
648
transactions in all other participating engines. In this case
649
the rollback is accompanied by an error sent to the user.
651
As follows from the use cases above, the normal transaction
652
is never committed when there is an outstanding statement
653
transaction. In most cases there is no conflict, since
654
commits of the normal transaction are issued by a stand-alone
655
administrative or DDL statement, thus no outstanding statement
656
transaction of the previous statement exists. Besides,
657
all statements that manipulate the normal transaction
658
are prohibited in stored functions and triggers, therefore
659
no conflicting situation can occur in a sub-statement either.
660
The remaining rare cases when the server explicitly has
661
to commit the statement transaction prior to committing the normal
662
one cover error-handling scenarios (see for example
665
When committing a statement or a normal transaction, the server
666
either uses the two-phase commit protocol, or issues a commit
667
in each engine independently. The two-phase commit protocol
669
- all participating engines support two-phase commit (provide
670
handlerton::prepare PSEA API call) and
671
- transactions in at least two engines modify data (i.e. are
674
Note that the two phase commit is used for
675
statement transactions, even though they are not durable anyway.
676
This is done to ensure logical consistency of data in a multiple-
678
For example, imagine that some day MySQL supports unique
679
constraint checks deferred till the end of statement. In such
680
case a commit in one of the engines may yield ER_DUP_KEY,
681
and MySQL should be able to gracefully abort statement
682
transactions of other participants.
684
After the normal transaction has been committed,
685
thd->transaction.all list is cleared.
687
When a connection is closed, the current normal transaction, if
690
Roles and responsibilities
691
--------------------------
693
The server has no way to know that an engine participates in
694
the statement and a transaction has been started
695
in it unless the engine says so. Thus, in order to be
696
a part of a transaction, the engine must "register" itself.
697
This is done by invoking trans_register_ha() server call.
698
Normally the engine registers itself whenever handler::external_lock()
699
is called. trans_register_ha() can be invoked many times: if
700
an engine is already registered, the call does nothing.
701
In case autocommit is not set, the engine must register itself
702
twice -- both in the statement list and in the normal transaction
704
In which list to register is a parameter of trans_register_ha().
706
Note, that although the registration interface in itself is
707
fairly clear, the current usage practice often leads to undesired
708
effects. E.g. since a call to trans_register_ha() in most engines
709
is embedded into implementation of handler::external_lock(), some
710
DDL statements start a transaction (at least from the server
711
point of view) even though they are not expected to. E.g.
712
CREATE TABLE does not start a transaction, since
713
handler::external_lock() is never called during CREATE TABLE. But
714
CREATE TABLE ... SELECT does, since handler::external_lock() is
715
called for the table that is being selected from. This has no
716
practical effects currently, but must be kept in mind
719
Once an engine is registered, the server will do the rest
722
During statement execution, whenever any of data-modifying
723
PSEA API methods is used, e.g. handler::write_row() or
724
handler::update_row(), the read-write flag is raised in the
725
statement transaction for the involved engine.
726
Currently all PSEA calls are "traced", and the data can not be
727
changed in a way other than issuing a PSEA call. Important:
728
unless this invariant is preserved the server will not know that
729
a transaction in a given engine is read-write and will not
730
involve the two-phase commit protocol!
732
At the end of a statement, server call
733
ha_autocommit_or_rollback() is invoked. This call in turn
734
invokes handlerton::prepare() for every involved engine.
735
Prepare is followed by a call to handlerton::commit_one_phase()
736
If a one-phase commit will suffice, handlerton::prepare() is not
737
invoked and the server only calls handlerton::commit_one_phase().
738
At statement commit, the statement-related read-write engine
739
flag is propagated to the corresponding flag in the normal
740
transaction. When the commit is complete, the list of registered
743
Rollback is handled in a similar fashion.
745
Additional notes on DDL and the normal transaction.
746
---------------------------------------------------
748
DDLs and operations with non-transactional engines
749
do not "register" in thd->transaction lists, and thus do not
750
modify the transaction state. Besides, each DDL in
751
MySQL is prefixed with an implicit normal transaction commit
752
(a call to end_active_trans()), and thus leaves nothing
754
However, as it has been pointed out with CREATE TABLE .. SELECT,
755
some DDL statements can start a *new* transaction.
757
Behaviour of the server in this case is currently badly
759
DDL statements use a form of "semantic" logging
760
to maintain atomicity: if CREATE TABLE .. SELECT failed,
761
the newly created table is deleted.
762
In addition, some DDL statements issue interim transaction
763
commits: e.g. ALTER Table issues a commit after data is copied
764
from the original table to the internal temporary table. Other
765
statements, e.g. CREATE TABLE ... SELECT do not always commit
767
And finally there is a group of DDL statements such as
768
RENAME/DROP Table that doesn't start a new transaction
771
This diversity makes it hard to say what will happen if
772
by chance a stored function is invoked during a DDL --
773
whether any modifications it makes will be committed or not
774
is not clear. Fortunately, the SQL grammar of only a few DDLs allows
775
invocation of a stored function.
777
A consistent behaviour is perhaps to always commit the normal
778
transaction after all DDLs, just like the statement transaction
779
is always committed at the end of all statements.
783
Register a storage engine for a transaction.
785
Every storage engine MUST call this function when it starts
786
a transaction or a statement (that is it must be called both for the
787
"beginning of transaction" and "beginning of statement").
788
Only storage engines registered for the transaction/statement
789
will know when to commit/rollback it.
792
trans_register_ha is idempotent - storage engine may register many
793
times per transaction.
796
void trans_register_ha(THD *thd, bool all, handlerton *ht_arg)
799
Ha_trx_info *ha_info;
803
trans= &thd->transaction.all;
804
thd->server_status|= SERVER_STATUS_IN_TRANS;
807
trans= &thd->transaction.stmt;
809
ha_info= thd->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
811
if (ha_info->is_started())
812
return; /* already registered, return */
814
ha_info->register_ha(trans, ht_arg);
816
trans->no_2pc|=(ht_arg->prepare==0);
817
if (thd->transaction.xid_state.xid.is_null())
818
thd->transaction.xid_state.xid.set(thd->query_id);
827
1 error, transaction was rolled back
829
int ha_prepare(THD *thd)
832
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
833
Ha_trx_info *ha_info= trans->ha_list;
836
for (; ha_info; ha_info= ha_info->next())
839
handlerton *ht= ha_info->ht();
840
status_var_increment(thd->status_var.ha_prepare_count);
843
if ((err= ht->prepare(ht, thd, all)))
845
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
846
ha_rollback_trans(thd, all);
853
push_warning_printf(thd, DRIZZLE_ERROR::WARN_LEVEL_WARN,
854
ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
855
ha_resolve_storage_engine_name(ht));
863
Check if we can skip the two-phase commit.
865
A helper function to evaluate if two-phase commit is mandatory.
866
As a side effect, propagates the read-only/read-write flags
867
of the statement transaction to its enclosing normal transaction.
869
@retval true we must run a two-phase commit. Returned
870
if we have at least two engines with read-write changes.
871
@retval false Don't need two-phase commit. Even if we have two
872
transactional engines, we can run two independent
873
commits if changes in one of the engines are read-only.
878
ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
881
/* The number of storage engines that have actual changes. */
882
unsigned rw_ha_count= 0;
883
Ha_trx_info *ha_info;
885
for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
887
if (ha_info->is_trx_read_write())
892
Ha_trx_info *ha_info_all= &thd->ha_data[ha_info->ht()->slot].ha_info[1];
893
assert(ha_info != ha_info_all);
895
Merge read-only/read-write information about statement
896
transaction to its enclosing normal transaction. Do this
897
only if in a real transaction -- that is, if we know
898
that ha_info_all is registered in thd->transaction.all.
899
Since otherwise we only clutter the normal transaction flags.
901
if (ha_info_all->is_started()) /* false if autocommit. */
902
ha_info_all->coalesce_trx_with(ha_info);
904
else if (rw_ha_count > 1)
907
It is a normal transaction, so we don't need to merge read/write
908
information up, and the need for two-phase commit has been
909
already established. Break the loop prematurely.
914
return rw_ha_count > 1;
922
1 transaction was rolled back
924
2 error during commit, data may be inconsistent
927
Since we don't support nested statement transactions in 5.0,
928
we can't commit or rollback stmt transactions while we are inside
929
stored functions or triggers. So we simply do nothing now.
930
TODO: This should be fixed in later ( >= 5.1) releases.
932
int ha_commit_trans(THD *thd, bool all)
934
int error= 0, cookie= 0;
936
'all' means that this is either an explicit commit issued by
937
user, or an implicit commit issued by a DDL.
939
THD_TRANS *trans= all ? &thd->transaction.all : &thd->transaction.stmt;
940
bool is_real_trans= all || thd->transaction.all.ha_list == 0;
941
Ha_trx_info *ha_info= trans->ha_list;
942
my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
945
We must not commit the normal transaction if a statement
946
transaction is pending. Otherwise statement transaction
947
flags will not get propagated to its normal transaction's
950
assert(thd->transaction.stmt.ha_list == NULL ||
951
trans == &thd->transaction.stmt);
957
if (is_real_trans && wait_if_global_read_lock(thd, 0, 0))
959
ha_rollback_trans(thd, all);
965
&& ! thd->slave_thread
968
my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
969
ha_rollback_trans(thd, all);
974
must_2pc= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
976
if (!trans->no_2pc && must_2pc)
978
for (; ha_info && !error; ha_info= ha_info->next())
981
handlerton *ht= ha_info->ht();
983
Do not call two-phase commit if this particular
984
transaction is read-only. This allows for simpler
985
implementation in engines that are always read-only.
987
if (! ha_info->is_trx_read_write())
990
Sic: we know that prepare() is not NULL since otherwise
991
trans->no_2pc would have been set.
993
if ((err= ht->prepare(ht, thd, all)))
995
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
998
status_var_increment(thd->status_var.ha_prepare_count);
1000
if (error || (is_real_trans && xid &&
1001
(error= !(cookie= tc_log->log_xid(thd, xid)))))
1003
ha_rollback_trans(thd, all);
1008
error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
1010
tc_log->unlog(cookie, xid);
1013
start_waiting_global_read_lock(thd);
1020
This function does not care about global read lock. A caller should.
1022
int ha_commit_one_phase(THD *thd, bool all)
1025
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1026
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1027
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1030
for (; ha_info; ha_info= ha_info_next)
1033
handlerton *ht= ha_info->ht();
1034
if ((err= ht->commit(ht, thd, all)))
1036
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1039
status_var_increment(thd->status_var.ha_commit_count);
1040
ha_info_next= ha_info->next();
1041
ha_info->reset(); /* keep it conveniently zero-filled */
1046
thd->transaction.xid_state.xid.null();
1049
thd->variables.tx_isolation=thd->session_tx_isolation;
1050
thd->transaction.cleanup();
1057
int ha_rollback_trans(THD *thd, bool all)
1060
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1061
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1062
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1065
We must not rollback the normal transaction if a statement
1066
transaction is pending.
1068
assert(thd->transaction.stmt.ha_list == NULL ||
1069
trans == &thd->transaction.stmt);
1073
for (; ha_info; ha_info= ha_info_next)
1076
handlerton *ht= ha_info->ht();
1077
if ((err= ht->rollback(ht, thd, all)))
1079
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1082
status_var_increment(thd->status_var.ha_rollback_count);
1083
ha_info_next= ha_info->next();
1084
ha_info->reset(); /* keep it conveniently zero-filled */
1089
thd->transaction.xid_state.xid.null();
1092
thd->variables.tx_isolation=thd->session_tx_isolation;
1093
thd->transaction.cleanup();
1097
thd->transaction_rollback_request= false;
1100
If a non-transactional table was updated, warn; don't warn if this is a
1101
slave thread (because when a slave thread executes a ROLLBACK, it has
1102
been read from the binary log, so it's 100% sure and normal to produce
1103
error ER_WARNING_NOT_COMPLETE_ROLLBACK. If we sent the warning to the
1104
slave SQL thread, it would not stop the thread but just be printed in
1105
the error log; but we don't want users to wonder why they have this
1106
message in the error log, so we don't send it.
1108
if (is_real_trans && thd->transaction.all.modified_non_trans_table &&
1109
!thd->slave_thread && thd->killed != THD::KILL_CONNECTION)
1110
push_warning(thd, DRIZZLE_ERROR::WARN_LEVEL_WARN,
1111
ER_WARNING_NOT_COMPLETE_ROLLBACK,
1112
ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
1117
This is used to commit or rollback a single statement depending on
1121
Note that if the autocommit is on, then the following call inside
1122
InnoDB will commit or rollback the whole transaction (= the statement). The
1123
autocommit mechanism built into InnoDB is based on counting locks, but if
1124
the user has used LOCK TABLES then that mechanism does not know to do the
1127
int ha_autocommit_or_rollback(THD *thd, int error)
1129
if (thd->transaction.stmt.ha_list)
1133
if (ha_commit_trans(thd, 0))
1138
(void) ha_rollback_trans(thd, 0);
1139
if (thd->transaction_rollback_request)
1140
(void) ha_rollback(thd);
1143
thd->variables.tx_isolation=thd->session_tx_isolation;
1154
static bool xacommit_handlerton(THD *unused1 __attribute__((unused)),
1158
handlerton *hton= plugin_data(plugin, handlerton *);
1159
if (hton->state == SHOW_OPTION_YES && hton->recover)
1161
hton->commit_by_xid(hton, ((struct xahton_st *)arg)->xid);
1162
((struct xahton_st *)arg)->result= 0;
1167
static bool xarollback_handlerton(THD *unused1 __attribute__((unused)),
1171
handlerton *hton= plugin_data(plugin, handlerton *);
1172
if (hton->state == SHOW_OPTION_YES && hton->recover)
1174
hton->rollback_by_xid(hton, ((struct xahton_st *)arg)->xid);
1175
((struct xahton_st *)arg)->result= 0;
1181
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
1183
struct xahton_st xaop;
1187
plugin_foreach(NULL, commit ? xacommit_handlerton : xarollback_handlerton,
1188
DRIZZLE_STORAGE_ENGINE_PLUGIN, &xaop);
1194
recover() step of xa.
1197
there are three modes of operation:
1198
- automatic recover after a crash
1199
in this case commit_list != 0, tc_heuristic_recover==0
1200
all xids from commit_list are committed, others are rolled back
1201
- manual (heuristic) recover
1202
in this case commit_list==0, tc_heuristic_recover != 0
1203
DBA has explicitly specified that all prepared transactions should
1204
be committed (or rolled back).
1205
- no recovery (MySQL did not detect a crash)
1206
in this case commit_list==0, tc_heuristic_recover == 0
1207
there should be no prepared transactions in this case.
1211
int len, found_foreign_xids, found_my_xids;
1217
static bool xarecover_handlerton(THD *unused __attribute__((unused)),
1221
handlerton *hton= plugin_data(plugin, handlerton *);
1222
struct xarecover_st *info= (struct xarecover_st *) arg;
1225
if (hton->state == SHOW_OPTION_YES && hton->recover)
1227
while ((got= hton->recover(hton, info->list, info->len)) > 0 )
1229
sql_print_information(_("Found %d prepared transaction(s) in %s"),
1230
got, ha_resolve_storage_engine_name(hton));
1231
for (int i=0; i < got; i ++)
1233
my_xid x=info->list[i].get_my_xid();
1234
if (!x) // not "mine" - that is generated by external TM
1236
xid_cache_insert(info->list+i, XA_PREPARED);
1237
info->found_foreign_xids++;
1242
info->found_my_xids++;
1246
if (info->commit_list ?
1247
hash_search(info->commit_list, (unsigned char *)&x, sizeof(x)) != 0 :
1248
tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT)
1250
hton->commit_by_xid(hton, info->list+i);
1254
hton->rollback_by_xid(hton, info->list+i);
1257
if (got < info->len)
1264
int ha_recover(HASH *commit_list)
1266
struct xarecover_st info;
1267
info.found_foreign_xids= info.found_my_xids= 0;
1268
info.commit_list= commit_list;
1269
info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
1272
/* commit_list and tc_heuristic_recover cannot both be set */
1273
assert(info.commit_list==0 || tc_heuristic_recover==0);
1274
/* if either is set, total_ha_2pc must be set too */
1275
assert(info.dry_run || total_ha_2pc>(uint32_t)opt_bin_log);
1277
if (total_ha_2pc <= (uint32_t)opt_bin_log)
1280
if (info.commit_list)
1281
sql_print_information(_("Starting crash recovery..."));
1284
#ifndef WILL_BE_DELETED_LATER
1287
for now, only InnoDB supports 2pc. It means we can always safely
1288
rollback all pending transactions, without risking inconsistent data
1291
assert(total_ha_2pc == (uint32_t) opt_bin_log+1); // only InnoDB and binlog
1292
tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
1297
for (info.len= MAX_XID_LIST_SIZE ;
1298
info.list==0 && info.len > MIN_XID_LIST_SIZE; info.len/=2)
1300
info.list=(XID *)my_malloc(info.len*sizeof(XID), MYF(0));
1304
sql_print_error(ER(ER_OUTOFMEMORY), info.len*sizeof(XID));
1308
plugin_foreach(NULL, xarecover_handlerton,
1309
DRIZZLE_STORAGE_ENGINE_PLUGIN, &info);
1311
free((unsigned char*)info.list);
1312
if (info.found_foreign_xids)
1313
sql_print_warning(_("Found %d prepared XA transactions"),
1314
info.found_foreign_xids);
1315
if (info.dry_run && info.found_my_xids)
1317
sql_print_error(_("Found %d prepared transactions! It means that drizzled "
1318
"was not shut down properly last time and critical "
1319
"recovery information (last binlog or %s file) was "
1320
"manually deleted after a crash. You have to start "
1321
"drizzled with the --tc-heuristic-recover switch to "
1322
"commit or rollback pending transactions."),
1323
info.found_my_xids, opt_tc_log_file);
1326
if (info.commit_list)
1327
sql_print_information(_("Crash recovery finished."));
1332
return the list of XID's to a client, the same way SHOW commands do.
1335
I didn't find in XA specs that an RM cannot return the same XID twice,
1336
so mysql_xa_recover does not filter XID's to ensure uniqueness.
1337
It can be easily fixed later, if necessary.
1339
bool mysql_xa_recover(THD *thd)
1341
List<Item> field_list;
1342
Protocol *protocol= thd->protocol;
1346
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1347
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1348
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1349
field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1351
if (protocol->send_fields(&field_list,
1352
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1355
pthread_mutex_lock(&LOCK_xid_cache);
1356
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1358
if (xs->xa_state==XA_PREPARED)
1360
protocol->prepare_for_resend();
1361
protocol->store_int64_t((int64_t)xs->xid.formatID, false);
1362
protocol->store_int64_t((int64_t)xs->xid.gtrid_length, false);
1363
protocol->store_int64_t((int64_t)xs->xid.bqual_length, false);
1364
protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1366
if (protocol->write())
1368
pthread_mutex_unlock(&LOCK_xid_cache);
1374
pthread_mutex_unlock(&LOCK_xid_cache);
1381
This function should be called when MySQL sends rows of a SELECT result set
1382
or the EOF mark to the client. It releases a possible adaptive hash index
1383
S-latch held by thd in InnoDB and also releases a possible InnoDB query
1384
FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a thd to
1385
keep them over several calls of the InnoDB handler interface when a join
1386
is executed. But when we let control pass to the client they have
1387
to be released because if the application program uses mysql_use_result(),
1388
it may deadlock on the S-latch if the application on another connection
1389
performs another SQL query. In MySQL-4.1 this is even more important because
1390
there a connection can have several SELECT queries open at the same time.
1392
@param thd the thread handle of the current connection
1397
static bool release_temporary_latches(THD *thd, plugin_ref plugin,
1398
void *unused __attribute__((unused)))
1400
handlerton *hton= plugin_data(plugin, handlerton *);
1402
if (hton->state == SHOW_OPTION_YES && hton->release_temporary_latches)
1403
hton->release_temporary_latches(hton, thd);
1409
int ha_release_temporary_latches(THD *thd)
1411
plugin_foreach(thd, release_temporary_latches, DRIZZLE_STORAGE_ENGINE_PLUGIN,
1417
int ha_rollback_to_savepoint(THD *thd, SAVEPOINT *sv)
1420
THD_TRANS *trans= &thd->transaction.all;
1421
Ha_trx_info *ha_info, *ha_info_next;
1425
rolling back to savepoint in all storage engines that were part of the
1426
transaction when the savepoint was set
1428
for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
1431
handlerton *ht= ha_info->ht();
1433
assert(ht->savepoint_set != 0);
1434
if ((err= ht->savepoint_rollback(ht, thd,
1435
(unsigned char *)(sv+1)+ht->savepoint_offset)))
1437
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1440
status_var_increment(thd->status_var.ha_savepoint_rollback_count);
1441
trans->no_2pc|= ht->prepare == 0;
1444
rolling back the transaction in all storage engines that were not part of
1445
the transaction when the savepoint was set
1447
for (ha_info= trans->ha_list; ha_info != sv->ha_list;
1448
ha_info= ha_info_next)
1451
handlerton *ht= ha_info->ht();
1452
if ((err= ht->rollback(ht, thd, !(0))))
1454
my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
1457
status_var_increment(thd->status_var.ha_rollback_count);
1458
ha_info_next= ha_info->next();
1459
ha_info->reset(); /* keep it conveniently zero-filled */
1461
trans->ha_list= sv->ha_list;
1467
according to the sql standard (ISO/IEC 9075-2:2003)
1468
section "4.33.4 SQL-statements and transaction states",
1469
SAVEPOINT is *not* transaction-initiating SQL-statement
1471
int ha_savepoint(THD *thd, SAVEPOINT *sv)
1474
THD_TRANS *trans= &thd->transaction.all;
1475
Ha_trx_info *ha_info= trans->ha_list;
1476
for (; ha_info; ha_info= ha_info->next())
1479
handlerton *ht= ha_info->ht();
1481
if (! ht->savepoint_set)
1483
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1487
if ((err= ht->savepoint_set(ht, thd, (unsigned char *)(sv+1)+ht->savepoint_offset)))
1489
my_error(ER_GET_ERRNO, MYF(0), err);
1492
status_var_increment(thd->status_var.ha_savepoint_count);
1495
Remember the list of registered storage engines. All new
1496
engines are prepended to the beginning of the list.
1498
sv->ha_list= trans->ha_list;
1502
int ha_release_savepoint(THD *thd, SAVEPOINT *sv)
1505
Ha_trx_info *ha_info= sv->ha_list;
1507
for (; ha_info; ha_info= ha_info->next())
1510
handlerton *ht= ha_info->ht();
1511
/* Savepoint life time is enclosed into transaction life time. */
1513
if (!ht->savepoint_release)
1515
if ((err= ht->savepoint_release(ht, thd,
1516
(unsigned char *)(sv+1) + ht->savepoint_offset)))
1518
my_error(ER_GET_ERRNO, MYF(0), err);
1526
static bool snapshot_handlerton(THD *thd, plugin_ref plugin, void *arg)
1528
handlerton *hton= plugin_data(plugin, handlerton *);
1529
if (hton->state == SHOW_OPTION_YES &&
1530
hton->start_consistent_snapshot)
1532
hton->start_consistent_snapshot(hton, thd);
1533
*((bool *)arg)= false;
1538
int ha_start_consistent_snapshot(THD *thd)
1542
plugin_foreach(thd, snapshot_handlerton, DRIZZLE_STORAGE_ENGINE_PLUGIN, &warn);
1545
Same idea as when one wants to CREATE TABLE in one engine which does not
1549
push_warning(thd, DRIZZLE_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
1550
"This MySQL server does not support any "
1551
"consistent-read capable storage engine");
1556
static bool flush_handlerton(THD *thd __attribute__((unused)),
1558
void *arg __attribute__((unused)))
1560
handlerton *hton= plugin_data(plugin, handlerton *);
1561
if (hton->state == SHOW_OPTION_YES && hton->flush_logs &&
1562
hton->flush_logs(hton))
1568
bool ha_flush_logs(handlerton *db_type)
1570
if (db_type == NULL)
1572
if (plugin_foreach(NULL, flush_handlerton,
1573
DRIZZLE_STORAGE_ENGINE_PLUGIN, 0))
1578
if (db_type->state != SHOW_OPTION_YES ||
1579
(db_type->flush_logs && db_type->flush_logs(db_type)))
1585
static const char *check_lowercase_names(handler *file, const char *path,
1588
if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1591
/* Ensure that the table handler gets the path in lower case */
1592
if (tmp_path != path)
1593
my_stpcpy(tmp_path, path);
1596
we only should turn into lowercase database/table part
1597
so start the process after homedirectory
1599
my_casedn_str(files_charset_info, tmp_path + mysql_data_home_len);
1605
An interceptor to hijack the text of the error message without
1606
setting an error in the thread. We need the text to present it
1607
in the form of a warning to the user.
1610
struct Ha_delete_table_error_handler: public Internal_error_handler
1613
virtual bool handle_error(uint32_t sql_errno,
1614
const char *message,
1615
DRIZZLE_ERROR::enum_warning_level level,
1617
char buff[DRIZZLE_ERRMSG_SIZE];
1622
Ha_delete_table_error_handler::
1623
handle_error(uint32_t sql_errno __attribute__((unused)),
1624
const char *message,
1625
DRIZZLE_ERROR::enum_warning_level level __attribute__((unused)),
1626
THD *thd __attribute__((unused)))
1628
/* Grab the error message */
1629
strmake(buff, message, sizeof(buff)-1);
1635
This should return ENOENT if the file doesn't exist.
1636
The .frm file will be deleted only if we return 0 or ENOENT
1638
int ha_delete_table(THD *thd, handlerton *table_type, const char *path,
1639
const char *db, const char *alias, bool generate_warning)
1642
char tmp_path[FN_REFLEN];
1645
TABLE_SHARE dummy_share;
1647
memset(&dummy_table, 0, sizeof(dummy_table));
1648
memset(&dummy_share, 0, sizeof(dummy_share));
1649
dummy_table.s= &dummy_share;
1651
/* DB_TYPE_UNKNOWN is used in ALTER Table when renaming only .frm files */
1652
if (table_type == NULL ||
1653
! (file=get_new_handler((TABLE_SHARE*)0, thd->mem_root, table_type)))
1656
path= check_lowercase_names(file, path, tmp_path);
1657
if ((error= file->ha_delete_table(path)) && generate_warning)
1660
Because file->print_error() uses my_error() to generate the error message
1661
we use an internal error handler to intercept it and store the text
1662
in a temporary buffer. Later the message will be presented to user
1665
Ha_delete_table_error_handler ha_delete_table_error_handler;
1667
/* Fill up structures that print_error may need */
1668
dummy_share.path.str= (char*) path;
1669
dummy_share.path.length= strlen(path);
1670
dummy_share.db.str= (char*) db;
1671
dummy_share.db.length= strlen(db);
1672
dummy_share.table_name.str= (char*) alias;
1673
dummy_share.table_name.length= strlen(alias);
1674
dummy_table.alias= alias;
1676
file->change_table_ptr(&dummy_table, &dummy_share);
1678
thd->push_internal_handler(&ha_delete_table_error_handler);
1679
file->print_error(error, 0);
1681
thd->pop_internal_handler();
1684
XXX: should we convert *all* errors to warnings here?
1685
What if the error is fatal?
1687
push_warning(thd, DRIZZLE_ERROR::WARN_LEVEL_ERROR, error,
1688
ha_delete_table_error_handler.buff);
54
1694
/****************************************************************************
55
** General Cursor functions
1695
** General handler functions
56
1696
****************************************************************************/
57
Cursor::Cursor(plugin::StorageEngine &engine_arg,
61
estimation_rows_to_insert(0),
63
key_used_on_scan(MAX_KEY), active_index(MAX_KEY),
64
ref_length(sizeof(internal::my_off_t)),
67
next_insert_id(0), insert_id_for_cur_row(0)
72
assert(locked == false);
73
/* TODO: assert(inited == NONE); */
78
* @note this is only used in
79
* optimizer::QuickRangeSelect::init_ror_merged_scan(bool reuse_handler) as
80
* of the writing of this comment. -Brian
82
Cursor *Cursor::clone(memory::Root *mem_root)
84
Cursor *new_handler= getTable()->getMutableShare()->db_type()->getCursor(*getTable());
1697
handler *handler::clone(MEM_ROOT *mem_root)
1699
handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());
87
Allocate Cursor->ref here because otherwise ha_open will allocate it
88
on this->table->mem_root and we will not be able to reclaim that memory
89
when the clone Cursor object is destroyed.
1701
Allocate handler->ref here because otherwise ha_open will allocate it
1702
on this->table->mem_root and we will not be able to reclaim that memory
1703
when the clone handler object is destroyed.
91
if (!(new_handler->ref= (unsigned char*) mem_root->alloc_root(ALIGN_SIZE(ref_length)*2)))
1705
if (!(new_handler->ref= (unsigned char*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
94
identifier::Table identifier(getTable()->getShare()->getSchemaName(),
95
getTable()->getShare()->getTableName(),
96
getTable()->getShare()->getType());
98
if (new_handler && !new_handler->ha_open(identifier,
99
getTable()->getDBStat(),
1707
if (new_handler && !new_handler->ha_open(table,
1708
table->s->normalized_path.str,
100
1710
HA_OPEN_IGNORE_IF_LOCKED))
101
1711
return new_handler;
107
given a buffer with a key value, and a map of keyparts
108
that are present in this value, returns the length of the value
110
uint32_t Cursor::calculate_key_len(uint32_t key_position, key_part_map keypart_map_arg)
112
/* works only with key prefixes */
113
assert(((keypart_map_arg + 1) & keypart_map_arg) == 0);
115
const KeyPartInfo *key_part_found= getTable()->getShare()->getKeyInfo(key_position).key_part;
116
const KeyPartInfo *end_key_part_found= key_part_found + getTable()->getShare()->getKeyInfo(key_position).key_parts;
119
while (key_part_found < end_key_part_found && keypart_map_arg)
121
length+= key_part_found->store_length;
122
keypart_map_arg >>= 1;
128
int Cursor::startIndexScan(uint32_t idx, bool sorted)
131
assert(inited == NONE);
132
if (!(result= doStartIndexScan(idx, sorted)))
138
int Cursor::endIndexScan()
140
assert(inited==INDEX);
143
return(doEndIndexScan());
146
int Cursor::startTableScan(bool scan)
149
assert(inited==NONE || (inited==RND && scan));
150
inited= (result= doStartTableScan(scan)) ? NONE: RND;
155
int Cursor::endTableScan()
159
return(doEndTableScan());
162
int Cursor::ha_index_or_rnd_end()
164
return inited == INDEX ? endIndexScan() : inited == RND ? endTableScan() : 0;
167
void Cursor::ha_start_bulk_insert(ha_rows rows)
169
estimation_rows_to_insert= rows;
170
start_bulk_insert(rows);
173
int Cursor::ha_end_bulk_insert()
175
estimation_rows_to_insert= 0;
176
return end_bulk_insert();
179
const key_map *Cursor::keys_to_use_for_scanning()
181
return &key_map_empty;
184
bool Cursor::has_transactions()
186
return (getTable()->getShare()->db_type()->check_flag(HTON_BIT_DOES_TRANSACTIONS));
189
void Cursor::ha_statistic_increment(uint64_t system_status_var::*offset) const
191
(getTable()->in_use->status_var.*offset)++;
194
void **Cursor::ha_data(Session *session) const
196
return session->getEngineData(getEngine());
199
bool Cursor::is_fatal_error(int error, uint32_t flags)
202
((flags & HA_CHECK_DUP_KEY) &&
203
(error == HA_ERR_FOUND_DUPP_KEY ||
204
error == HA_ERR_FOUND_DUPP_UNIQUE)))
210
/**
  Report the row count held in this cursor's statistics.

  @return the current value of stats.records
*/
ha_rows Cursor::records()
{
  return stats.records;
}
211
/**
  Report the combined size recorded in this cursor's statistics.

  @return stats.index_file_length plus stats.data_file_length
*/
uint64_t Cursor::tableSize()
{
  return stats.index_file_length + stats.data_file_length;
}
212
/**
  Report a per-row size figure for the table behind this cursor.

  @return the table's record length (getRecordLength()) plus the value
          reported by the table's sizeFields()
*/
uint64_t Cursor::rowSize()
{
  return getTable()->getRecordLength() + getTable()->sizeFields();
}
214
int Cursor::doOpen(const identifier::Table &identifier, int mode, uint32_t test_if_locked)
216
return open(identifier.getPath().c_str(), mode, test_if_locked);
1717
void handler::ha_statistic_increment(ulong SSV::*offset) const
1719
status_var_increment(table->in_use->status_var.*offset);
1722
void **handler::ha_data(THD *thd) const
1724
return thd_ha_data(thd, ht);
1727
THD *handler::ha_thd(void) const
1729
assert(!table || !table->in_use || table->in_use == current_thd);
1730
return (table && table->in_use) ? table->in_use : current_thd;
220
Open database-Cursor.
1734
Open database-handler.
222
1736
Try O_RDONLY if cannot open as O_RDWR
223
1737
Don't wait for locks if not HA_OPEN_WAIT_IF_LOCKED is set
225
int Cursor::ha_open(const identifier::Table &identifier,
1739
int handler::ha_open(Table *table_arg, const char *name, int mode,
231
if ((error= doOpen(identifier, mode, test_if_locked)))
1745
assert(table->s == table_share);
1746
assert(alloc_root_inited(&table->mem_root));
1748
if ((error=open(name,mode,test_if_locked)))
233
1750
if ((error == EACCES || error == EROFS) && mode == O_RDWR &&
234
(getTable()->db_stat & HA_TRY_READ_ONLY))
1751
(table->db_stat & HA_TRY_READ_ONLY))
236
getTable()->db_stat|=HA_READ_ONLY;
237
error= doOpen(identifier, O_RDONLY,test_if_locked);
1753
table->db_stat|=HA_READ_ONLY;
1754
error=open(name,O_RDONLY,test_if_locked);
242
errno= error; /* Safeguard */
1759
my_errno= error; /* Safeguard */
246
if (getTable()->getShare()->db_options_in_use & HA_OPTION_READ_ONLY_DATA)
247
getTable()->db_stat|=HA_READ_ONLY;
1763
if (table->s->db_options_in_use & HA_OPTION_READ_ONLY_DATA)
1764
table->db_stat|=HA_READ_ONLY;
248
1765
(void) extra(HA_EXTRA_NO_READCHECK); // Not needed in SQL
250
/* ref is already allocated for us if we're called from Cursor::clone() */
251
if (!ref && !(ref= (unsigned char*) getTable()->alloc_root(ALIGN_SIZE(ref_length)*2)))
1767
/* ref is already allocated for us if we're called from handler::clone() */
1768
if (!ref && !(ref= (unsigned char*) alloc_root(&table->mem_root,
1769
ALIGN_SIZE(ref_length)*2)))
254
1772
error=HA_ERR_OUT_OF_MEM;
257
1775
dup_ref=ref+ALIGN_SIZE(ref_length);
1776
cached_table_flags= table_flags();
1782
One has to use this method to find a
1783
random position by record as the plain
1784
position() call doesn't work for some
1785
handlers for random position
1788
int handler::rnd_pos_by_record(unsigned char *record)
1793
if (inited && (error= ha_index_end()))
1795
if ((error= ha_rnd_init(false)))
1798
return(rnd_pos(record, ref));
1149
3625
while ((result == HA_ERR_END_OF_FILE) && !range_res);
1151
3627
*range_info= mrr_cur_range.ptr;
3632
/* **************************************************************************
3633
* DS-MRR implementation
3634
***************************************************************************/
3637
DS-MRR: Initialize and start MRR scan
3639
Initialize and start the MRR scan. Depending on the mode parameter, this
3640
may use default or DS-MRR implementation.
3642
@param h Table handler to be used
3643
@param key Index to be used
3644
@param seq_funcs Interval sequence enumeration functions
3645
@param seq_init_param Interval sequence enumeration parameter
3646
@param n_ranges Number of ranges in the sequence.
3647
@param mode HA_MRR_* modes to use
3648
@param buf INOUT Buffer to use
3650
@retval 0 Ok, Scan started.
3654
int DsMrr_impl::dsmrr_init(handler *h, KEY *key,
3655
RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3656
uint32_t n_ranges, uint32_t mode, HANDLER_BUFFER *buf)
3660
Item *pushed_cond= NULL;
3662
keyno= h->active_index;
3664
if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3666
use_default_impl= true;
3667
return(h->handler::multi_range_read_init(seq_funcs, seq_init_param,
3668
n_ranges, mode, buf));
3670
rowids_buf= buf->buffer;
3671
//psergey-todo: don't add key_length as it is not needed anymore
3672
rowids_buf += key->key_length + h->ref_length;
3674
is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3675
rowids_buf_end= buf->buffer_end;
3677
elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3678
rowids_buf_last= rowids_buf +
3679
((rowids_buf_end - rowids_buf)/ elem_size)*
3681
rowids_buf_end= rowids_buf_last;
3683
/* Create a separate handler object to do rndpos() calls. */
3684
THD *thd= current_thd;
3685
if (!(new_h2= h->clone(thd->mem_root)) ||
3686
new_h2->ha_external_lock(thd, F_RDLCK))
3692
if (keyno == h->pushed_idx_cond_keyno)
3693
pushed_cond= h->pushed_idx_cond;
3694
if (h->ha_index_end())
3701
table->prepare_for_position();
3702
new_h2->extra(HA_EXTRA_KEYREAD);
3704
if (h2->ha_index_init(keyno, false) ||
3705
h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3708
use_default_impl= false;
3711
h2->idx_cond_push(keyno, pushed_cond);
3712
if (dsmrr_fill_buffer(new_h2))
3716
If the above call has scanned through all intervals in *seq, then
3717
adjust *buf to indicate that the remaining buffer space will not be used.
3720
buf->end_of_used_area= rowids_buf_last;
3722
if (h->ha_rnd_init(false))
3727
h2->ha_index_or_rnd_end();
3728
h2->ha_external_lock(thd, F_UNLCK);
3735
void DsMrr_impl::dsmrr_close()
3739
h2->ha_external_lock(current_thd, F_UNLCK);
3744
use_default_impl= true;
3749
static int rowid_cmp(void *h, unsigned char *a, unsigned char *b)
3751
return ((handler*)h)->cmp_ref(a, b);
3756
DS-MRR: Fill the buffer with rowids and sort it by rowid
3758
{This is an internal function of DiskSweep MRR implementation}
3759
Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into
3760
buffer. When the buffer is full or scan is completed, sort the buffer by
3763
The function assumes that rowids buffer is empty when it is invoked.
3765
@param h Table handler
3767
@retval 0 OK, the next portion of rowids is in the buffer,
3772
int DsMrr_impl::dsmrr_fill_buffer(handler *unused __attribute__((unused)))
3777
rowids_buf_cur= rowids_buf;
3778
while ((rowids_buf_cur < rowids_buf_end) &&
3779
!(res= h2->handler::multi_range_read_next(&range_info)))
3781
/* Put rowid, or {rowid, range_id} pair into the buffer */
3782
h2->position(table->record[0]);
3783
memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
3784
rowids_buf_cur += h->ref_length;
3788
memcpy(rowids_buf_cur, &range_info, sizeof(void*));
3789
rowids_buf_cur += sizeof(void*);
3793
if (res && res != HA_ERR_END_OF_FILE)
3795
dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
3797
/* Sort the buffer contents by rowid */
3798
uint32_t elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3799
uint32_t n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
3801
my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
3803
rowids_buf_last= rowids_buf_cur;
3804
rowids_buf_cur= rowids_buf;
3810
DS-MRR implementation: multi_range_read_next() function
3813
int DsMrr_impl::dsmrr_next(handler *h, char **range_info)
3817
if (use_default_impl)
3818
return h->handler::multi_range_read_next(range_info);
3820
if (rowids_buf_cur == rowids_buf_last)
3824
res= HA_ERR_END_OF_FILE;
3827
res= dsmrr_fill_buffer(h);
3832
/* Return EOF if there are no rowids in the buffer after re-fill attempt */
3833
if (rowids_buf_cur == rowids_buf_last)
3835
res= HA_ERR_END_OF_FILE;
3839
res= h->rnd_pos(table->record[0], rowids_buf_cur);
3840
rowids_buf_cur += h->ref_length;
3843
memcpy(range_info, rowids_buf_cur, sizeof(void*));
3844
rowids_buf_cur += sizeof(void*);
3855
DS-MRR implementation: multi_range_read_info() function
3857
int DsMrr_impl::dsmrr_info(uint32_t keyno, uint32_t n_ranges, uint32_t rows, uint32_t *bufsz,
3858
uint32_t *flags, COST_VECT *cost)
3861
uint32_t def_flags= *flags;
3862
uint32_t def_bufsz= *bufsz;
3864
/* Get cost/flags/mem_usage of default MRR implementation */
3865
res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
3869
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3870
choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
3872
/* Default implementation is chosen */
3881
DS-MRR Implementation: multi_range_read_info_const() function
3884
ha_rows DsMrr_impl::dsmrr_info_const(uint32_t keyno, RANGE_SEQ_IF *seq,
3885
void *seq_init_param, uint32_t n_ranges,
3886
uint32_t *bufsz, uint32_t *flags, COST_VECT *cost)
3889
uint32_t def_flags= *flags;
3890
uint32_t def_bufsz= *bufsz;
3891
/* Get cost/flags/mem_usage of default MRR implementation */
3892
rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
3893
n_ranges, &def_bufsz,
3895
if (rows == HA_POS_ERROR)
3897
/* Default implementation can't perform MRR scan => we can't either */
3902
If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
3903
use the default MRR implementation (we need it for UPDATE/DELETE).
3904
Otherwise, make a choice based on cost and @@optimizer_use_mrr.
3906
if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
3907
choose_mrr_impl(keyno, rows, flags, bufsz, cost))
3914
*flags &= ~HA_MRR_USE_DEFAULT_IMPL;
3921
Check if key has partially-covered columns
3923
We can't use DS-MRR to perform range scans when the ranges are over
3924
partially-covered keys, because we'll not have full key part values
3925
(we'll have their prefixes from the index) and will not be able to check
3926
if we've reached the end of the range.
3928
@param keyno Key to check
3931
Allow use of DS-MRR in cases where the index has partially-covered
3932
components but they are not used for scanning.
3938
bool DsMrr_impl::key_uses_partial_cols(uint32_t keyno)
3940
KEY_PART_INFO *kp= table->key_info[keyno].key_part;
3941
KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
3942
for (; kp != kp_end; kp++)
3944
if (!kp->field->part_of_key.is_set(keyno))
3952
DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
3954
Make the choice between using Default MRR implementation and DS-MRR.
3955
This function contains common functionality factored out of dsmrr_info()
3956
and dsmrr_info_const(). The function assumes that the default MRR
3957
implementation's applicability requirements are satisfied.
3959
@param keyno Index number
3960
@param rows E(full rows to be retrieved)
3961
@param flags IN MRR flags provided by the MRR user
3962
OUT If DS-MRR is chosen, flags of DS-MRR implementation
3963
else the value is not modified
3964
@param bufsz IN If DS-MRR is chosen, buffer use of DS-MRR implementation
3965
else the value is not modified
3966
@param cost IN Cost of default MRR implementation
3967
OUT If DS-MRR is chosen, cost of DS-MRR scan
3968
else the value is not modified
3970
@retval true Default MRR implementation should be used
3971
@retval false DS-MRR implementation should be used
3974
bool DsMrr_impl::choose_mrr_impl(uint32_t keyno, ha_rows rows, uint32_t *flags,
3975
uint32_t *bufsz, COST_VECT *cost)
3977
COST_VECT dsmrr_cost;
3979
THD *thd= current_thd;
3980
if ((thd->variables.optimizer_use_mrr == 2) ||
3981
(*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
3982
(keyno == table->s->primary_key &&
3983
h->primary_key_is_clustered()) ||
3984
key_uses_partial_cols(keyno))
3986
/* Use the default implementation */
3987
*flags |= HA_MRR_USE_DEFAULT_IMPL;
3991
uint32_t add_len= table->key_info[keyno].key_length + h->ref_length;
3993
if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
3999
If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
4000
DS-MRR and Default implementations cost. This allows one to force use of
4001
DS-MRR whenever it is applicable without affecting other cost-based choices.
4004
if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) &&
4005
dsmrr_cost.total_cost() > cost->total_cost())
4008
if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
4010
*flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */
4011
*flags &= ~HA_MRR_SORTED; /* We will return unordered output */
4017
/* Use the default MRR implementation */
4024
static void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost);
4028
Get cost of DS-MRR scan
4030
@param keynr Index to be used
4031
@param rows E(Number of rows to be scanned)
4032
@param flags Scan parameters (HA_MRR_* flags)
4033
@param buffer_size INOUT Buffer size
4034
@param cost OUT The cost
4037
@retval true Error, DS-MRR cannot be used (the buffer is too small for even 1 rowid)
4041
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint32_t keynr, ha_rows rows, uint32_t flags,
4042
uint32_t *buffer_size, COST_VECT *cost)
4044
uint32_t max_buff_entries, elem_size;
4045
ha_rows rows_in_full_step, rows_in_last_step;
4046
uint32_t n_full_steps;
4047
double index_read_cost;
4049
elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
4050
max_buff_entries = *buffer_size / elem_size;
4052
if (!max_buff_entries)
4053
return true; /* Buffer has not enough space for even 1 rowid */
4055
/* Number of iterations we'll make with full buffer */
4056
n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
4059
Get numbers of rows we'll be processing in
4060
- non-last sweep, with full buffer
4061
- last iteration, with non-full buffer
4063
rows_in_full_step= max_buff_entries;
4064
rows_in_last_step= rows % max_buff_entries;
4066
/* Adjust buffer size if we expect to use only part of the buffer */
4069
get_sort_and_sweep_cost(table, rows, cost);
4070
cost->multiply(n_full_steps);
4075
*buffer_size= cmax((ulong)*buffer_size,
4076
(size_t)(1.2*rows_in_last_step) * elem_size +
4077
h->ref_length + table->key_info[keynr].key_length);
4080
COST_VECT last_step_cost;
4081
get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
4082
cost->add(&last_step_cost);
4084
if (n_full_steps != 0)
4085
cost->mem_cost= *buffer_size;
4087
cost->mem_cost= (double)rows_in_last_step * elem_size;
4089
/* Total cost of all index accesses */
4090
index_read_cost= h->index_only_read_time(keynr, (double)rows);
4091
cost->add_io(index_read_cost, 1 /* Random seeks */);
4097
Get cost of one sort-and-sweep step
4100
get_sort_and_sweep_cost()
4101
table Table being accessed
4102
nrows Number of rows to be sorted and retrieved
4106
Get cost of these operations:
4107
- sort an array of #nrows ROWIDs using qsort
4108
- read #nrows records from table in a sweep.
4112
void get_sort_and_sweep_cost(Table *table, ha_rows nrows, COST_VECT *cost)
4116
get_sweep_read_cost(table, nrows, false, cost);
4117
/* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
4118
double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
4121
cost->cpu_cost += cmp_op * log2(cmp_op);
4129
Get cost of reading nrows table records in a "disk sweep"
4131
A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made
4132
for an ordered sequence of rowids.
4134
We assume hard disk IO. The read is performed as follows:
4136
1. The disk head is moved to the needed cylinder
4137
2. The controller waits for the plate to rotate
4138
3. The data is transferred
4140
Time to do #3 is insignificant compared to #2+#1.
4142
Time to move the disk head is proportional to head travel distance.
4144
Time to wait for the plate to rotate depends on whether the disk head was moved or not.
4147
If disk head wasn't moved, the wait time is proportional to distance
4148
between the previous block and the block we're reading.
4150
If the head was moved, we don't know how much we'll need to wait for the
4151
plate to rotate. We assume the wait time to be a variate with a mean of
4152
0.5 of full rotation time.
4154
Our cost units are "random disk seeks". The cost of random disk seek is
4155
actually not a constant, it depends on the range of cylinders we're going
4156
to access. We make it constant by introducing a fuzzy concept of "typical
4157
datafile length" (it's fuzzy as it's hard to tell whether it should
4158
include the index file, temporary tables, etc.). Then random seek cost is:
4160
1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
4162
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
4164
@param table Table to be accessed
4165
@param nrows Number of rows to retrieve
4166
@param interrupted true <=> Assume that the disk sweep will be
4167
interrupted by other disk IO. false - otherwise.
4168
@param cost OUT The cost.
4171
void get_sweep_read_cost(Table *table, ha_rows nrows, bool interrupted,
4175
if (table->file->primary_key_is_clustered())
4177
cost->io_count= table->file->read_time(table->s->primary_key,
4178
(uint) nrows, nrows);
4183
ceil(uint64_t2double(table->file->stats.data_file_length) / IO_SIZE);
4185
n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
4186
if (busy_blocks < 1.0)
4189
cost->io_count= busy_blocks;
4193
/* Assume reading is done in one 'sweep' */
4194
cost->avg_io_cost= (DISK_SEEK_BASE_COST +
4195
DISK_SEEK_PROP_COST*n_blocks/busy_blocks);