1208.3.3
by brian
Adding missingfile. |
1 |
/* -*- mode: c++; c-basic-offset: 2; indent-tabs-mode: nil; -*-
|
2 |
* vim:expandtab:shiftwidth=2:tabstop=2:smarttab:
|
|
3 |
*
|
|
4 |
* Copyright (C) 2008 Sun Microsystems
|
|
5 |
*
|
|
6 |
* This program is free software; you can redistribute it and/or modify
|
|
7 |
* it under the terms of the GNU General Public License as published by
|
|
8 |
* the Free Software Foundation; version 2 of the License.
|
|
9 |
*
|
|
10 |
* This program is distributed in the hope that it will be useful,
|
|
11 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 |
* GNU General Public License for more details.
|
|
14 |
*
|
|
15 |
* You should have received a copy of the GNU General Public License
|
|
16 |
* along with this program; if not, write to the Free Software
|
|
17 |
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
18 |
*/
|
|
19 |
||
20 |
/**
|
|
21 |
@file Cursor.cc
|
|
22 |
||
23 |
Handler-calling-functions
|
|
24 |
*/
|
|
25 |
||
26 |
#include "drizzled/server_includes.h" |
|
27 |
#include "mysys/hash.h" |
|
28 |
#include "drizzled/error.h" |
|
29 |
#include "drizzled/gettext.h" |
|
30 |
#include "drizzled/probes.h" |
|
31 |
#include "drizzled/sql_parse.h" |
|
32 |
#include "drizzled/cost_vect.h" |
|
33 |
#include "drizzled/session.h" |
|
34 |
#include "drizzled/sql_base.h" |
|
35 |
#include "drizzled/replication_services.h" |
|
36 |
#include "drizzled/lock.h" |
|
37 |
#include "drizzled/item/int.h" |
|
38 |
#include "drizzled/item/empty_string.h" |
|
39 |
#include "drizzled/unireg.h" // for mysql_frm_type |
|
40 |
#include "drizzled/field/timestamp.h" |
|
41 |
#include "drizzled/message/table.pb.h" |
|
42 |
#include "drizzled/plugin/client.h" |
|
43 |
||
44 |
using namespace std; |
|
45 |
using namespace drizzled; |
|
46 |
||
47 |
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NULL,0} }; |
|
48 |
||
49 |
/* number of entries in storage_engines[] */
|
|
50 |
uint32_t total_ha= 0; |
|
51 |
/* number of storage engines (from storage_engines[]) that support 2pc */
|
|
52 |
uint32_t total_ha_2pc= 0; |
|
53 |
/* size of savepoint storage area (see ha_init) */
|
|
54 |
uint32_t savepoint_alloc_size= 0; |
|
55 |
||
56 |
const char *ha_row_type[] = { |
|
57 |
"", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?" |
|
58 |
};
|
|
59 |
||
60 |
const char *tx_isolation_names[] = |
|
61 |
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE", |
|
62 |
NULL}; |
|
63 |
||
64 |
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"", |
|
65 |
tx_isolation_names, NULL}; |
|
66 |
||
67 |
||
68 |
/**
|
|
69 |
Register Cursor error messages for use with my_error().
|
|
70 |
||
71 |
@retval
|
|
72 |
0 OK
|
|
73 |
@retval
|
|
74 |
!=0 Error
|
|
75 |
*/
|
|
76 |
||
77 |
int ha_init_errors(void) |
|
78 |
{
|
|
79 |
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
|
|
80 |
const char **errmsgs; |
|
81 |
||
82 |
/* Allocate a pointer array for the error message strings. */
|
|
83 |
/* Zerofill it to avoid uninitialized gaps. */
|
|
84 |
if (! (errmsgs= (const char**) malloc(HA_ERR_ERRORS * sizeof(char*)))) |
|
85 |
return 1; |
|
86 |
memset(errmsgs, 0, HA_ERR_ERRORS * sizeof(char *)); |
|
87 |
||
88 |
/* Set the dedicated error messages. */
|
|
89 |
SETMSG(HA_ERR_KEY_NOT_FOUND, ER(ER_KEY_NOT_FOUND)); |
|
90 |
SETMSG(HA_ERR_FOUND_DUPP_KEY, ER(ER_DUP_KEY)); |
|
91 |
SETMSG(HA_ERR_RECORD_CHANGED, "Update wich is recoverable"); |
|
92 |
SETMSG(HA_ERR_WRONG_INDEX, "Wrong index given to function"); |
|
93 |
SETMSG(HA_ERR_CRASHED, ER(ER_NOT_KEYFILE)); |
|
94 |
SETMSG(HA_ERR_WRONG_IN_RECORD, ER(ER_CRASHED_ON_USAGE)); |
|
95 |
SETMSG(HA_ERR_OUT_OF_MEM, "Table Cursor out of memory"); |
|
96 |
SETMSG(HA_ERR_NOT_A_TABLE, "Incorrect file format '%.64s'"); |
|
97 |
SETMSG(HA_ERR_WRONG_COMMAND, "Command not supported"); |
|
98 |
SETMSG(HA_ERR_OLD_FILE, ER(ER_OLD_KEYFILE)); |
|
99 |
SETMSG(HA_ERR_NO_ACTIVE_RECORD, "No record read in update"); |
|
100 |
SETMSG(HA_ERR_RECORD_DELETED, "Intern record deleted"); |
|
101 |
SETMSG(HA_ERR_RECORD_FILE_FULL, ER(ER_RECORD_FILE_FULL)); |
|
102 |
SETMSG(HA_ERR_INDEX_FILE_FULL, "No more room in index file '%.64s'"); |
|
103 |
SETMSG(HA_ERR_END_OF_FILE, "End in next/prev/first/last"); |
|
104 |
SETMSG(HA_ERR_UNSUPPORTED, ER(ER_ILLEGAL_HA)); |
|
105 |
SETMSG(HA_ERR_TO_BIG_ROW, "Too big row"); |
|
106 |
SETMSG(HA_WRONG_CREATE_OPTION, "Wrong create option"); |
|
107 |
SETMSG(HA_ERR_FOUND_DUPP_UNIQUE, ER(ER_DUP_UNIQUE)); |
|
108 |
SETMSG(HA_ERR_UNKNOWN_CHARSET, "Can't open charset"); |
|
109 |
SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF, ER(ER_WRONG_MRG_TABLE)); |
|
110 |
SETMSG(HA_ERR_CRASHED_ON_REPAIR, ER(ER_CRASHED_ON_REPAIR)); |
|
111 |
SETMSG(HA_ERR_CRASHED_ON_USAGE, ER(ER_CRASHED_ON_USAGE)); |
|
112 |
SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT, ER(ER_LOCK_WAIT_TIMEOUT)); |
|
113 |
SETMSG(HA_ERR_LOCK_TABLE_FULL, ER(ER_LOCK_TABLE_FULL)); |
|
114 |
SETMSG(HA_ERR_READ_ONLY_TRANSACTION, ER(ER_READ_ONLY_TRANSACTION)); |
|
115 |
SETMSG(HA_ERR_LOCK_DEADLOCK, ER(ER_LOCK_DEADLOCK)); |
|
116 |
SETMSG(HA_ERR_CANNOT_ADD_FOREIGN, ER(ER_CANNOT_ADD_FOREIGN)); |
|
117 |
SETMSG(HA_ERR_NO_REFERENCED_ROW, ER(ER_NO_REFERENCED_ROW_2)); |
|
118 |
SETMSG(HA_ERR_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED_2)); |
|
119 |
SETMSG(HA_ERR_NO_SAVEPOINT, "No savepoint with that name"); |
|
120 |
SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE, "Non unique key block size"); |
|
121 |
SETMSG(HA_ERR_NO_SUCH_TABLE, "No such table: '%.64s'"); |
|
122 |
SETMSG(HA_ERR_TABLE_EXIST, ER(ER_TABLE_EXISTS_ERROR)); |
|
123 |
SETMSG(HA_ERR_NO_CONNECTION, "Could not connect to storage engine"); |
|
124 |
SETMSG(HA_ERR_TABLE_DEF_CHANGED, ER(ER_TABLE_DEF_CHANGED)); |
|
125 |
SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY, "FK constraint would lead to duplicate key"); |
|
126 |
SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE, ER(ER_TABLE_NEEDS_UPGRADE)); |
|
127 |
SETMSG(HA_ERR_TABLE_READONLY, ER(ER_OPEN_AS_READONLY)); |
|
128 |
SETMSG(HA_ERR_AUTOINC_READ_FAILED, ER(ER_AUTOINC_READ_FAILED)); |
|
129 |
SETMSG(HA_ERR_AUTOINC_ERANGE, ER(ER_WARN_DATA_OUT_OF_RANGE)); |
|
130 |
||
131 |
/* Register the error messages for use with my_error(). */
|
|
132 |
return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST); |
|
133 |
}
|
|
134 |
||
135 |
||
136 |
/**
|
|
137 |
Unregister Cursor error messages.
|
|
138 |
||
139 |
@retval
|
|
140 |
0 OK
|
|
141 |
@retval
|
|
142 |
!=0 Error
|
|
143 |
*/
|
|
144 |
static int ha_finish_errors(void) |
|
145 |
{
|
|
146 |
const char **errmsgs; |
|
147 |
||
148 |
/* Allocate a pointer array for the error message strings. */
|
|
149 |
if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST))) |
|
150 |
return 1; |
|
151 |
free((unsigned char*) errmsgs); |
|
152 |
return 0; |
|
153 |
}
|
|
154 |
||
155 |
int ha_init() |
|
156 |
{
|
|
157 |
int error= 0; |
|
158 |
||
159 |
assert(total_ha < MAX_HA); |
|
160 |
/*
|
|
161 |
Check if there is a transaction-capable storage engine besides the
|
|
162 |
binary log (which is considered a transaction-capable storage engine in
|
|
163 |
counting total_ha)
|
|
164 |
*/
|
|
165 |
savepoint_alloc_size+= sizeof(SAVEPOINT); |
|
166 |
return error; |
|
167 |
}
|
|
168 |
||
169 |
int ha_end() |
|
170 |
{
|
|
171 |
int error= 0; |
|
172 |
||
173 |
/*
|
|
174 |
This should be eventualy based on the graceful shutdown flag.
|
|
175 |
So if flag is equal to HA_PANIC_CLOSE, the deallocate
|
|
176 |
the errors.
|
|
177 |
*/
|
|
178 |
if (ha_finish_errors()) |
|
179 |
error= 1; |
|
180 |
||
181 |
return error; |
|
182 |
}
|
|
183 |
||
184 |
||
185 |
||
186 |
/* ========================================================================
|
|
187 |
======================= TRANSACTIONS ===================================*/
|
|
188 |
||
189 |
/**
|
|
190 |
Transaction handling in the server
|
|
191 |
==================================
|
|
192 |
||
193 |
In each client connection, MySQL maintains two transactional
|
|
194 |
states:
|
|
195 |
- a statement transaction,
|
|
196 |
- a standard, also called normal transaction.
|
|
197 |
||
198 |
Historical note
|
|
199 |
---------------
|
|
200 |
"Statement transaction" is a non-standard term that comes
|
|
201 |
from the times when MySQL supported BerkeleyDB storage engine.
|
|
202 |
||
203 |
First of all, it should be said that in BerkeleyDB auto-commit
|
|
204 |
mode auto-commits operations that are atomic to the storage
|
|
205 |
engine itself, such as a write of a record, and are too
|
|
206 |
high-granular to be atomic from the application perspective
|
|
207 |
(MySQL). One SQL statement could involve many BerkeleyDB
|
|
208 |
auto-committed operations and thus BerkeleyDB auto-commit was of
|
|
209 |
little use to MySQL.
|
|
210 |
||
211 |
Secondly, instead of SQL standard savepoints, BerkeleyDB
|
|
212 |
provided the concept of "nested transactions". In a nutshell,
|
|
213 |
transactions could be arbitrarily nested, but when the parent
|
|
214 |
transaction was committed or aborted, all its child (nested)
|
|
215 |
transactions were committed or aborted as well.
|
|
216 |
Commit of a nested transaction, in turn, made its changes
|
|
217 |
visible, but not durable: it destroyed the nested transaction,
|
|
218 |
all its changes would become available to the parent and
|
|
219 |
currently active nested transactions of this parent.
|
|
220 |
||
221 |
So the mechanism of nested transactions was employed to
|
|
222 |
provide "all or nothing" guarantee of SQL statements
|
|
223 |
required by the standard.
|
|
224 |
A nested transaction would be created at start of each SQL
|
|
225 |
statement, and destroyed (committed or aborted) at statement
|
|
226 |
end. Such nested transaction was internally referred to as
|
|
227 |
a "statement transaction" and gave birth to the term.
|
|
228 |
||
229 |
<Historical note ends>
|
|
230 |
||
231 |
Since then a statement transaction is started for each statement
|
|
232 |
that accesses transactional tables or uses the binary log. If
|
|
233 |
the statement succeeds, the statement transaction is committed.
|
|
234 |
If the statement fails, the transaction is rolled back. Commits
|
|
235 |
of statement transactions are not durable -- each such
|
|
236 |
transaction is nested in the normal transaction, and if the
|
|
237 |
normal transaction is rolled back, the effects of all enclosed
|
|
238 |
statement transactions are undone as well. Technically,
|
|
239 |
a statement transaction can be viewed as a savepoint which is
|
|
240 |
maintained automatically in order to make effects of one
|
|
241 |
statement atomic.
|
|
242 |
||
243 |
The normal transaction is started by the user and is ended
|
|
244 |
usually upon a user request as well. The normal transaction
|
|
245 |
encloses transactions of all statements issued between
|
|
246 |
its beginning and its end.
|
|
247 |
In autocommit mode, the normal transaction is equivalent
|
|
248 |
to the statement transaction.
|
|
249 |
||
250 |
Since MySQL supports PSEA (pluggable storage engine
|
|
251 |
architecture), more than one transactional engine can be
|
|
252 |
active at a time. Hence transactions, from the server
|
|
253 |
point of view, are always distributed. In particular,
|
|
254 |
transactional state is maintained independently for each
|
|
255 |
engine. In order to commit a transaction the two phase
|
|
256 |
commit protocol is employed.
|
|
257 |
||
258 |
Not all statements are executed in context of a transaction.
|
|
259 |
Administrative and status information statements do not modify
|
|
260 |
engine data, and thus do not start a statement transaction and
|
|
261 |
also have no effect on the normal transaction. Examples of such
|
|
262 |
statements are SHOW STATUS and RESET SLAVE.
|
|
263 |
||
264 |
Similarly DDL statements are not transactional,
|
|
265 |
and therefore a transaction is [almost] never started for a DDL
|
|
266 |
statement. The difference between a DDL statement and a purely
|
|
267 |
administrative statement though is that a DDL statement always
|
|
268 |
commits the current transaction before proceeding, if there is
|
|
269 |
any.
|
|
270 |
||
271 |
At last, SQL statements that work with non-transactional
|
|
272 |
engines also have no effect on the transaction state of the
|
|
273 |
connection. Even though they are written to the binary log,
|
|
274 |
and the binary log is, overall, transactional, the writes
|
|
275 |
are done in "write-through" mode, directly to the binlog
|
|
276 |
file, followed by an OS cache sync, in other words,
|
|
277 |
bypassing the binlog undo log (translog).
|
|
278 |
They do not commit the current normal transaction.
|
|
279 |
A failure of a statement that uses non-transactional tables
|
|
280 |
would cause a rollback of the statement transaction, but
|
|
281 |
in case no transactional tables are used,
|
|
282 |
no statement transaction is started.
|
|
283 |
||
284 |
Data layout
|
|
285 |
-----------
|
|
286 |
||
287 |
The server stores its transaction-related data in
|
|
288 |
session->transaction. This structure has two members of type
|
|
289 |
Session_TRANS. These members correspond to the statement and
|
|
290 |
normal transactions respectively:
|
|
291 |
||
292 |
- session->transaction.stmt contains a list of engines
|
|
293 |
that are participating in the given statement
|
|
294 |
- session->transaction.all contains a list of engines that
|
|
295 |
have participated in any of the statement transactions started
|
|
296 |
within the context of the normal transaction.
|
|
297 |
Each element of the list contains a pointer to the storage
|
|
298 |
engine, engine-specific transactional data, and engine-specific
|
|
299 |
transaction flags.
|
|
300 |
||
301 |
In autocommit mode session->transaction.all is empty.
|
|
302 |
Instead, data of session->transaction.stmt is
|
|
303 |
used to commit/rollback the normal transaction.
|
|
304 |
||
305 |
The list of registered engines has a few important properties:
|
|
306 |
- no engine is registered in the list twice
|
|
307 |
- engines are present in the list in reverse temporal order --
|
|
308 |
new participants are always added to the beginning of the list.
|
|
309 |
||
310 |
Transaction life cycle
|
|
311 |
----------------------
|
|
312 |
||
313 |
When a new connection is established, session->transaction
|
|
314 |
members are initialized to an empty state.
|
|
315 |
If a statement uses any tables, all affected engines
|
|
316 |
are registered in the statement engine list. In
|
|
317 |
non-autocommit mode, the same engines are registered in
|
|
318 |
the normal transaction list.
|
|
319 |
At the end of the statement, the server issues a commit
|
|
320 |
or a roll back for all engines in the statement list.
|
|
321 |
At this point transaction flags of an engine, if any, are
|
|
322 |
propagated from the statement list to the list of the normal
|
|
323 |
transaction.
|
|
324 |
When commit/rollback is finished, the statement list is
|
|
325 |
cleared. It will be filled in again by the next statement,
|
|
326 |
and emptied again at the next statement's end.
|
|
327 |
||
328 |
The normal transaction is committed in a similar way
|
|
329 |
(by going over all engines in session->transaction.all list)
|
|
330 |
but at different times:
|
|
331 |
- when a COMMIT SQL statement is issued by the user
|
|
332 |
- implicitly, by the server, at the beginning of a DDL statement
|
|
333 |
or SET AUTOCOMMIT={0|1} statement.
|
|
334 |
||
335 |
The normal transaction can be rolled back as well:
|
|
336 |
- if the user has requested so, by issuing ROLLBACK SQL
|
|
337 |
statement
|
|
338 |
- if one of the storage engines requested a rollback
|
|
339 |
by setting session->transaction_rollback_request. This may
|
|
340 |
happen in case, e.g., when the transaction in the engine was
|
|
341 |
chosen as a victim of the internal deadlock resolution algorithm
|
|
342 |
and rolled back internally. When such a situation happens, there
|
|
343 |
is little the server can do and the only option is to rollback
|
|
344 |
transactions in all other participating engines. In this case
|
|
345 |
the rollback is accompanied by an error sent to the user.
|
|
346 |
||
347 |
As follows from the use cases above, the normal transaction
|
|
348 |
is never committed when there is an outstanding statement
|
|
349 |
transaction. In most cases there is no conflict, since
|
|
350 |
commits of the normal transaction are issued by a stand-alone
|
|
351 |
administrative or DDL statement, thus no outstanding statement
|
|
352 |
transaction of the previous statement exists. Besides,
|
|
353 |
all statements that manipulate the normal transaction
|
|
354 |
are prohibited in stored functions and triggers, therefore
|
|
355 |
no conflicting situation can occur in a sub-statement either.
|
|
356 |
The remaining rare cases when the server explicitly has
|
|
357 |
to commit the statement transaction prior to committing the normal
|
|
358 |
one cover error-handling scenarios (see for example
|
|
359 |
?).
|
|
360 |
||
361 |
When committing a statement or a normal transaction, the server
|
|
362 |
either uses the two-phase commit protocol, or issues a commit
|
|
363 |
in each engine independently. The two-phase commit protocol
|
|
364 |
is used only if:
|
|
365 |
- all participating engines support two-phase commit (provide
|
|
366 |
plugin::StorageEngine::prepare PSEA API call) and
|
|
367 |
- transactions in at least two engines modify data (i.e. are
|
|
368 |
not read-only).
|
|
369 |
||
370 |
Note that the two phase commit is used for
|
|
371 |
statement transactions, even though they are not durable anyway.
|
|
372 |
This is done to ensure logical consistency of data in a multiple-
|
|
373 |
engine transaction.
|
|
374 |
For example, imagine that some day MySQL supports unique
|
|
375 |
constraint checks deferred till the end of statement. In such
|
|
376 |
case a commit in one of the engines may yield ER_DUP_KEY,
|
|
377 |
and MySQL should be able to gracefully abort statement
|
|
378 |
transactions of other participants.
|
|
379 |
||
380 |
After the normal transaction has been committed,
|
|
381 |
session->transaction.all list is cleared.
|
|
382 |
||
383 |
When a connection is closed, the current normal transaction, if
|
|
384 |
any, is rolled back.
|
|
385 |
||
386 |
Roles and responsibilities
|
|
387 |
--------------------------
|
|
388 |
||
389 |
The server has no way to know that an engine participates in
|
|
390 |
the statement and a transaction has been started
|
|
391 |
in it unless the engine says so. Thus, in order to be
|
|
392 |
a part of a transaction, the engine must "register" itself.
|
|
393 |
This is done by invoking trans_register_ha() server call.
|
|
394 |
Normally the engine registers itself whenever Cursor::external_lock()
|
|
395 |
is called. trans_register_ha() can be invoked many times: if
|
|
396 |
an engine is already registered, the call does nothing.
|
|
397 |
In case autocommit is not set, the engine must register itself
|
|
398 |
twice -- both in the statement list and in the normal transaction
|
|
399 |
list.
|
|
400 |
In which list to register is a parameter of trans_register_ha().
|
|
401 |
||
402 |
Note, that although the registration interface in itself is
|
|
403 |
fairly clear, the current usage practice often leads to undesired
|
|
404 |
effects. E.g. since a call to trans_register_ha() in most engines
|
|
405 |
is embedded into implementation of Cursor::external_lock(), some
|
|
406 |
DDL statements start a transaction (at least from the server
|
|
407 |
point of view) even though they are not expected to. E.g.
|
|
408 |
CREATE TABLE does not start a transaction, since
|
|
409 |
Cursor::external_lock() is never called during CREATE TABLE. But
|
|
410 |
CREATE TABLE ... SELECT does, since Cursor::external_lock() is
|
|
411 |
called for the table that is being selected from. This has no
|
|
412 |
practical effects currently, but must be kept in mind
|
|
413 |
nevertheless.
|
|
414 |
||
415 |
Once an engine is registered, the server will do the rest
|
|
416 |
of the work.
|
|
417 |
||
418 |
During statement execution, whenever any of data-modifying
|
|
419 |
PSEA API methods is used, e.g. Cursor::write_row() or
|
|
420 |
Cursor::update_row(), the read-write flag is raised in the
|
|
421 |
statement transaction for the involved engine.
|
|
422 |
Currently all PSEA calls are "traced", and the data can not be
|
|
423 |
changed in a way other than issuing a PSEA call. Important:
|
|
424 |
unless this invariant is preserved the server will not know that
|
|
425 |
a transaction in a given engine is read-write and will not
|
|
426 |
involve the two-phase commit protocol!
|
|
427 |
||
428 |
At the end of a statement, server call
|
|
429 |
ha_autocommit_or_rollback() is invoked. This call in turn
|
|
430 |
invokes plugin::StorageEngine::prepare() for every involved engine.
|
|
431 |
Prepare is followed by a call to plugin::StorageEngine::commit_one_phase()
|
|
432 |
If a one-phase commit will suffice, plugin::StorageEngine::prepare() is not
|
|
433 |
invoked and the server only calls plugin::StorageEngine::commit_one_phase().
|
|
434 |
At statement commit, the statement-related read-write engine
|
|
435 |
flag is propagated to the corresponding flag in the normal
|
|
436 |
transaction. When the commit is complete, the list of registered
|
|
437 |
engines is cleared.
|
|
438 |
||
439 |
Rollback is handled in a similar fashion.
|
|
440 |
||
441 |
Additional notes on DDL and the normal transaction.
|
|
442 |
---------------------------------------------------
|
|
443 |
||
444 |
DDLs and operations with non-transactional engines
|
|
445 |
do not "register" in session->transaction lists, and thus do not
|
|
446 |
modify the transaction state. Besides, each DDL in
|
|
447 |
MySQL is prefixed with an implicit normal transaction commit
|
|
448 |
(a call to Session::endActiveTransaction()), and thus leaves nothing
|
|
449 |
to modify.
|
|
450 |
However, as it has been pointed out with CREATE TABLE .. SELECT,
|
|
451 |
some DDL statements can start a *new* transaction.
|
|
452 |
||
453 |
Behaviour of the server in this case is currently badly
|
|
454 |
defined.
|
|
455 |
DDL statements use a form of "semantic" logging
|
|
456 |
to maintain atomicity: if CREATE TABLE .. SELECT failed,
|
|
457 |
the newly created table is deleted.
|
|
458 |
In addition, some DDL statements issue interim transaction
|
|
459 |
commits: e.g. ALTER Table issues a commit after data is copied
|
|
460 |
from the original table to the internal temporary table. Other
|
|
461 |
statements, e.g. CREATE TABLE ... SELECT do not always commit
|
|
462 |
after itself.
|
|
463 |
And finally there is a group of DDL statements such as
|
|
464 |
RENAME/DROP Table that doesn't start a new transaction
|
|
465 |
and doesn't commit.
|
|
466 |
||
467 |
This diversity makes it hard to say what will happen if
|
|
468 |
by chance a stored function is invoked during a DDL --
|
|
469 |
whether any modifications it makes will be committed or not
|
|
470 |
is not clear. Fortunately, SQL grammar of few DDLs allows
|
|
471 |
invocation of a stored function.
|
|
472 |
||
473 |
A consistent behaviour is perhaps to always commit the normal
|
|
474 |
transaction after all DDLs, just like the statement transaction
|
|
475 |
is always committed at the end of all statements.
|
|
476 |
*/
|
|
477 |
||
478 |
/**
|
|
479 |
Register a storage engine for a transaction.
|
|
480 |
||
481 |
Every storage engine MUST call this function when it starts
|
|
482 |
a transaction or a statement (that is it must be called both for the
|
|
483 |
"beginning of transaction" and "beginning of statement").
|
|
484 |
Only storage engines registered for the transaction/statement
|
|
485 |
will know when to commit/rollback it.
|
|
486 |
||
487 |
@note
|
|
488 |
trans_register_ha is idempotent - storage engine may register many
|
|
489 |
times per transaction.
|
|
490 |
||
491 |
*/
|
|
492 |
void trans_register_ha(Session *session, bool all, plugin::StorageEngine *engine) |
|
493 |
{
|
|
494 |
Session_TRANS *trans; |
|
495 |
Ha_trx_info *ha_info; |
|
496 |
||
497 |
if (all) |
|
498 |
{
|
|
499 |
trans= &session->transaction.all; |
|
500 |
session->server_status|= SERVER_STATUS_IN_TRANS; |
|
501 |
}
|
|
502 |
else
|
|
503 |
trans= &session->transaction.stmt; |
|
504 |
||
505 |
ha_info= session->ha_data[engine->getSlot()].ha_info + static_cast<unsigned>(all); |
|
506 |
||
507 |
if (ha_info->is_started()) |
|
508 |
return; /* already registered, return */ |
|
509 |
||
510 |
ha_info->register_ha(trans, engine); |
|
511 |
||
512 |
trans->no_2pc|= not engine->has_2pc(); |
|
513 |
if (session->transaction.xid_state.xid.is_null()) |
|
514 |
session->transaction.xid_state.xid.set(session->query_id); |
|
515 |
}
|
|
516 |
||
517 |
/**
|
|
518 |
Check if we can skip the two-phase commit.
|
|
519 |
||
520 |
A helper function to evaluate if two-phase commit is mandatory.
|
|
521 |
As a side effect, propagates the read-only/read-write flags
|
|
522 |
of the statement transaction to its enclosing normal transaction.
|
|
523 |
||
524 |
@retval true we must run a two-phase commit. Returned
|
|
525 |
if we have at least two engines with read-write changes.
|
|
526 |
@retval false Don't need two-phase commit. Even if we have two
|
|
527 |
transactional engines, we can run two independent
|
|
528 |
commits if changes in one of the engines are read-only.
|
|
529 |
*/
|
|
530 |
||
531 |
static
|
|
532 |
bool
|
|
533 |
ha_check_and_coalesce_trx_read_only(Session *session, Ha_trx_info *ha_list, |
|
534 |
bool all) |
|
535 |
{
|
|
536 |
/* The number of storage engines that have actual changes. */
|
|
537 |
unsigned rw_ha_count= 0; |
|
538 |
Ha_trx_info *ha_info; |
|
539 |
||
540 |
for (ha_info= ha_list; ha_info; ha_info= ha_info->next()) |
|
541 |
{
|
|
542 |
if (ha_info->is_trx_read_write()) |
|
543 |
++rw_ha_count; |
|
544 |
||
545 |
if (! all) |
|
546 |
{
|
|
547 |
Ha_trx_info *ha_info_all= &session->ha_data[ha_info->engine()->getSlot()].ha_info[1]; |
|
548 |
assert(ha_info != ha_info_all); |
|
549 |
/*
|
|
550 |
Merge read-only/read-write information about statement
|
|
551 |
transaction to its enclosing normal transaction. Do this
|
|
552 |
only if in a real transaction -- that is, if we know
|
|
553 |
that ha_info_all is registered in session->transaction.all.
|
|
554 |
Since otherwise we only clutter the normal transaction flags.
|
|
555 |
*/
|
|
556 |
if (ha_info_all->is_started()) /* false if autocommit. */ |
|
557 |
ha_info_all->coalesce_trx_with(ha_info); |
|
558 |
}
|
|
559 |
else if (rw_ha_count > 1) |
|
560 |
{
|
|
561 |
/*
|
|
562 |
It is a normal transaction, so we don't need to merge read/write
|
|
563 |
information up, and the need for two-phase commit has been
|
|
564 |
already established. Break the loop prematurely.
|
|
565 |
*/
|
|
566 |
break; |
|
567 |
}
|
|
568 |
}
|
|
569 |
return rw_ha_count > 1; |
|
570 |
}
|
|
571 |
||
572 |
||
573 |
/**
|
|
574 |
@retval
|
|
575 |
0 ok
|
|
576 |
@retval
|
|
577 |
1 transaction was rolled back
|
|
578 |
@retval
|
|
579 |
2 error during commit, data may be inconsistent
|
|
580 |
||
581 |
@todo
|
|
582 |
Since we don't support nested statement transactions in 5.0,
|
|
583 |
we can't commit or rollback stmt transactions while we are inside
|
|
584 |
stored functions or triggers. So we simply do nothing now.
|
|
585 |
TODO: This should be fixed in later ( >= 5.1) releases.
|
|
586 |
*/
|
|
587 |
int ha_commit_trans(Session *session, bool all) |
|
588 |
{
|
|
589 |
int error= 0, cookie= 0; |
|
590 |
/*
|
|
591 |
'all' means that this is either an explicit commit issued by
|
|
592 |
user, or an implicit commit issued by a DDL.
|
|
593 |
*/
|
|
594 |
Session_TRANS *trans= all ? &session->transaction.all : &session->transaction.stmt; |
|
595 |
bool is_real_trans= all || session->transaction.all.ha_list == 0; |
|
596 |
Ha_trx_info *ha_info= trans->ha_list; |
|
597 |
||
598 |
/*
|
|
599 |
We must not commit the normal transaction if a statement
|
|
600 |
transaction is pending. Otherwise statement transaction
|
|
601 |
flags will not get propagated to its normal transaction's
|
|
602 |
counterpart.
|
|
603 |
*/
|
|
604 |
assert(session->transaction.stmt.ha_list == NULL || |
|
605 |
trans == &session->transaction.stmt); |
|
606 |
||
607 |
if (ha_info) |
|
608 |
{
|
|
609 |
bool must_2pc; |
|
610 |
||
611 |
if (is_real_trans && wait_if_global_read_lock(session, 0, 0)) |
|
612 |
{
|
|
613 |
ha_rollback_trans(session, all); |
|
614 |
return 1; |
|
615 |
}
|
|
616 |
||
617 |
must_2pc= ha_check_and_coalesce_trx_read_only(session, ha_info, all); |
|
618 |
||
619 |
if (!trans->no_2pc && must_2pc) |
|
620 |
{
|
|
621 |
for (; ha_info && !error; ha_info= ha_info->next()) |
|
622 |
{
|
|
623 |
int err; |
|
624 |
plugin::StorageEngine *engine= ha_info->engine(); |
|
625 |
/*
|
|
626 |
Do not call two-phase commit if this particular
|
|
627 |
transaction is read-only. This allows for simpler
|
|
628 |
implementation in engines that are always read-only.
|
|
629 |
*/
|
|
630 |
if (! ha_info->is_trx_read_write()) |
|
631 |
continue; |
|
632 |
/*
|
|
633 |
Sic: we know that prepare() is not NULL since otherwise
|
|
634 |
trans->no_2pc would have been set.
|
|
635 |
*/
|
|
636 |
if ((err= engine->prepare(session, all))) |
|
637 |
{
|
|
638 |
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err); |
|
639 |
error= 1; |
|
640 |
}
|
|
641 |
status_var_increment(session->status_var.ha_prepare_count); |
|
642 |
}
|
|
643 |
if (error) |
|
644 |
{
|
|
645 |
ha_rollback_trans(session, all); |
|
646 |
error= 1; |
|
647 |
goto end; |
|
648 |
}
|
|
649 |
}
|
|
650 |
error=ha_commit_one_phase(session, all) ? (cookie ? 2 : 1) : 0; |
|
651 |
end: |
|
652 |
if (is_real_trans) |
|
653 |
start_waiting_global_read_lock(session); |
|
654 |
}
|
|
655 |
return error; |
|
656 |
}
|
|
657 |
||
658 |
/**
|
|
659 |
@note
|
|
660 |
This function does not care about global read lock. A caller should.
|
|
661 |
*/
|
|
662 |
int ha_commit_one_phase(Session *session, bool all) |
|
663 |
{
|
|
664 |
int error=0; |
|
665 |
Session_TRANS *trans=all ? &session->transaction.all : &session->transaction.stmt; |
|
666 |
bool is_real_trans=all || session->transaction.all.ha_list == 0; |
|
667 |
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next; |
|
668 |
if (ha_info) |
|
669 |
{
|
|
670 |
for (; ha_info; ha_info= ha_info_next) |
|
671 |
{
|
|
672 |
int err; |
|
673 |
plugin::StorageEngine *engine= ha_info->engine(); |
|
674 |
if ((err= engine->commit(session, all))) |
|
675 |
{
|
|
676 |
my_error(ER_ERROR_DURING_COMMIT, MYF(0), err); |
|
677 |
error=1; |
|
678 |
}
|
|
679 |
status_var_increment(session->status_var.ha_commit_count); |
|
680 |
ha_info_next= ha_info->next(); |
|
681 |
ha_info->reset(); /* keep it conveniently zero-filled */ |
|
682 |
}
|
|
683 |
trans->ha_list= 0; |
|
684 |
trans->no_2pc=0; |
|
685 |
if (is_real_trans) |
|
686 |
session->transaction.xid_state.xid.null(); |
|
687 |
if (all) |
|
688 |
{
|
|
689 |
session->variables.tx_isolation=session->session_tx_isolation; |
|
690 |
session->transaction.cleanup(); |
|
691 |
}
|
|
692 |
}
|
|
693 |
if (error == 0) |
|
694 |
{
|
|
695 |
if (is_real_trans) |
|
696 |
{
|
|
697 |
/*
|
|
698 |
* We commit the normal transaction by finalizing the transaction message
|
|
699 |
* and propogating the message to all registered replicators.
|
|
700 |
*/
|
|
701 |
ReplicationServices &replication_services= ReplicationServices::singleton(); |
|
702 |
replication_services.commitNormalTransaction(session); |
|
703 |
}
|
|
704 |
}
|
|
705 |
return error; |
|
706 |
}
|
|
707 |
||
708 |
||
709 |
/**
  Roll back the statement transaction or the normal transaction in every
  storage engine registered in it.

  @param session  Session whose transaction is rolled back.
  @param all      true to roll back the normal transaction, false to roll
                  back only the statement transaction.

  @return 0 on success, 1 if at least one engine reported an error.
*/
int ha_rollback_trans(Session *session, bool all)
{
  Session_TRANS *trans= &session->transaction.stmt;
  if (all)
    trans= &session->transaction.all;

  /* Sampled before the engine list is cleared below. */
  const bool is_real_trans= all || session->transaction.all.ha_list == NULL;

  /*
    We must not rollback the normal transaction if a statement
    transaction is pending.
  */
  assert(session->transaction.stmt.ha_list == NULL ||
         trans == &session->transaction.stmt);

  int error= 0;
  Ha_trx_info *current= trans->ha_list;

  if (current != NULL)
  {
    while (current != NULL)
    {
      plugin::StorageEngine *engine= current->engine();
      const int err= engine->rollback(session, all);
      if (err != 0)
      { // cannot happen
        my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
        error= 1;
      }
      status_var_increment(session->status_var.ha_rollback_count);

      /* Read the successor first: reset() leaves the entry zero-filled. */
      Ha_trx_info *following= current->next();
      current->reset();
      current= following;
    }

    trans->ha_list= NULL;
    trans->no_2pc= 0;

    if (is_real_trans)
      session->transaction.xid_state.xid.null();

    if (all)
    {
      /* Restore the session-level isolation level and clean up. */
      session->variables.tx_isolation= session->session_tx_isolation;
      session->transaction.cleanup();
    }
  }

  if (all)
    session->transaction_rollback_request= false;

  /*
    If a non-transactional table was updated by a real transaction, push
    ER_WARNING_NOT_COMPLETE_ROLLBACK as a warning. The warning is skipped
    when the session is being killed with KILL_CONNECTION.

    NOTE(review): the previous comment here explained that the warning is
    suppressed for slave threads (a ROLLBACK read from the binary log is
    normal there, and the message would only confuse users reading the error
    log). The condition below contains no slave-thread test; confirm whether
    that explanation still applies.
  */
  if (is_real_trans &&
      session->transaction.all.modified_non_trans_table &&
      session->killed != Session::KILL_CONNECTION)
  {
    push_warning(session, DRIZZLE_ERROR::WARN_LEVEL_WARN,
                 ER_WARNING_NOT_COMPLETE_ROLLBACK,
                 ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
  }

  return error;
}
|
|
766 |
||
767 |
/**
|
|
768 |
This is used to commit or rollback a single statement depending on
|
|
769 |
the value of error.
|
|
770 |
||
771 |
@note
|
|
772 |
Note that if the autocommit is on, then the following call inside
|
|
773 |
InnoDB will commit or rollback the whole transaction (= the statement). The
|
|
774 |
autocommit mechanism built into InnoDB is based on counting locks, but if
|
|
775 |
the user has used LOCK TABLES then that mechanism does not know to do the
|
|
776 |
commit.
|
|
777 |
*/
|
|
778 |
int ha_autocommit_or_rollback(Session *session, int error) |
|
779 |
{
|
|
780 |
if (session->transaction.stmt.ha_list) |
|
781 |
{
|
|
782 |
if (!error) |
|
783 |
{
|
|
784 |
if (ha_commit_trans(session, 0)) |
|
785 |
error= 1; |
|
786 |
}
|
|
787 |
else
|
|
788 |
{
|
|
789 |
(void) ha_rollback_trans(session, 0); |
|
790 |
if (session->transaction_rollback_request) |
|
791 |
(void) ha_rollback(session); |
|
792 |
}
|
|
793 |
||
794 |
session->variables.tx_isolation=session->session_tx_isolation; |
|
795 |
}
|
|
796 |
||
797 |
return error; |
|
798 |
}
|
|
799 |
||
800 |
/**
|
|
801 |
return the list of XID's to a client, the same way SHOW commands do.
|
|
802 |
||
803 |
@note
|
|
804 |
I didn't find in XA specs that an RM cannot return the same XID twice,
|
|
805 |
so mysql_xa_recover does not filter XID's to ensure uniqueness.
|
|
806 |
It can be easily fixed later, if necessary.
|
|
807 |
*/
|
|
808 |
bool mysql_xa_recover(Session *session) |
|
809 |
{
|
|
810 |
List<Item> field_list; |
|
811 |
int i= 0; |
|
812 |
XID_STATE *xs; |
|
813 |
||
814 |
field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS)); |
|
815 |
field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS)); |
|
816 |
field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS)); |
|
817 |
field_list.push_back(new Item_empty_string("data",XIDDATASIZE)); |
|
818 |
||
819 |
if (session->client->sendFields(&field_list)) |
|
820 |
return 1; |
|
821 |
||
822 |
pthread_mutex_lock(&LOCK_xid_cache); |
|
823 |
while ((xs= (XID_STATE*)hash_element(&xid_cache, i++))) |
|
824 |
{
|
|
825 |
if (xs->xa_state==XA_PREPARED) |
|
826 |
{
|
|
827 |
session->client->store((int64_t)xs->xid.formatID); |
|
828 |
session->client->store((int64_t)xs->xid.gtrid_length); |
|
829 |
session->client->store((int64_t)xs->xid.bqual_length); |
|
830 |
session->client->store(xs->xid.data, |
|
831 |
xs->xid.gtrid_length+xs->xid.bqual_length); |
|
832 |
if (session->client->flush()) |
|
833 |
{
|
|
834 |
pthread_mutex_unlock(&LOCK_xid_cache); |
|
835 |
return 1; |
|
836 |
}
|
|
837 |
}
|
|
838 |
}
|
|
839 |
||
840 |
pthread_mutex_unlock(&LOCK_xid_cache); |
|
841 |
session->my_eof(); |
|
842 |
return 0; |
|
843 |
}
|
|
844 |
||
845 |
||
846 |
/**
  Roll the normal transaction back to a savepoint.

  @param session  Session owning the transaction.
  @param sv       Savepoint to roll back to; sv->ha_list is the engine list
                  recorded for the savepoint.

  @return 0 on success, 1 if at least one engine reported an error.
*/
int ha_rollback_to_savepoint(Session *session, SAVEPOINT *sv)
{
  Session_TRANS *trans= &session->transaction.all;
  int error= 0;

  /* Recomputed below from the engines that remain in the transaction. */
  trans->no_2pc= 0;

  /*
    rolling back to savepoint in all storage engines that were part of the
    transaction when the savepoint was set
  */
  Ha_trx_info *ha_info= sv->ha_list;
  while (ha_info != NULL)
  {
    plugin::StorageEngine *engine= ha_info->engine();
    assert(engine);

    /* sv+1: the memory directly following the SAVEPOINT object. */
    const int err= engine->savepoint_rollback(session, (void *)(sv+1));
    if (err != 0)
    { // cannot happen
      my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
      error= 1;
    }
    status_var_increment(session->status_var.ha_savepoint_rollback_count);
    trans->no_2pc|= not engine->has_2pc();

    ha_info= ha_info->next();
  }

  /*
    rolling back the transaction in all storage engines that were not part of
    the transaction when the savepoint was set
  */
  ha_info= trans->ha_list;
  while (ha_info != sv->ha_list)
  {
    plugin::StorageEngine *engine= ha_info->engine();
    const int err= engine->rollback(session, true);
    if (err != 0)
    { // cannot happen
      my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
      error= 1;
    }
    status_var_increment(session->status_var.ha_rollback_count);

    /* Read the successor first: reset() leaves the entry zero-filled. */
    Ha_trx_info *following= ha_info->next();
    ha_info->reset();
    ha_info= following;
  }

  /* Only the engines known at savepoint time stay registered. */
  trans->ha_list= sv->ha_list;
  return error;
}
|
|
892 |
||
893 |
/**
|
|
894 |
@note
|
|
895 |
according to the sql standard (ISO/IEC 9075-2:2003)
|
|
896 |
section "4.33.4 SQL-statements and transaction states",
|
|
897 |
SAVEPOINT is *not* transaction-initiating SQL-statement
|
|
898 |
*/
|
|
899 |
int ha_savepoint(Session *session, SAVEPOINT *sv) |
|
900 |
{
|
|
901 |
int error= 0; |
|
902 |
Session_TRANS *trans= &session->transaction.all; |
|
903 |
Ha_trx_info *ha_info= trans->ha_list; |
|
904 |
for (; ha_info; ha_info= ha_info->next()) |
|
905 |
{
|
|
906 |
int err; |
|
907 |
plugin::StorageEngine *engine= ha_info->engine(); |
|
908 |
assert(engine); |
|
909 |
#ifdef NOT_IMPLEMENTED /*- TODO (examine this againt the original code base) */ |
|
910 |
if (! engine->savepoint_set) |
|
911 |
{
|
|
912 |
my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT"); |
|
913 |
error= 1; |
|
914 |
break; |
|
915 |
}
|
|
916 |
#endif
|
|
917 |
if ((err= engine->savepoint_set(session, (void *)(sv+1)))) |
|
918 |
{ // cannot happen |
|
919 |
my_error(ER_GET_ERRNO, MYF(0), err); |
|
920 |
error= 1; |
|
921 |
}
|
|
922 |
status_var_increment(session->status_var.ha_savepoint_count); |
|
923 |
}
|
|
924 |
/*
|
|
925 |
Remember the list of registered storage engines. All new
|
|
926 |
engines are prepended to the beginning of the list.
|
|
927 |
*/
|
|
928 |
sv->ha_list= trans->ha_list; |
|
929 |
return error; |
|
930 |
}
|
|
931 |
||
932 |
/**
  Release a savepoint in every storage engine recorded in it.

  @param session  Session owning the transaction.
  @param sv       Savepoint to release; sv->ha_list is the engine list to
                  walk.

  @return 0 on success, 1 if at least one engine reported an error.
*/
int ha_release_savepoint(Session *session, SAVEPOINT *sv)
{
  int error= 0;

  Ha_trx_info *ha_info= sv->ha_list;
  while (ha_info != NULL)
  {
    plugin::StorageEngine *engine= ha_info->engine();
    /* Savepoint life time is enclosed into transaction life time. */
    assert(engine);

    /* sv+1: the memory directly following the SAVEPOINT object. */
    const int err= engine->savepoint_release(session, (void *)(sv+1));
    if (err != 0)
    { // cannot happen
      my_error(ER_GET_ERRNO, MYF(0), err);
      error= 1;
    }

    ha_info= ha_info->next();
  }

  return error;
}
|