1
/******************************************************
6
Created 1/8/1997 Heikki Tuuri
7
*******************************************************/
12
#include "row0undo.ic"
16
#include "mach0data.h"
21
#include "trx0purge.h"
27
#include "row0mysql.h"
30
/* How to undo row operations?
31
(1) For an insert, we have stored a prefix of the clustered index record
32
in the undo log. Using it, we look for the clustered record, and using
33
that we look for the records in the secondary indexes. The insert operation
34
may have been left incomplete, if the database crashed, for example.
35
We may have look at the trx id and roll ptr to make sure the record in the
36
clustered index is really the one for which the undo log record was
37
written. We can use the framework we get from the original insert op.
38
(2) Delete marking: We can use the framework we get from the original
39
delete mark op. We only have to check the trx id.
40
(3) Update: This may be the most complicated. We have to use the framework
41
we get from the original update op.
43
What if the same trx repeatedly deletes and inserts an identical row.
44
Then the row id changes and also roll ptr. What if the row id was not
45
part of the ordering fields in the clustered index? Maybe we have to write
46
it to undo log. Well, maybe not, because if we order the row id and trx id
47
in descending order, then the only undeleted copy is the first in the
48
index. Our searches in row operations always position the cursor before
49
the first record in the result set. But, if there is no key defined for
50
a table, then it would be desirable that row id is in ascending order.
51
So, lets store row id in descending order only if it is not an ordering
52
field in the clustered index.
54
NOTE: Deletes and inserts may lead to situation where there are identical
55
records in a secondary index. Is that a problem in the B-tree? Yes.
56
Also updates can lead to this, unless trx id and roll ptr are included in
58
(1) Fix in clustered indexes: include row id, trx id, and roll ptr
59
in node pointers of B-tree.
60
(2) Fix in secondary indexes: include all fields in node pointers, and
61
if an entry is inserted, check if it is equal to the right neighbor,
62
in which case update the right neighbor: the neighbor must be delete
63
marked, set it unmarked and write the trx id of the current transaction.
65
What if the same trx repeatedly updates the same row, updating a secondary
66
index field or not? Updating a clustered index ordering field?
68
(1) If it does not update the secondary index and not the clustered index
69
ord field. Then the secondary index record stays unchanged, but the
70
trx id in the secondary index record may be smaller than in the clustered
71
index record. This is no problem?
72
(2) If it updates secondary index ord field but not clustered: then in
73
secondary index there are delete marked records, which differ in an
74
ord field. No problem.
75
(3) Updates clustered ord field but not secondary, and secondary index
76
is unique. Then the record in secondary index is just updated at the
80
Problem with duplicate records:
81
Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
82
bigger trx id has inserted and delete marked a similar row, our trx inserts
83
again a similar row, and a trx with an even bigger id delete marks it. Then
84
the position of the row should change in the index if the trx id affects
85
the alphabetical ordering.
87
Fix 2: If an insert encounters a similar row marked deleted, we turn the
88
insert into an 'update' of the row marked deleted. Then we must write undo
89
info on the update. A problem: what if a purge operation tries to remove
90
the delete marked row?
92
We can think of the database row versions as a linked list which starts
93
from the record in the clustered index, and is linked by roll ptrs
94
through undo logs. The secondary index records are references which tell
95
what kinds of records can be found in this linked list for a record
96
in the clustered index.
98
How to do the purge? A record can be removed from the clustered index
99
if its linked list becomes empty, i.e., the row has been marked deleted
100
and its roll ptr points to the record in the undo log we are going through,
101
doing the purge. Similarly, during a rollback, a record can be removed
102
if the stored roll ptr in the undo log points to a trx already (being) purged,
103
or if the roll ptr is NULL, i.e., it was a fresh insert. */
105
/************************************************************************
106
Creates a row undo node to a query graph. */
109
row_undo_node_create(
110
/*=================*/
111
/* out, own: undo node */
112
trx_t* trx, /* in: transaction */
113
que_thr_t* parent, /* in: parent node, i.e., a thr node */
114
mem_heap_t* heap) /* in: memory heap where created */
118
ut_ad(trx && parent && heap);
120
undo = mem_heap_alloc(heap, sizeof(undo_node_t));
122
undo->common.type = QUE_NODE_UNDO;
123
undo->common.parent = parent;
125
undo->state = UNDO_NODE_FETCH_NEXT;
128
btr_pcur_init(&(undo->pcur));
130
undo->heap = mem_heap_create(256);
135
/***************************************************************
136
Looks for the clustered index record when node has the row reference.
137
The pcur in node is used in the search. If found, stores the row to node,
138
and stores the position of pcur, and detaches it. The pcur must be closed
139
by the caller in any case. */
142
row_undo_search_clust_to_pcur(
143
/*==========================*/
144
/* out: TRUE if found; NOTE the node->pcur
145
must be closed by the caller, regardless of
147
undo_node_t* node) /* in: row undo node */
149
dict_index_t* clust_index;
154
mem_heap_t* heap = NULL;
155
ulint offsets_[REC_OFFS_NORMAL_SIZE];
156
ulint* offsets = offsets_;
157
*offsets_ = (sizeof offsets_) / sizeof *offsets_;
161
clust_index = dict_table_get_first_index(node->table);
163
found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
164
node->table, node->ref, &mtr);
166
rec = btr_pcur_get_rec(&(node->pcur));
168
offsets = rec_get_offsets(rec, clust_index, offsets,
169
ULINT_UNDEFINED, &heap);
171
if (!found || 0 != ut_dulint_cmp(node->roll_ptr,
172
row_get_rec_roll_ptr(rec, clust_index,
175
/* We must remove the reservation on the undo log record
176
BEFORE releasing the latch on the clustered index page: this
177
is to make sure that some thread will eventually undo the
178
modification corresponding to node->roll_ptr. */
180
/* fputs("--------------------undoing a previous version\n",
185
node->row = row_build(ROW_COPY_DATA, clust_index, rec,
186
offsets, node->heap);
187
btr_pcur_store_position(&(node->pcur), &mtr);
192
btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
194
if (UNIV_LIKELY_NULL(heap)) {
200
/***************************************************************
201
Fetches an undo log record and does the undo for the recorded operation.
202
If none left, or a partial rollback completed, returns control to the
203
parent node, which is always a query thread node. */
208
/* out: DB_SUCCESS if operation successfully
209
completed, else error code */
210
undo_node_t* node, /* in: row undo node */
211
que_thr_t* thr) /* in: query thread */
216
ibool locked_data_dict;
222
if (node->state == UNDO_NODE_FETCH_NEXT) {
224
node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
228
if (!node->undo_rec) {
229
/* Rollback completed for this query thread */
231
thr->run_node = que_node_get_parent(node);
236
node->roll_ptr = roll_ptr;
237
node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
239
if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
241
node->state = UNDO_NODE_INSERT;
243
node->state = UNDO_NODE_MODIFY;
246
} else if (node->state == UNDO_NODE_PREV_VERS) {
248
/* Undo should be done to the same clustered index record
249
again in this same rollback, restoring the previous version */
251
roll_ptr = node->new_roll_ptr;
253
node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr,
255
node->roll_ptr = roll_ptr;
256
node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
258
if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
260
node->state = UNDO_NODE_INSERT;
262
node->state = UNDO_NODE_MODIFY;
266
/* Prevent DROP TABLE etc. while we are rolling back this row.
267
If we are doing a TABLE CREATE or some other dictionary operation,
268
then we already have dict_operation_lock locked in x-mode. Do not
269
try to lock again, because that would cause a hang. */
271
locked_data_dict = (trx->dict_operation_lock_mode == 0);
273
if (locked_data_dict) {
275
row_mysql_lock_data_dictionary(trx);
278
if (node->state == UNDO_NODE_INSERT) {
280
err = row_undo_ins(node);
282
node->state = UNDO_NODE_FETCH_NEXT;
284
ut_ad(node->state == UNDO_NODE_MODIFY);
285
err = row_undo_mod(node, thr);
288
if (locked_data_dict) {
290
row_mysql_unlock_data_dictionary(trx);
293
/* Do some cleanup */
294
btr_pcur_close(&(node->pcur));
296
mem_heap_empty(node->heap);
298
thr->run_node = node;
303
/***************************************************************
304
Undoes a row operation in a table. This is a high-level function used
305
in SQL execution graphs. */
310
/* out: query thread to run next or NULL */
311
que_thr_t* thr) /* in: query thread */
319
srv_activity_count++;
321
trx = thr_get_trx(thr);
323
node = thr->run_node;
325
ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
327
err = row_undo(node, thr);
329
trx->error_state = err;
331
if (err != DB_SUCCESS) {
332
/* SQL error detected */
334
fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n",
337
if (err == DB_OUT_OF_FILE_SPACE) {
339
"InnoDB: Error 13 means out of tablespace.\n"
340
"InnoDB: Consider increasing"
341
" your tablespace.\n");