~drizzle-trunk/drizzle/development

« back to all changes in this revision

Viewing changes to plugin/pbxt/src/index_xt.cc

  • Committer: Brian Aker
  • Date: 2010-04-12 07:43:55 UTC
  • mfrom: (1455.3.13 drizzle-pbxt-6)
  • Revision ID: brian@gaz-20100412074355-udi9dwjlcnmz0oz6
Merge PBXT

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/* Copyright (c) 2005 PrimeBase Technologies GmbH
 
2
 *
 
3
 * PrimeBase XT
 
4
 *
 
5
 * This program is free software; you can redistribute it and/or modify
 
6
 * it under the terms of the GNU General Public License as published by
 
7
 * the Free Software Foundation; either version 2 of the License, or
 
8
 * (at your option) any later version.
 
9
 *
 
10
 * This program is distributed in the hope that it will be useful,
 
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
13
 * GNU General Public License for more details.
 
14
 *
 
15
 * You should have received a copy of the GNU General Public License
 
16
 * along with this program; if not, write to the Free Software
 
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
18
 *
 
19
 * 2005-09-30   Paul McCullagh
 
20
 *
 
21
 * H&G2JCtL
 
22
 */
 
23
 
 
24
#include "xt_config.h"
 
25
 
 
26
#ifdef DRIZZLED
 
27
#include <bitset>
 
28
#endif
 
29
 
 
30
#include <string.h>
 
31
#include <stdio.h>
 
32
#include <stddef.h>
 
33
#ifndef XT_WIN
 
34
#include <strings.h>
 
35
#endif
 
36
#include <zlib.h>
 
37
/* This header not available on suse-11-amd64, ubuntu-9.10-amd64 */
 
38
//#include <bzlib.h>
 
39
 
 
40
#ifdef DRIZZLED
 
41
#include <drizzled/base.h>
 
42
using namespace drizzled;
 
43
#else
 
44
#include "mysql_priv.h"
 
45
#endif
 
46
 
 
47
#include "pthread_xt.h"
 
48
#include "memory_xt.h"
 
49
#include "index_xt.h"
 
50
#include "heap_xt.h"
 
51
#include "database_xt.h"
 
52
#include "strutil_xt.h"
 
53
#include "cache_xt.h"
 
54
#include "myxt_xt.h"
 
55
#include "trace_xt.h"
 
56
#include "table_xt.h"
 
57
 
 
58
#ifdef DEBUG
 
59
#define MAX_SEARCH_DEPTH                        32
 
60
//#define CHECK_AND_PRINT
 
61
//#define CHECK_NODE_REFERENCE
 
62
//#define TRACE_FLUSH_INDEX
 
63
//#define CHECK_PRINTS_RECORD_REFERENCES
 
64
//#define DO_COMP_TEST
 
65
#define DUMP_INDEX
 
66
#else
 
67
#define MAX_SEARCH_DEPTH                        100
 
68
#endif
 
69
 
 
70
//#define TRACE_FLUSH_TIMES
 
71
 
 
72
/* One level of the index tree-descent stack: the branch that was
 * visited and the item position within that branch. */
typedef struct IdxStackItem {
	XTIdxItemRec			i_pos;			/* Position of the item within the branch. */
	xtIndexNodeID			i_branch;		/* The branch (index node) itself. */
} IdxStackItemRec, *IdxStackItemPtr;
 
76
 
 
77
/* Fixed-depth stack recording the path from the index root down to the
 * current branch. Depth is bounded by MAX_SEARCH_DEPTH (32 in DEBUG
 * builds, 100 otherwise); idx_push() fails with a stack-overflow error
 * beyond that. */
typedef struct IdxBranchStack {
	int						s_top;			/* Number of items currently on the stack. */
	IdxStackItemRec			s_elements[MAX_SEARCH_DEPTH];
} IdxBranchStackRec, *IdxBranchStackPtr;
 
81
 
 
82
#ifdef DEBUG
 
83
#ifdef TEST_CODE
 
84
static void idx_check_on_key(XTOpenTablePtr ot);
 
85
#endif
 
86
static u_int idx_check_index(XTOpenTablePtr ot, XTIndexPtr ind, xtBool with_lock);
 
87
#endif
 
88
 
 
89
static xtBool idx_insert_node(XTOpenTablePtr ot, XTIndexPtr ind, IdxBranchStackPtr stack, xtBool last_item, XTIdxKeyValuePtr key_value, xtIndexNodeID branch);
 
90
static xtBool idx_remove_lazy_deleted_item_in_node(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID current, XTIndReferencePtr iref, XTIdxKeyValuePtr key_value);
 
91
 
 
92
#ifdef XT_TRACK_INDEX_UPDATES
 
93
 
 
94
/* Wrapper around xt_ind_write() used when XT_TRACK_INDEX_UPDATES is
 * defined: bumps a per-open-table counter before delegating to the
 * real write.
 *
 * NOTE(review): this increments ot_ind_reads although the operation
 * being tracked is a *write* — looks like a copy/paste slip; confirm
 * whether a write counter (e.g. ot_ind_writes) was intended.
 */
static xtBool ind_track_write(struct XTOpenTable *ot, struct XTIndex *ind, xtIndexNodeID offset, size_t size, xtWord1 *data)
{
	ot->ot_ind_reads++;
	return xt_ind_write(ot, ind, offset, size, data);
}
 
99
 
 
100
#define XT_IND_WRITE                                    ind_track_write
 
101
 
 
102
#else
 
103
 
 
104
#define XT_IND_WRITE                                    xt_ind_write
 
105
 
 
106
#endif
 
107
 
 
108
 
 
109
#ifdef CHECK_NODE_REFERENCE
 
110
#define IDX_GET_NODE_REF(t, x, o)               idx_get_node_ref(t, x, o)
 
111
#else
 
112
#define IDX_GET_NODE_REF(t, x, o)               XT_GET_NODE_REF(t, (x) - (o))
 
113
#endif
 
114
 
 
115
/*
 
116
 * -----------------------------------------------------------------------
 
117
 * DEBUG ACTIVITY
 
118
 */
 
119
 
 
120
//#define TRACK_ACTIVITY
 
121
 
 
122
#ifdef TRACK_ACTIVITY
 
123
#define TRACK_MAX_BLOCKS                        2000
 
124
 
 
125
typedef struct TrackBlock {
 
126
        xtWord1                         exists;
 
127
        char                            *activity;
 
128
} TrackBlockRec, *TrackBlockPtr;
 
129
 
 
130
TrackBlockRec           blocks[TRACK_MAX_BLOCKS];
 
131
 
 
132
xtPublic void track_work(u_int block, char *what)
 
133
{
 
134
        int len = 0, len2;
 
135
 
 
136
        ASSERT_NS(block > 0 && block <= TRACK_MAX_BLOCKS);
 
137
        block--;
 
138
        if (blocks[block].activity)
 
139
                len = strlen(blocks[block].activity);
 
140
        len2 = strlen(what);
 
141
        xt_realloc_ns((void **) &blocks[block].activity, len + len2 + 1);
 
142
        memcpy(blocks[block].activity + len, what, len2 + 1);
 
143
}
 
144
 
 
145
static void track_block_exists(xtIndexNodeID block)
 
146
{
 
147
        if (XT_NODE_ID(block) > 0 && XT_NODE_ID(block) <= TRACK_MAX_BLOCKS)
 
148
                blocks[XT_NODE_ID(block)-1].exists = TRUE;
 
149
}
 
150
 
 
151
static void track_reset_missing()
 
152
{
 
153
        for (u_int i=0; i<TRACK_MAX_BLOCKS; i++)
 
154
                blocks[i].exists = FALSE;
 
155
}
 
156
 
 
157
static void track_dump_missing(xtIndexNodeID eof_block)
 
158
{
 
159
        for (u_int i=0; i<XT_NODE_ID(eof_block)-1; i++) {
 
160
                if (!blocks[i].exists)
 
161
                        printf("block missing = %04d %s\n", i+1, blocks[i].activity);
 
162
        }
 
163
}
 
164
 
 
165
static void track_dump_all(u_int max_block)
 
166
{
 
167
        for (u_int i=0; i<max_block; i++) {
 
168
                if (blocks[i].exists)
 
169
                        printf(" %04d %s\n", i+1, blocks[i].activity);
 
170
                else
 
171
                        printf("-%04d %s\n", i+1, blocks[i].activity);
 
172
        }
 
173
}
 
174
 
 
175
#endif
 
176
 
 
177
xtPublic void xt_ind_track_dump_block(XTTableHPtr XT_UNUSED(tab), xtIndexNodeID XT_UNUSED(address))
 
178
{
 
179
#ifdef TRACK_ACTIVITY
 
180
        u_int i = XT_NODE_ID(address)-1;
 
181
 
 
182
        printf("BLOCK %04d %s\n", i+1, blocks[i].activity);
 
183
#endif
 
184
}
 
185
 
 
186
#ifdef CHECK_NODE_REFERENCE
 
187
/* Debug helper (CHECK_NODE_REFERENCE builds): decode the node reference
 * stored immediately before 'ref' and validate it against the table's
 * index EOF.
 *
 * ref           - points just past the node reference to read
 * node_ref_size - size of the reference; 0 for a leaf (none present)
 *
 * Returns the decoded node ID, or the poison value 0xFFFFEEEE for a
 * leaf. An out-of-range reference registers XT_ERR_INDEX_CORRUPTED,
 * but the (corrupt) ID is still returned to the caller.
 */
static xtIndexNodeID idx_get_node_ref(XTTableHPtr tab, xtWord1 *ref, u_int node_ref_size)
{
	xtIndexNodeID node;

	/* Node is invalid by default: */
	XT_NODE_ID(node) = 0xFFFFEEEE;
	if (node_ref_size) {
		ref -= node_ref_size;
		node = XT_RET_NODE_ID(XT_GET_DISK_4(ref));
		if (node >= tab->tab_ind_eof) {
			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, tab->tab_name);
		}
	}
	return node;
}
 
202
#endif
 
203
 
 
204
/*
 
205
 * -----------------------------------------------------------------------
 
206
 * Stack functions
 
207
 */
 
208
 
 
209
static void idx_newstack(IdxBranchStackPtr stack)
 
210
{
 
211
        stack->s_top = 0;
 
212
}
 
213
 
 
214
static xtBool idx_push(IdxBranchStackPtr stack, xtIndexNodeID n, XTIdxItemPtr pos)
 
215
{
 
216
        if (stack->s_top == MAX_SEARCH_DEPTH) {
 
217
                xt_register_error(XT_REG_CONTEXT, XT_ERR_STACK_OVERFLOW, 0, "Index node stack overflow");
 
218
                return FAILED;
 
219
        }
 
220
        stack->s_elements[stack->s_top].i_branch = n;
 
221
        if (pos)
 
222
                stack->s_elements[stack->s_top].i_pos = *pos;
 
223
        stack->s_top++;
 
224
        return OK;
 
225
}
 
226
 
 
227
static IdxStackItemPtr idx_pop(IdxBranchStackPtr stack)
 
228
{
 
229
        if (stack->s_top == 0)
 
230
                return NULL;
 
231
        stack->s_top--;
 
232
        return &stack->s_elements[stack->s_top];
 
233
}
 
234
 
 
235
static IdxStackItemPtr idx_top(IdxBranchStackPtr stack)
 
236
{
 
237
        if (stack->s_top == 0)
 
238
                return NULL;
 
239
        return &stack->s_elements[stack->s_top-1];
 
240
}
 
241
 
 
242
/*
 
243
 * -----------------------------------------------------------------------
 
244
 * Allocation of nodes
 
245
 */
 
246
 
 
247
/*
 
248
 * Allocating and freeing blocks for an index is safe because this is a structural
 
249
 * change which requires an exclusive lock on the index!
 
250
 */
 
251
/* Allocate a new index block for the given index.
 *
 * Sources are tried in order:
 *  1. The index's private free list (no table lock taken - the caller
 *     holds the exclusive index lock, see commented-out assertion).
 *  2. The table's cached free list, under tab_ind_lock; exhausted list
 *     segments are discarded as we go.
 *  3. The on-disk free-block chain headed by tab_ind_free: the freed
 *     block itself stores the next-block pointer.
 *  4. Extending the file at tab_ind_eof, bounded to 0xFFFFFFF so the
 *     node ID cannot overflow its on-disk representation.
 *
 * On success *address receives the new block ID and OK is returned;
 * otherwise FAILED is returned with an error registered. tab_ind_lock
 * is always released before returning.
 */
static xtBool idx_new_branch(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID *address)
{
	register XTTableHPtr	tab;
	xtIndexNodeID			wrote_pos;
	XTIndFreeBlockRec		free_block;
	XTIndFreeListPtr		list_ptr;

	tab = ot->ot_table;

	//ASSERT_NS(XT_INDEX_HAVE_XLOCK(ind, ot));
	/* Fast path: take the last page off the index's private free list. */
	if (ind->mi_free_list && ind->mi_free_list->fl_free_count) {
		ind->mi_free_list->fl_free_count--;
		*address = ind->mi_free_list->fl_page_id[ind->mi_free_list->fl_free_count];
		TRACK_BLOCK_ALLOC(*address);
		return OK;
	}

	xt_lock_mutex_ns(&tab->tab_ind_lock);

	/* Check the cached free list: */
	while ((list_ptr = tab->tab_ind_free_list)) {
		if (list_ptr->fl_start < list_ptr->fl_free_count) {
			wrote_pos = list_ptr->fl_page_id[list_ptr->fl_start];
			list_ptr->fl_start++;
			xt_unlock_mutex_ns(&tab->tab_ind_lock);
			*address = wrote_pos;
			TRACK_BLOCK_ALLOC(wrote_pos);
			return OK;
		}
		/* This segment is used up; free it and move to the next. */
		tab->tab_ind_free_list = list_ptr->fl_next_list;
		xt_free_ns(list_ptr);
	}

	if ((XT_NODE_ID(wrote_pos) = XT_NODE_ID(tab->tab_ind_free))) {
		/* Use the block on the free list: */
		if (!xt_ind_read_bytes(ot, ind, wrote_pos, sizeof(XTIndFreeBlockRec), (xtWord1 *) &free_block))
			goto failed;
		/* The freed block stores the next free block's ID. */
		XT_NODE_ID(tab->tab_ind_free) = (xtIndexNodeID) XT_GET_DISK_8(free_block.if_next_block_8);
		xt_unlock_mutex_ns(&tab->tab_ind_lock);
		*address = wrote_pos;
		TRACK_BLOCK_ALLOC(wrote_pos);
		return OK;
	}

	/* PMC - Dont allow overflow! */
	if (XT_NODE_ID(tab->tab_ind_eof) >= 0xFFFFFFF) {
		xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_INDEX_FILE_TO_LARGE, xt_file_path(ot->ot_ind_file));
		goto failed;
	}
	/* Last resort: extend the index file. */
	*address = tab->tab_ind_eof;
	XT_NODE_ID(tab->tab_ind_eof)++;
	xt_unlock_mutex_ns(&tab->tab_ind_lock);
	TRACK_BLOCK_ALLOC(*address);
	return OK;

	failed:
	xt_unlock_mutex_ns(&tab->tab_ind_lock);
	return FAILED;
}
 
310
 
 
311
/* Add the block to the private free list of the index.
 
312
 * On flush, this list will be transfered to the global list.
 
313
 */
 
314
/* Insert node_id into the index's private free list (kept sorted by
 * node ID) and mark the corresponding cache page clean. The caller
 * holds the exclusive index lock (see commented-out assertion).
 *
 * Returns OK, or FAILED on allocation failure.
 */
static xtBool idx_free_branch(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID node_id)
{
	register u_int		count;
	register u_int		i;
	register u_int		guess;

	TRACK_BLOCK_FREE(node_id);
	//ASSERT_NS(XT_INDEX_HAVE_XLOCK(ind, ot));
	if (!ind->mi_free_list) {
		count = 0;
		/* First free for this index: allocate a list with room for
		 * 10 page IDs (the header is zeroed by calloc). */
		if (!(ind->mi_free_list = (XTIndFreeListPtr) xt_calloc_ns(offsetof(XTIndFreeListRec, fl_page_id) + 10 * sizeof(xtIndexNodeID))))
			return FAILED;
	}
	else {
		count = ind->mi_free_list->fl_free_count;
		/* Grow the list by one entry. */
		if (!xt_realloc_ns((void **) &ind->mi_free_list, offsetof(XTIndFreeListRec, fl_page_id) + (count + 1) * sizeof(xtIndexNodeID)))
			return FAILED;
	}

	/* Binary search for the insert position; the list stays sorted. */
	i = 0;
	while (i < count) {
		guess = (i + count - 1) >> 1;
		if (XT_NODE_ID(node_id) == XT_NODE_ID(ind->mi_free_list->fl_page_id[guess])) {
			// Should not happen... (a block freed twice)
			ASSERT_NS(FALSE);
			return OK;
		}
		if (XT_NODE_ID(node_id) < XT_NODE_ID(ind->mi_free_list->fl_page_id[guess]))
			count = guess;
		else
			i = guess + 1;
	}

	/* Insert at position i */
	memmove(ind->mi_free_list->fl_page_id + i + 1, ind->mi_free_list->fl_page_id + i, (ind->mi_free_list->fl_free_count - i) * sizeof(xtIndexNodeID));
	ind->mi_free_list->fl_page_id[i] = node_id;
	ind->mi_free_list->fl_free_count++;

	/* Set the cache page to clean: */
	return xt_ind_free_block(ot, ind, node_id);
}
 
355
 
 
356
/*
 
357
 * -----------------------------------------------------------------------
 
358
 * Simple compare functions
 
359
 */
 
360
 
 
361
xtPublic int xt_compare_2_int4(XTIndexPtr XT_UNUSED(ind), uint key_length, xtWord1 *key_value, xtWord1 *b_value)
 
362
{
 
363
        int r;
 
364
 
 
365
        ASSERT_NS(key_length == 4 || key_length == 8);
 
366
        r = (xtInt4) XT_GET_DISK_4(key_value) - (xtInt4) XT_GET_DISK_4(b_value);
 
367
        if (r == 0 && key_length > 4) {
 
368
                key_value += 4;
 
369
                b_value += 4;
 
370
                r = (xtInt4) XT_GET_DISK_4(key_value) - (xtInt4) XT_GET_DISK_4(b_value);
 
371
        }
 
372
        return r;
 
373
}
 
374
 
 
375
xtPublic int xt_compare_3_int4(XTIndexPtr XT_UNUSED(ind), uint key_length, xtWord1 *key_value, xtWord1 *b_value)
 
376
{
 
377
        int r;
 
378
 
 
379
        ASSERT_NS(key_length == 4 || key_length == 8 || key_length == 12);
 
380
        r = (xtInt4) XT_GET_DISK_4(key_value) - (xtInt4) XT_GET_DISK_4(b_value);
 
381
        if (r == 0 && key_length > 4) {
 
382
                key_value += 4;
 
383
                b_value += 4;
 
384
                r = (xtInt4) XT_GET_DISK_4(key_value) - (xtInt4) XT_GET_DISK_4(b_value);
 
385
                if (r == 0 && key_length > 8) {
 
386
                        key_value += 4;
 
387
                        b_value += 4;
 
388
                        r = (xtInt4) XT_GET_DISK_4(key_value) - (xtInt4) XT_GET_DISK_4(b_value);
 
389
                }
 
390
        }
 
391
        return r;
 
392
}
 
393
 
 
394
/*
 
395
 * -----------------------------------------------------------------------
 
396
 * Tree branch scanning (searching nodes and leaves)
 
397
 */
 
398
 
 
399
/* Binary-search a branch whose key is a single 4-byte integer
 * (HA_KEYTYPE_LONG_INT or HA_KEYTYPE_ULONG_INT), with the comparison
 * inlined for speed. See the long comment above xt_scan_branch_fix()
 * for the duplicate-handling semantics: a zero comparison is steered
 * to -1 or +1 so the scan lands either at the start of a duplicate
 * run or just after the key, and only an exact record match (whole-key
 * search) returns early.
 *
 * Results are written into *result: found/duplicate flags, the record
 * and row IDs at the landing item, the child branch reference (valid
 * only when this branch is a node), and the item offset.
 */
xtPublic void xt_scan_branch_single(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
{
	XT_NODE_TEMP;
	u_int				branch_size;
	u_int				node_ref_size;
	u_int				full_item_size;
	int					search_flags;
	register xtWord1	*base;
	register u_int		i;
	register xtWord1	*bitem;
	u_int				total_count;

	branch_size = XT_GET_DISK_2(branch->tb_size_2);
	/* Nodes carry a child reference before each item; leaves do not. */
	node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;

	result->sr_found = FALSE;
	result->sr_duplicate = FALSE;
	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);

	result->sr_item.i_item_size = ind->mi_key_size + XT_RECORD_REF_SIZE;
	full_item_size = result->sr_item.i_item_size + node_ref_size;
	result->sr_item.i_node_ref_size = node_ref_size;

	search_flags = value->sv_flags;
	base = branch->tb_data + node_ref_size;
	total_count = (result->sr_item.i_total_size - node_ref_size) / full_item_size;
	if (search_flags & XT_SEARCH_FIRST_FLAG)
		i = 0;
	else if (search_flags & XT_SEARCH_AFTER_LAST_FLAG)
		i = total_count;
	else {
		register u_int		guess;
		register u_int		count;
		register xtInt4		r;
		xtRecordID			key_record;

		key_record = value->sv_rec_id;
		count = total_count;

		ASSERT_NS(ind);
		i = 0;
		/* Binary search over the fixed-size items. */
		while (i < count) {
			guess = (i + count - 1) >> 1;

			bitem = base + guess * full_item_size;

			switch (ind->mi_single_type) {
				case HA_KEYTYPE_LONG_INT: {
					register xtInt4 a, b;
					
					a = XT_GET_DISK_4(value->sv_key);
					b = XT_GET_DISK_4(bitem);
					r = (a < b) ? -1 : (a == b ? 0 : 1);
					break;
				}
				case HA_KEYTYPE_ULONG_INT: {
					register xtWord4 a, b;
					
					a = XT_GET_DISK_4(value->sv_key);
					b = XT_GET_DISK_4(bitem);
					r = (a < b) ? -1 : (a == b ? 0 : 1);
					break;
				}
				default:
					/* Should not happen: */
					r = 1;
					break;
			}
			if (r == 0) {
				if (search_flags & XT_SEARCH_WHOLE_KEY) {
					xtRecordID	item_record;
					xtRowID		row_id;
					
					xt_get_record_ref(bitem + ind->mi_key_size, &item_record, &row_id);

					/* This should not happen because we should never
					 * try to insert the same record twice into the 
					 * index!
					 */
					result->sr_duplicate = TRUE;
					if (key_record == item_record) {
						/* Exact key + record match: report and stop. */
						result->sr_found = TRUE;
						result->sr_rec_id = item_record;
						result->sr_row_id = row_id;
						result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);
						result->sr_item.i_item_offset = node_ref_size + guess * full_item_size;
						return;
					}
					/* Same key, different record: order by record ID. */
					if (key_record < item_record)
						r = -1;
					else
						r = 1;
				}
				else {
					result->sr_found = TRUE;
					/* -1 causes a search to the beginning of the duplicate list of keys.
					 * 1 causes a search to just after the key.
					*/
					if (search_flags & XT_SEARCH_AFTER_KEY)
						r = 1;
					else
						r = -1;
				}
			}

			if (r < 0)
				count = guess;
			else
				i = guess + 1;
		}
	}

	/* Report the landing position. */
	bitem = base + i * full_item_size;
	xt_get_res_record_ref(bitem + ind->mi_key_size, result);
	result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);			/* Only valid if this is a node. */
	result->sr_item.i_item_offset = node_ref_size + i * full_item_size;
#ifdef IND_SKEW_SPLIT_ON_APPEND
	if (i != total_count)
		result->sr_last_item = FALSE;
#endif
}
 
521
 
 
522
/*
 
523
 * We use a special binary search here. It basically assumes that the values
 
524
 * in the index are not unique.
 
525
 *
 
526
 * Even if they are unique, when we search for part of a key, then it is
 
527
 * effectively the case.
 
528
 *
 
529
 * So in the situation where we find duplicates in the index we usually
 
530
 * want to position ourselves at the beginning of the duplicate list.
 
531
 *
 
532
 * Alternatively a search can find the position just after a given key.
 
533
 *
 
534
 * To achieve this we make the following modifications:
 
535
 * - The result of the comparison is always returns 1 or -1. We only stop
 
536
 *   the search early in the case of an exact match when inserting (but this
 
537
 *   should not happen anyway).
 
538
 * - The search never actually fails, but sets 'found' to TRUE if it
 
539
 *   sees the search key in the index.
 
540
 *
 
541
 * If the search value exists in the index we know that
 
542
 * this method will take us to the first occurrence of the key in the
 
543
 * index (in the case of -1) or to the first value after the
 
544
 * search key in the case of 1.
 
545
 */
 
546
/* Binary-search a branch with fixed-size items, comparing keys with the
 * general myxt_compare_key(). The duplicate-handling semantics are
 * described in the comment immediately above this function: a zero
 * comparison is steered to -1 (land at the start of a duplicate run)
 * or +1 (land just after the key), and only an exact record match
 * during a whole-key search returns early.
 *
 * Results are written into *result: found/duplicate flags, the record
 * and row IDs at the landing item, the child branch reference (valid
 * only when this branch is a node), and the item offset.
 */
xtPublic void xt_scan_branch_fix(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
{
	XT_NODE_TEMP;
	u_int				branch_size;
	u_int				node_ref_size;
	u_int				full_item_size;
	int					search_flags;
	xtWord1				*base;
	register u_int		i;
	xtWord1				*bitem;
	u_int				total_count;

	branch_size = XT_GET_DISK_2(branch->tb_size_2);
	/* Nodes carry a child reference before each item; leaves do not. */
	node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;

	result->sr_found = FALSE;
	result->sr_duplicate = FALSE;
	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);

	result->sr_item.i_item_size = ind->mi_key_size + XT_RECORD_REF_SIZE;
	full_item_size = result->sr_item.i_item_size + node_ref_size;
	result->sr_item.i_node_ref_size = node_ref_size;

	search_flags = value->sv_flags;
	base = branch->tb_data + node_ref_size;
	total_count = (result->sr_item.i_total_size - node_ref_size) / full_item_size;
	if (search_flags & XT_SEARCH_FIRST_FLAG)
		i = 0;
	else if (search_flags & XT_SEARCH_AFTER_LAST_FLAG)
		i = total_count;
	else {
		register u_int		guess;
		register u_int		count;
		xtRecordID			key_record;
		int					r;

		key_record = value->sv_rec_id;
		count = total_count;

		ASSERT_NS(ind);
		i = 0;
		/* Binary search over the fixed-size items. */
		while (i < count) {
			guess = (i + count - 1) >> 1;

			bitem = base + guess * full_item_size;

			r = myxt_compare_key(ind, search_flags, value->sv_length, value->sv_key, bitem);

			if (r == 0) {
				if (search_flags & XT_SEARCH_WHOLE_KEY) {
					xtRecordID	item_record;
					xtRowID		row_id;

					xt_get_record_ref(bitem + ind->mi_key_size, &item_record, &row_id);

					/* This should not happen because we should never
					 * try to insert the same record twice into the 
					 * index!
					 */
					result->sr_duplicate = TRUE;
					if (key_record == item_record) {
						/* Exact key + record match: report and stop. */
						result->sr_found = TRUE;
						result->sr_rec_id = item_record;
						result->sr_row_id = row_id;
						result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);
						result->sr_item.i_item_offset = node_ref_size + guess * full_item_size;
						return;
					}
					/* Same key, different record: order by record ID. */
					if (key_record < item_record)
						r = -1;
					else
						r = 1;
				}
				else {
					result->sr_found = TRUE;
					/* -1 causes a search to the beginning of the duplicate list of keys.
					 * 1 causes a search to just after the key.
					*/
					if (search_flags & XT_SEARCH_AFTER_KEY)
						r = 1;
					else
						r = -1;
				}
			}

			if (r < 0)
				count = guess;
			else
				i = guess + 1;
		}
	}

	/* Report the landing position. */
	bitem = base + i * full_item_size;
	xt_get_res_record_ref(bitem + ind->mi_key_size, result);
	result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);			/* Only valid if this is a node. */
	result->sr_item.i_item_offset = node_ref_size + i * full_item_size;
#ifdef IND_SKEW_SPLIT_ON_APPEND
	if (i != total_count)
		result->sr_last_item = FALSE;
#endif
}
 
648
 
 
649
/* Binary-search a branch with fixed-size items using the index's
 * simple comparison callback (ind->mi_simple_comp_key) instead of the
 * general myxt_compare_key(). Otherwise identical in structure and
 * semantics to xt_scan_branch_fix() — see the comment above that
 * function for the duplicate-handling rules.
 *
 * Results are written into *result: found/duplicate flags, the record
 * and row IDs at the landing item, the child branch reference (valid
 * only when this branch is a node), and the item offset.
 */
xtPublic void xt_scan_branch_fix_simple(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
{
	XT_NODE_TEMP;
	u_int				branch_size;
	u_int				node_ref_size;
	u_int				full_item_size;
	int					search_flags;
	xtWord1				*base;
	register u_int		i;
	xtWord1				*bitem;
	u_int				total_count;

	branch_size = XT_GET_DISK_2(branch->tb_size_2);
	/* Nodes carry a child reference before each item; leaves do not. */
	node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;

	result->sr_found = FALSE;
	result->sr_duplicate = FALSE;
	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);

	result->sr_item.i_item_size = ind->mi_key_size + XT_RECORD_REF_SIZE;
	full_item_size = result->sr_item.i_item_size + node_ref_size;
	result->sr_item.i_node_ref_size = node_ref_size;

	search_flags = value->sv_flags;
	base = branch->tb_data + node_ref_size;
	total_count = (result->sr_item.i_total_size - node_ref_size) / full_item_size;
	if (search_flags & XT_SEARCH_FIRST_FLAG)
		i = 0;
	else if (search_flags & XT_SEARCH_AFTER_LAST_FLAG)
		i = total_count;
	else {
		register u_int		guess;
		register u_int		count;
		xtRecordID			key_record;
		int					r;

		key_record = value->sv_rec_id;
		count = total_count;

		ASSERT_NS(ind);
		i = 0;
		/* Binary search over the fixed-size items. */
		while (i < count) {
			guess = (i + count - 1) >> 1;

			bitem = base + guess * full_item_size;

			r = ind->mi_simple_comp_key(ind, value->sv_length, value->sv_key, bitem);

			if (r == 0) {
				if (search_flags & XT_SEARCH_WHOLE_KEY) {
					xtRecordID	item_record;
					xtRowID		row_id;

					xt_get_record_ref(bitem + ind->mi_key_size, &item_record, &row_id);

					/* This should not happen because we should never
					 * try to insert the same record twice into the 
					 * index!
					 */
					result->sr_duplicate = TRUE;
					if (key_record == item_record) {
						/* Exact key + record match: report and stop. */
						result->sr_found = TRUE;
						result->sr_rec_id = item_record;
						result->sr_row_id = row_id;
						result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);
						result->sr_item.i_item_offset = node_ref_size + guess * full_item_size;
						return;
					}
					/* Same key, different record: order by record ID. */
					if (key_record < item_record)
						r = -1;
					else
						r = 1;
				}
				else {
					result->sr_found = TRUE;
					/* -1 causes a search to the beginning of the duplicate list of keys.
					 * 1 causes a search to just after the key.
					*/
					if (search_flags & XT_SEARCH_AFTER_KEY)
						r = 1;
					else
						r = -1;
				}
			}

			if (r < 0)
				count = guess;
			else
				i = guess + 1;
		}
	}

	/* Report the landing position. */
	bitem = base + i * full_item_size;
	xt_get_res_record_ref(bitem + ind->mi_key_size, result);
	result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);			/* Only valid if this is a node. */
	result->sr_item.i_item_offset = node_ref_size + i * full_item_size;
#ifdef IND_SKEW_SPLIT_ON_APPEND
	if (i != total_count)
		result->sr_last_item = FALSE;
#endif
}
 
751
 
 
752
/*
 * Variable length key values are stored as a sorted list. Since each list item has a variable length, we
 * must scan the list sequentially in order to find a key.
 *
 * On return 'result' is positioned on the first item >= the search key
 * (or at the end of the branch data). sr_found is set when an equal key
 * was seen; sr_duplicate when an equal key (whole-key search) was seen.
 */
xtPublic void xt_scan_branch_var(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
{
	XT_NODE_TEMP;
	u_int			branch_size;
	u_int			node_ref_size;
	int				search_flags;
	xtWord1			*base;
	xtWord1			*bitem;
	u_int			ilen;
	xtWord1			*bend;

	branch_size = XT_GET_DISK_2(branch->tb_size_2);
	/* Nodes carry child references; leaves do not. */
	node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;

	result->sr_found = FALSE;
	result->sr_duplicate = FALSE;
	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);

	result->sr_item.i_node_ref_size = node_ref_size;

	search_flags = value->sv_flags;
	base = branch->tb_data + node_ref_size;
	bitem = base;
	bend = &branch->tb_data[result->sr_item.i_total_size];
	ilen = 0;
	/* Empty branch: position at the end. */
	if (bitem >= bend)
		goto done_ok;

	if (search_flags & XT_SEARCH_FIRST_FLAG)
		/* Position on the first item. */
		ilen = myxt_get_key_length(ind, bitem);
	else if (search_flags & XT_SEARCH_AFTER_LAST_FLAG) {
		/* Position just past the last item. */
		bitem = bend;
		ilen = 0;
	}
	else {
		xtRecordID	key_record;
		int			r;

		key_record = value->sv_rec_id;

		ASSERT_NS(ind);
		/* Keys have variable length, so no binary search is possible:
		 * scan sequentially. */
		while (bitem < bend) {
			ilen = myxt_get_key_length(ind, bitem);
			r = myxt_compare_key(ind, search_flags, value->sv_length, value->sv_key, bitem);
			if (r == 0) {
				if (search_flags & XT_SEARCH_WHOLE_KEY) {
					xtRecordID	item_record;
					xtRowID		row_id;

					xt_get_record_ref(bitem + ilen, &item_record, &row_id);

					/* This should not happen because we should never
					 * try to insert the same record twice into the 
					 * index!
					 */
					result->sr_duplicate = TRUE;
					if (key_record == item_record) {
						/* Exact key and record reference match: done. */
						result->sr_found = TRUE;
						result->sr_item.i_item_size = ilen + XT_RECORD_REF_SIZE;
						result->sr_rec_id = item_record;
						result->sr_row_id = row_id;
						result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);
						result->sr_item.i_item_offset = bitem - branch->tb_data;
						return;
					}
					/* Equal keys: order duplicates by record ID. */
					if (key_record < item_record)
						r = -1;
					else
						r = 1;
				}
				else {
					result->sr_found = TRUE;
					/* -1 causes a search to the beginning of the duplicate list of keys.
					 * 1 causes a search to just after the key.
					*/
					if (search_flags & XT_SEARCH_AFTER_KEY)
						r = 1;
					else
						r = -1;
				}
			}
			if (r <= 0)
				break;
			bitem += ilen + XT_RECORD_REF_SIZE + node_ref_size;
		}
	}

	done_ok:
	result->sr_item.i_item_size = ilen + XT_RECORD_REF_SIZE;
	xt_get_res_record_ref(bitem + ilen, result);
	result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);			/* Only valid if this is a node. */
	result->sr_item.i_item_offset = bitem - branch->tb_data;
#ifdef IND_SKEW_SPLIT_ON_APPEND
	if (bitem != bend)
		result->sr_last_item = FALSE;
#endif
}
 
854
 
 
855
/* Go to the next item in the node.
 * Advances result->sr_item past the current item (and the node reference
 * that follows it, if any) and loads the record reference at the new
 * position. Past the last item, size/rec_id/row_id are zeroed.
 */
static void idx_next_branch_item(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result)
{
	XT_NODE_TEMP;
	xtWord1 *bitem;
	u_int	ilen;

	result->sr_item.i_item_offset += result->sr_item.i_item_size + result->sr_item.i_node_ref_size;
	bitem = branch->tb_data + result->sr_item.i_item_offset;
	if (result->sr_item.i_item_offset < result->sr_item.i_total_size) {
		if (ind->mi_fix_key)
			/* Fixed length keys: every item has the same size. */
			ilen = result->sr_item.i_item_size;
		else {
			/* Variable length keys: read the size of the new item. */
			ilen = myxt_get_key_length(ind, bitem) + XT_RECORD_REF_SIZE;
			result->sr_item.i_item_size = ilen;
		}
		xt_get_res_record_ref(bitem + ilen - XT_RECORD_REF_SIZE, result); /* (Only valid if i_item_offset < i_total_size) */
	}
	else {
		/* Stepped past the last item. */
		result->sr_item.i_item_size = 0;
		result->sr_rec_id = 0;
		result->sr_row_id = 0;
	}
	if (result->sr_item.i_node_ref_size)
		/* IDX_GET_NODE_REF() loads the branch reference to the LEFT of the item. */
		result->sr_branch = IDX_GET_NODE_REF(tab, bitem, result->sr_item.i_node_ref_size);
	else
		result->sr_branch = 0;
}
 
884
 
 
885
xtPublic void xt_prev_branch_item_fix(XTTableHPtr XT_UNUSED(tab), XTIndexPtr XT_UNUSED(ind), XTIdxBranchDPtr branch, register XTIdxResultRec *result)
 
886
{
 
887
        XT_NODE_TEMP;
 
888
        ASSERT_NS(result->sr_item.i_item_offset >= result->sr_item.i_item_size + result->sr_item.i_node_ref_size + result->sr_item.i_node_ref_size);
 
889
        result->sr_item.i_item_offset -= (result->sr_item.i_item_size + result->sr_item.i_node_ref_size);
 
890
        xt_get_res_record_ref(branch->tb_data + result->sr_item.i_item_offset + result->sr_item.i_item_size - XT_RECORD_REF_SIZE, result); /* (Only valid if i_item_offset < i_total_size) */
 
891
        result->sr_branch = IDX_GET_NODE_REF(tab, branch->tb_data + result->sr_item.i_item_offset, result->sr_item.i_node_ref_size);
 
892
}
 
893
 
 
894
xtPublic void xt_prev_branch_item_var(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result)
 
895
{
 
896
        XT_NODE_TEMP;
 
897
        xtWord1 *bitem;
 
898
        xtWord1 *bend;
 
899
        u_int   ilen;
 
900
 
 
901
        bitem = branch->tb_data + result->sr_item.i_node_ref_size;
 
902
        bend = &branch->tb_data[result->sr_item.i_item_offset];
 
903
        for (;;) {
 
904
                ilen = myxt_get_key_length(ind, bitem);
 
905
                if (bitem + ilen + XT_RECORD_REF_SIZE + result->sr_item.i_node_ref_size >= bend)
 
906
                        break;
 
907
                bitem += ilen + XT_RECORD_REF_SIZE + result->sr_item.i_node_ref_size;
 
908
        }
 
909
 
 
910
        result->sr_item.i_item_size = ilen + XT_RECORD_REF_SIZE;
 
911
        xt_get_res_record_ref(bitem + ilen, result); /* (Only valid if i_item_offset < i_total_size) */
 
912
        result->sr_branch = IDX_GET_NODE_REF(tab, bitem, result->sr_item.i_node_ref_size);
 
913
        result->sr_item.i_item_offset = bitem - branch->tb_data;
 
914
}
 
915
 
 
916
/*
 * Re-read the total size of a fixed length key branch (it may have
 * changed), clamp the current item offset to the new size, and re-load
 * the record reference at the current position.
 */
static void idx_reload_item_fix(XTIndexPtr XT_NDEBUG_UNUSED(ind), XTIdxBranchDPtr branch, register XTIdxResultPtr result)
{
	u_int branch_size;

	branch_size = XT_GET_DISK_2(branch->tb_size_2);
	/* The node/leaf status and the item size must not have changed: */
	ASSERT_NS(result->sr_item.i_node_ref_size == (XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0));
	ASSERT_NS(result->sr_item.i_item_size == ind->mi_key_size + XT_RECORD_REF_SIZE);
	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
	if (result->sr_item.i_item_offset > result->sr_item.i_total_size)
		result->sr_item.i_item_offset = result->sr_item.i_total_size;
	xt_get_res_record_ref(&branch->tb_data[result->sr_item.i_item_offset + result->sr_item.i_item_size - XT_RECORD_REF_SIZE], result); 
}
 
928
 
 
929
static void idx_first_branch_item(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
 
930
{
 
931
        XT_NODE_TEMP;
 
932
        u_int branch_size;
 
933
        u_int node_ref_size;
 
934
        u_int key_data_size;
 
935
 
 
936
        branch_size = XT_GET_DISK_2(branch->tb_size_2);
 
937
        node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;
 
938
 
 
939
        result->sr_found = FALSE;
 
940
        result->sr_duplicate = FALSE;
 
941
        result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
 
942
        ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);
 
943
 
 
944
        if (ind->mi_fix_key)
 
945
                key_data_size = ind->mi_key_size;
 
946
        else {
 
947
                xtWord1 *bitem;
 
948
 
 
949
                bitem = branch->tb_data + node_ref_size;
 
950
                if (bitem < &branch->tb_data[result->sr_item.i_total_size])
 
951
                        key_data_size = myxt_get_key_length(ind, bitem);
 
952
                else
 
953
                        key_data_size = 0;
 
954
        }
 
955
 
 
956
        result->sr_item.i_item_size = key_data_size + XT_RECORD_REF_SIZE;
 
957
        result->sr_item.i_node_ref_size = node_ref_size;
 
958
 
 
959
        xt_get_res_record_ref(branch->tb_data + node_ref_size + key_data_size, result);
 
960
        result->sr_branch = IDX_GET_NODE_REF(tab, branch->tb_data + node_ref_size, node_ref_size); /* Only valid if this is a node. */
 
961
        result->sr_item.i_item_offset = node_ref_size;
 
962
}
 
963
 
 
964
/*
 
965
 * Last means different things for leaf or node!
 
966
 */
 
967
xtPublic void xt_last_branch_item_fix(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
 
968
{
 
969
        XT_NODE_TEMP;
 
970
        u_int branch_size;
 
971
        u_int node_ref_size;
 
972
 
 
973
        branch_size = XT_GET_DISK_2(branch->tb_size_2);
 
974
        node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;
 
975
 
 
976
        result->sr_found = FALSE;
 
977
        result->sr_duplicate = FALSE;
 
978
        result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
 
979
        ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);
 
980
 
 
981
        result->sr_item.i_item_size = ind->mi_key_size + XT_RECORD_REF_SIZE;
 
982
        result->sr_item.i_node_ref_size = node_ref_size;
 
983
 
 
984
        if (node_ref_size) {
 
985
                result->sr_item.i_item_offset = result->sr_item.i_total_size;
 
986
                result->sr_branch = IDX_GET_NODE_REF(tab, branch->tb_data + result->sr_item.i_item_offset, node_ref_size);
 
987
        }
 
988
        else {
 
989
                if (result->sr_item.i_total_size) {
 
990
                        result->sr_item.i_item_offset = result->sr_item.i_total_size - result->sr_item.i_item_size;
 
991
                        xt_get_res_record_ref(branch->tb_data + result->sr_item.i_item_offset + ind->mi_key_size, result);
 
992
                }
 
993
                else
 
994
                        /* Leaf is empty: */
 
995
                        result->sr_item.i_item_offset = 0;
 
996
        }
 
997
}
 
998
 
 
999
xtPublic void xt_last_branch_item_var(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
 
1000
{
 
1001
        XT_NODE_TEMP;
 
1002
        u_int   branch_size;
 
1003
        u_int   node_ref_size;
 
1004
 
 
1005
        branch_size = XT_GET_DISK_2(branch->tb_size_2);
 
1006
        node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;
 
1007
 
 
1008
        result->sr_found = FALSE;
 
1009
        result->sr_duplicate = FALSE;
 
1010
        result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
 
1011
        ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);
 
1012
 
 
1013
        result->sr_item.i_node_ref_size = node_ref_size;
 
1014
 
 
1015
        if (node_ref_size) {
 
1016
                result->sr_item.i_item_offset = result->sr_item.i_total_size;
 
1017
                result->sr_branch = IDX_GET_NODE_REF(tab, branch->tb_data + result->sr_item.i_item_offset, node_ref_size);
 
1018
                result->sr_item.i_item_size = 0;
 
1019
        }
 
1020
        else {
 
1021
                if (result->sr_item.i_total_size) {
 
1022
                        xtWord1 *bitem;
 
1023
                        u_int   ilen;
 
1024
                        xtWord1 *bend;
 
1025
 
 
1026
                        bitem = branch->tb_data + node_ref_size;;
 
1027
                        bend = &branch->tb_data[result->sr_item.i_total_size];
 
1028
                        ilen = 0;
 
1029
                        if (bitem < bend) {
 
1030
                                for (;;) {
 
1031
                                        ilen = myxt_get_key_length(ind, bitem);
 
1032
                                        if (bitem + ilen + XT_RECORD_REF_SIZE + node_ref_size >= bend)
 
1033
                                                break;
 
1034
                                        bitem += ilen + XT_RECORD_REF_SIZE + node_ref_size;
 
1035
                                }
 
1036
                        }
 
1037
 
 
1038
                        result->sr_item.i_item_offset = bitem - branch->tb_data;
 
1039
                        xt_get_res_record_ref(bitem + ilen, result);
 
1040
                        result->sr_item.i_item_size = ilen + XT_RECORD_REF_SIZE;
 
1041
                }
 
1042
                else {
 
1043
                        /* Leaf is empty: */
 
1044
                        result->sr_item.i_item_offset = 0;
 
1045
                        result->sr_item.i_item_size = 0;
 
1046
                }
 
1047
        }
 
1048
}
 
1049
 
 
1050
xtPublic xtBool xt_idx_lazy_delete_on_leaf(XTIndexPtr ind, XTIndBlockPtr block, xtWord2 branch_size)
 
1051
{
 
1052
        ASSERT_NS(ind->mi_fix_key);
 
1053
        
 
1054
        /* Compact the leaf if more than half the items that fit on the page
 
1055
         * are deleted: */
 
1056
        if (block->cp_del_count >= ind->mi_max_items/2)
 
1057
                return FALSE;
 
1058
 
 
1059
        /* Compact the page if there is only 1 (or less) valid item left: */
 
1060
        if ((u_int) block->cp_del_count+1 >= ((u_int) branch_size - 2)/(ind->mi_key_size + XT_RECORD_REF_SIZE))
 
1061
                return FALSE;
 
1062
 
 
1063
        return OK;
 
1064
}
 
1065
 
 
1066
static xtBool idx_lazy_delete_on_node(XTIndexPtr ind, XTIndBlockPtr block, register XTIdxItemPtr item)
 
1067
{
 
1068
        ASSERT_NS(ind->mi_fix_key);
 
1069
        
 
1070
        /* Compact the node if more than 1/4 of the items that fit on the page
 
1071
         * are deleted: */
 
1072
        if (block->cp_del_count >= ind->mi_max_items/4)
 
1073
                return FALSE;
 
1074
 
 
1075
        /* Compact the page if there is only 1 (or less) valid item left: */
 
1076
        if ((u_int) block->cp_del_count+1 >= (item->i_total_size - item->i_node_ref_size)/(item->i_item_size + item->i_node_ref_size))
 
1077
                return FALSE;
 
1078
 
 
1079
        return OK;
 
1080
}
 
1081
 
 
1082
inline static xtBool idx_cmp_item_key_fix(XTIndReferencePtr iref, register XTIdxItemPtr item, XTIdxKeyValuePtr value)
 
1083
{
 
1084
        xtWord1 *data;
 
1085
 
 
1086
        data = &iref->ir_branch->tb_data[item->i_item_offset];
 
1087
        return memcmp(data, value->sv_key, value->sv_length) == 0;
 
1088
}
 
1089
 
 
1090
/*
 * Overwrite the key and record reference of a fixed length item in
 * place, and mark the cache block as updated.
 */
inline static void idx_set_item_key_fix(XTIndReferencePtr iref, register XTIdxItemPtr item, XTIdxKeyValuePtr value)
{
	xtWord1 *data;

	data = &iref->ir_branch->tb_data[item->i_item_offset];
	memcpy(data, value->sv_key, value->sv_length);
	xt_set_val_record_ref(data + value->sv_length, value);
#ifdef IND_OPT_DATA_WRITTEN
	/* Extend the block's dirty byte range to cover the modified bytes.
	 * NOTE(review): xt_set_val_record_ref() also writes bytes after the
	 * key, but cb_max_pos is only extended by sv_length here — confirm
	 * the record reference bytes are accounted for elsewhere. */
	if (item->i_item_offset < iref->ir_block->cb_min_pos)
		iref->ir_block->cb_min_pos = item->i_item_offset;
	if (item->i_item_offset + value->sv_length > iref->ir_block->cb_max_pos)
		iref->ir_block->cb_max_pos = item->i_item_offset + value->sv_length;
	ASSERT_NS(iref->ir_block->cb_max_pos <= XT_INDEX_PAGE_SIZE-2);
	ASSERT_NS(iref->ir_block->cb_min_pos <= iref->ir_block->cb_max_pos);
#endif
	iref->ir_updated = TRUE;
}
 
1107
 
 
1108
/*
 * Set the row ID field of an item's record reference.
 *
 * Returns FAILED (after releasing the page) if the block is being
 * flushed and writing its old image to the index log fails; OK
 * otherwise.
 */
static xtBool idx_set_item_row_id(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item, xtRowID row_id)
{
	register XTIndBlockPtr	block = iref->ir_block;
	size_t					offset;
	xtWord1					*data;

	/* A block being flushed must have its old image saved to the index
	 * log before it is modified: */
	if (block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
		ASSERT_NS(ot->ot_table->tab_ind_flush_ilog);
		if (!ot->ot_table->tab_ind_flush_ilog->il_write_block(ot, block)) {
			xt_ind_release(ot, ind, iref->ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, iref);
			return FAILED;
		}
	}

	offset = 
		/* This is the offset of the reference in the item we found: */
		item->i_item_offset +item->i_item_size - XT_RECORD_REF_SIZE +
		/* This is the offset of the row id in the reference: */
		XT_RECORD_ID_SIZE;
	data = &iref->ir_branch->tb_data[offset];

	/* This update does not change the structure of page, so we do it without
	 * copying the page before we write.
	 */
	XT_SET_DISK_4(data, row_id);
#ifdef IND_OPT_DATA_WRITTEN
	/* Extend the block's dirty byte range to cover the 4 byte row ID. */
	if (offset < block->cb_min_pos)
		block->cb_min_pos = offset;
	if (offset + XT_ROW_ID_SIZE > block->cb_max_pos)
		block->cb_max_pos = offset + XT_ROW_ID_SIZE;
	ASSERT_NS(block->cb_max_pos <= XT_INDEX_PAGE_SIZE-2);
	ASSERT_NS(block->cb_min_pos <= iref->ir_block->cb_max_pos);
#endif
	iref->ir_updated = TRUE;
	return OK;
}
 
1144
 
 
1145
inline static xtBool idx_is_item_deleted(register XTIdxBranchDPtr branch, register XTIdxItemPtr item)
 
1146
{
 
1147
        xtWord1 *data;
 
1148
 
 
1149
        data = &branch->tb_data[item->i_item_offset + item->i_item_size - XT_RECORD_REF_SIZE + XT_RECORD_ID_SIZE];
 
1150
        return XT_GET_DISK_4(data) == (xtRowID) -1;
 
1151
}
 
1152
 
 
1153
/*
 * Mark an item as lazily deleted by setting its row ID to -1, and
 * count the deletion on the cache block.
 */
static xtBool idx_set_item_deleted(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item)
{
	if (!idx_set_item_row_id(ot, ind, iref, item, (xtRowID) -1))
		return FAILED;
	
	/* This should be safe because there is only one thread,
	 * the sweeper, that does this!
	 *
	 * Threads that decrement this value have an xlock on
	 * the page, or the index.
	 */
	iref->ir_block->cp_del_count++;
	return OK;
}
 
1167
 
 
1168
/*
 * {LAZY-DEL-INDEX-ITEMS}
 * Do a lazy delete of an item by just setting the Row ID
 * to the delete indicator: row ID -1.
 */
static xtBool idx_lazy_delete_branch_item(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item)
{
	/* On failure the page has already been released by the callee. */
	if (!idx_set_item_deleted(ot, ind, iref, item))
		return FAILED;
	/* Release the page, recording that it was updated. */
	xt_ind_release(ot, ind, iref->ir_xlock ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_R_UPDATE, iref);
	return OK;
}
 
1180
 
 
1181
/*
 * This function compacts the leaf, but preserves the
 * position of the item.
 *
 * All items whose row ID is the lazy-delete marker (-1) are removed by
 * copying the surviving items down in place. The page is NOT released
 * here; the caller still holds it on return (both on OK and FAILED the
 * failure paths release it themselves).
 */
static xtBool idx_compact_leaf(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item)
{
	register XTIndBlockPtr		block = iref->ir_block;
	register XTIdxBranchDPtr	branch = iref->ir_branch;
	int		item_idx, count, i, idx;
	u_int	size;
	xtWord1 *s_data;
	xtWord1 *d_data;
	xtWord1 *data;
	xtRowID row_id;

	/* A block being flushed must have its old image saved to the index
	 * log before it is modified: */
	if (block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
		ASSERT_NS(ot->ot_table->tab_ind_flush_ilog);
		if (!ot->ot_table->tab_ind_flush_ilog->il_write_block(ot, block)) {
			xt_ind_release(ot, ind, iref->ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, iref);
			return FAILED;
		}
	}

	/* Readers may hold handles on this page: copy before writing. */
	if (block->cb_handle_count) {
		if (!xt_ind_copy_on_write(iref)) {
			xt_ind_release(ot, ind, iref->ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, iref);
			return FAILED;
		}
	}

	/* Fixed length key leaves only: */
	ASSERT_NS(!item->i_node_ref_size);
	ASSERT_NS(ind->mi_fix_key);
	size = item->i_item_size;
	count = item->i_total_size / size;
	item_idx = item->i_item_offset / size;
	/* Copy surviving items down over the deleted ones, tracking where
	 * the current item lands: */
	s_data = d_data = branch->tb_data;
	idx = 0;
	for (i=0; i<count; i++) {
		data = s_data + item->i_item_size - XT_RECORD_REF_SIZE + XT_RECORD_ID_SIZE;
		row_id = XT_GET_DISK_4(data);
		if (row_id == (xtRowID) -1) {
			/* A deleted item before the current one shifts it down by one: */
			if (idx < item_idx)
				item_idx--;
		}
		else {
			if (d_data != s_data)
				memcpy(d_data, s_data, size);
			d_data += size;
			idx++;
		}
		s_data += size;
	}
	block->cp_del_count = 0;
	item->i_total_size = d_data - branch->tb_data;
	ASSERT_NS(idx * size == item->i_total_size);
	item->i_item_offset = item_idx * size;
	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, 0));
#ifdef IND_OPT_DATA_WRITTEN
	/* The whole page content changed: */
	block->cb_header = TRUE;
	block->cb_min_pos = 0;
	block->cb_max_pos = item->i_total_size;
	ASSERT_NS(block->cb_max_pos <= XT_INDEX_PAGE_SIZE-2);
	ASSERT_NS(block->cb_min_pos <= iref->ir_block->cb_max_pos);
#endif
	iref->ir_updated = TRUE;
	return OK;
}
 
1248
 
 
1249
/*
 * Compact a fixed length key leaf, removing all lazily deleted items
 * AND the current item (it is skipped by the copy loop below).
 * On return 'item' is positioned where the removed item was, i.e. on
 * the item to its right, and the page has been released.
 */
static xtBool idx_lazy_remove_leaf_item_right(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item)
{
	register XTIndBlockPtr		block = iref->ir_block;
	register XTIdxBranchDPtr	branch = iref->ir_branch;
	int		item_idx, count, i;
	u_int	size;
	xtWord1 *s_data;
	xtWord1 *d_data;
	xtWord1 *data;
	xtRowID row_id;

	/* Leaves only: */
	ASSERT_NS(!item->i_node_ref_size);

	/* A block being flushed must have its old image saved to the index
	 * log before it is modified: */
	if (block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
		ASSERT_NS(ot->ot_table->tab_ind_flush_ilog);
		if (!ot->ot_table->tab_ind_flush_ilog->il_write_block(ot, block)) {
			xt_ind_release(ot, ind, iref->ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, iref);
			return FAILED;
		}
	}

	/* Readers may hold handles on this page: copy before writing.
	 * NOTE(review): this failure path releases with XT_UNLOCK_WRITE
	 * unconditionally, while sibling functions use iref->ir_xlock to
	 * choose the unlock mode — confirm a write lock is always held here. */
	if (block->cb_handle_count) {
		if (!xt_ind_copy_on_write(iref)) {
			xt_ind_release(ot, ind, XT_UNLOCK_WRITE, iref);
			return FAILED;
		}
	}

	ASSERT_NS(ind->mi_fix_key);
	size = item->i_item_size;
	count = item->i_total_size / size;
	item_idx = item->i_item_offset / size;
	s_data = d_data = branch->tb_data;
	for (i=0; i<count; i++) {
		if (i == item_idx)
			/* The current item is dropped; record where it would have landed: */
			item->i_item_offset = d_data - branch->tb_data;
		else {
			data = s_data + item->i_item_size - XT_RECORD_REF_SIZE + XT_RECORD_ID_SIZE;
			row_id = XT_GET_DISK_4(data);
			if (row_id != (xtRowID) -1) {
				/* Keep items that are not lazily deleted: */
				if (d_data != s_data)
					memcpy(d_data, s_data, size);
				d_data += size;
			}
		}
		s_data += size;
	}
	block->cp_del_count = 0;
	item->i_total_size = d_data - branch->tb_data;
	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, 0));
#ifdef IND_OPT_DATA_WRITTEN
	/* The whole page content changed: */
	block->cb_header = TRUE;
	block->cb_min_pos = 0;
	block->cb_max_pos = item->i_total_size;
	ASSERT_NS(block->cb_max_pos <= XT_INDEX_PAGE_SIZE-2);
	ASSERT_NS(block->cb_min_pos <= iref->ir_block->cb_max_pos);
#endif
	iref->ir_updated = TRUE;
	xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, iref);
	return OK;
}
 
1310
 
 
1311
/*
 * Remove an item and save to disk.
 * The bytes removed start at the item's own offset and span the item
 * plus (on a node) the node reference that follows it — i.e. the
 * reference to the RIGHT of the item. The page is released on return.
 */
static xtBool idx_remove_branch_item_right(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID, XTIndReferencePtr iref, register XTIdxItemPtr item)
{
	register XTIndBlockPtr		block = iref->ir_block;
	register XTIdxBranchDPtr	branch = iref->ir_branch;
	u_int size = item->i_item_size + item->i_node_ref_size;

	/* A block being flushed must have its old image saved to the index
	 * log before it is modified: */
	if (block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
		ASSERT_NS(ot->ot_table->tab_ind_flush_ilog);
		if (!ot->ot_table->tab_ind_flush_ilog->il_write_block(ot, block)) {
			xt_ind_release(ot, ind, iref->ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, iref);
			return FAILED;
		}
	}

	/* {HANDLE-COUNT-USAGE}
	 * This access is safe because we have the right to update
	 * the page, so no other thread can modify the page.
	 *
	 * This means:
	 * We either have an Xlock on the index, or we have
	 * an Xlock on the cache block.
	 */
	if (block->cb_handle_count) {
		if (!xt_ind_copy_on_write(iref)) {
			xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, iref);
			return FAILED;
		}
	}
	if (ind->mi_lazy_delete) {
		/* Removing an already lazily deleted item cancels its pending count: */
		if (idx_is_item_deleted(branch, item))
			block->cp_del_count--;
	}
	/* Shift the rest of the page down over the item (and, on a node,
	 * the reference to its right): */
	memmove(&branch->tb_data[item->i_item_offset],
		&branch->tb_data[item->i_item_offset + size],
		item->i_total_size - item->i_item_offset - size);
	item->i_total_size -= size;
	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, item->i_node_ref_size));
	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(branch->tb_size_2));
#ifdef IND_OPT_DATA_WRITTEN
	/* Everything from the removed item to the end of the page changed: */
	block->cb_header = TRUE;
	if (item->i_item_offset < block->cb_min_pos)
		block->cb_min_pos = item->i_item_offset;
	block->cb_max_pos = item->i_total_size;
	ASSERT_NS(block->cb_max_pos <= XT_INDEX_PAGE_SIZE-2);
	ASSERT_NS(block->cb_min_pos <= block->cb_max_pos);
#endif
	iref->ir_updated = TRUE;
	xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_W_UPDATE, iref);
	return OK;
}
 
1365
 
 
1366
/*
 * Remove an item together with the node reference to its LEFT, and
 * save to disk. If lazy delete is enabled, *lazy_delete_cleanup_required
 * reports whether the node should be compacted afterwards.
 * The page is released on return.
 */
static xtBool idx_remove_branch_item_left(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID, XTIndReferencePtr iref, register XTIdxItemPtr item, xtBool *lazy_delete_cleanup_required)
{
	register XTIndBlockPtr		block = iref->ir_block;
	register XTIdxBranchDPtr	branch = iref->ir_branch;
	u_int size = item->i_item_size + item->i_node_ref_size;

	/* A block being flushed must have its old image saved to the index
	 * log before it is modified: */
	if (block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
		ASSERT_NS(ot->ot_table->tab_ind_flush_ilog);
		if (!ot->ot_table->tab_ind_flush_ilog->il_write_block(ot, block)) {
			xt_ind_release(ot, ind, iref->ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, iref);
			return FAILED;
		}
	}

	/* Nodes only (there must be a left reference to remove): */
	ASSERT_NS(item->i_node_ref_size);
	if (block->cb_handle_count) {
		if (!xt_ind_copy_on_write(iref)) {
			xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, iref);
			return FAILED;
		}
	}
	if (ind->mi_lazy_delete) {
		/* Removing an already lazily deleted item cancels its pending count: */
		if (idx_is_item_deleted(branch, item))
			block->cp_del_count--;
		if (lazy_delete_cleanup_required)
			*lazy_delete_cleanup_required = idx_lazy_delete_on_node(ind, block, item);
	}
	/* Remove the node reference to the left of the item: */
	memmove(&branch->tb_data[item->i_item_offset - item->i_node_ref_size],
		&branch->tb_data[item->i_item_offset + item->i_item_size],
		item->i_total_size - item->i_item_offset - item->i_item_size);
	item->i_total_size -= size;
	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, item->i_node_ref_size));
	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(branch->tb_size_2));
#ifdef IND_OPT_DATA_WRITTEN
	/* Everything from the removed reference to the end of the page changed: */
	block->cb_header = TRUE;
	if (item->i_item_offset - item->i_node_ref_size < block->cb_min_pos)
		block->cb_min_pos = item->i_item_offset - item->i_node_ref_size;
	block->cb_max_pos = item->i_total_size;
	ASSERT_NS(block->cb_max_pos <= XT_INDEX_PAGE_SIZE-2);
	ASSERT_NS(block->cb_min_pos <= block->cb_max_pos);
#endif
	iref->ir_updated = TRUE;
	xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_W_UPDATE, iref);
	return OK;
}
 
1412
 
 
1413
static void idx_insert_leaf_item(XTIndexPtr XT_UNUSED(ind), XTIdxBranchDPtr leaf, XTIdxKeyValuePtr value, XTIdxResultPtr result)
 
1414
{
 
1415
        xtWord1 *item;
 
1416
 
 
1417
        /* This will ensure we do not overwrite the end of the buffer: */
 
1418
        ASSERT_NS(value->sv_length <= XT_INDEX_MAX_KEY_SIZE);
 
1419
        memmove(&leaf->tb_data[result->sr_item.i_item_offset + value->sv_length + XT_RECORD_REF_SIZE],
 
1420
                &leaf->tb_data[result->sr_item.i_item_offset],
 
1421
                result->sr_item.i_total_size - result->sr_item.i_item_offset);
 
1422
        item = &leaf->tb_data[result->sr_item.i_item_offset];
 
1423
        memcpy(item, value->sv_key, value->sv_length);
 
1424
        xt_set_val_record_ref(item + value->sv_length, value);
 
1425
        result->sr_item.i_total_size += value->sv_length + XT_RECORD_REF_SIZE;
 
1426
        XT_SET_DISK_2(leaf->tb_size_2, XT_MAKE_LEAF_SIZE(result->sr_item.i_total_size));
 
1427
}
 
1428
 
 
1429
static void idx_insert_node_item(XTTableHPtr XT_UNUSED(tab), XTIndexPtr XT_UNUSED(ind), XTIdxBranchDPtr leaf, XTIdxKeyValuePtr value, XTIdxResultPtr result, xtIndexNodeID branch)
 
1430
{
 
1431
        xtWord1 *item;
 
1432
 
 
1433
        /* This will ensure we do not overwrite the end of the buffer: */
 
1434
        ASSERT_NS(value->sv_length <= XT_INDEX_MAX_KEY_SIZE);
 
1435
        memmove(&leaf->tb_data[result->sr_item.i_item_offset + value->sv_length + XT_RECORD_REF_SIZE + result->sr_item.i_node_ref_size],
 
1436
                &leaf->tb_data[result->sr_item.i_item_offset],
 
1437
                result->sr_item.i_total_size - result->sr_item.i_item_offset);
 
1438
        item = &leaf->tb_data[result->sr_item.i_item_offset];
 
1439
        memcpy(item, value->sv_key, value->sv_length);
 
1440
        xt_set_val_record_ref(item + value->sv_length, value);
 
1441
        XT_SET_NODE_REF(tab, item + value->sv_length + XT_RECORD_REF_SIZE, branch);
 
1442
        result->sr_item.i_total_size += value->sv_length + XT_RECORD_REF_SIZE + result->sr_item.i_node_ref_size;
 
1443
        XT_SET_DISK_2(leaf->tb_size_2, XT_MAKE_NODE_SIZE(result->sr_item.i_total_size));
 
1444
}
 
1445
 
 
1446
/*
 * Locate the "middle" item of a branch page, for use as the separator key
 * when the page is split. The chosen item's offset/size are stored into
 * result->sr_item, and its key + record reference are copied into value
 * (value->sv_key must point to a buffer of at least XT_INDEX_MAX_KEY_SIZE).
 *
 * With IND_SKEW_SPLIT_ON_APPEND, when result->sr_last_item is set (the
 * insert was at the end of the page), the split point is skewed to the
 * item before last, which keeps pages full under append-order inserts.
 *
 * Returns OK, or FAILED (index corruption detected while scanning
 * variable-length keys).
 */
static xtBool idx_get_middle_branch_item(XTOpenTablePtr ot, XTIndexPtr ind, XTIdxBranchDPtr branch, XTIdxKeyValuePtr value, XTIdxResultPtr result)
{
	xtWord1 *bitem;

	ASSERT_NS(result->sr_item.i_node_ref_size == 0 || result->sr_item.i_node_ref_size == XT_NODE_REF_SIZE);
	/* The page may be in an overflowed state, hence up to 2 pages of data: */
	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE*2);
	if (ind->mi_fix_key) {
		/* Fixed-length keys: the middle item can be computed directly. */
		u_int full_item_size = result->sr_item.i_item_size + result->sr_item.i_node_ref_size;

		/* Round down to an item boundary at the half-way point: */
		result->sr_item.i_item_offset = ((result->sr_item.i_total_size - result->sr_item.i_node_ref_size)
			/ full_item_size / 2 * full_item_size) + result->sr_item.i_node_ref_size;
#ifdef IND_SKEW_SPLIT_ON_APPEND
		if (result->sr_last_item) {
			u_int offset;
			
			offset = result->sr_item.i_total_size - full_item_size * 2;
			/* We actually split at the item before last! */
			if (offset > result->sr_item.i_item_offset)
				result->sr_item.i_item_offset = offset;
		}
#endif

		/* Copy out the separator key and its record reference: */
		bitem = &branch->tb_data[result->sr_item.i_item_offset];
		value->sv_flags = XT_SEARCH_WHOLE_KEY;
		value->sv_length = result->sr_item.i_item_size - XT_RECORD_REF_SIZE;
		xt_get_record_ref(bitem + value->sv_length, &value->sv_rec_id, &value->sv_row_id);
		memcpy(value->sv_key, bitem, value->sv_length);
	}
	else {
		/* Variable-length keys: walk items until the byte-wise middle of
		 * the page is reached. */
		u_int	node_ref_size;
		u_int	ilen, tlen;
		xtWord1 *bend;

		node_ref_size = result->sr_item.i_node_ref_size;
		bitem = branch->tb_data + node_ref_size;
		bend = &branch->tb_data[(result->sr_item.i_total_size - node_ref_size) / 2 + node_ref_size];
#ifdef IND_SKEW_SPLIT_ON_APPEND
		/* Appending: aim for the end of the page instead of the middle. */
		if (result->sr_last_item)
			bend = &branch->tb_data[XT_INDEX_PAGE_DATA_SIZE];

		u_int	prev_ilen = 0;
		xtWord1 *prev_bitem = NULL;
#endif
		ilen = 0;
		if (bitem < bend) {
			tlen = 0;
			for (;;) {
				ilen = myxt_get_key_length(ind, bitem);
				tlen += ilen + XT_RECORD_REF_SIZE + node_ref_size;
				if (bitem + ilen + XT_RECORD_REF_SIZE + node_ref_size >= bend) {
					/* Sanity check before accepting this item as the
					 * split point -- a bad key length means the page is
					 * corrupt: */
					if (ilen > XT_INDEX_PAGE_SIZE || tlen > result->sr_item.i_total_size) {
						xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
						return FAILED;
					}
					break;
				}
#ifdef IND_SKEW_SPLIT_ON_APPEND
				prev_ilen = ilen;
				prev_bitem = bitem;
#endif
				bitem += ilen + XT_RECORD_REF_SIZE + node_ref_size;
			}
		}

#ifdef IND_SKEW_SPLIT_ON_APPEND
		/* We actually want the item before last! */
		if (result->sr_last_item && prev_bitem) {
			bitem = prev_bitem;
			ilen = prev_ilen;
		}
#endif
		result->sr_item.i_item_offset = bitem - branch->tb_data;
		result->sr_item.i_item_size = ilen + XT_RECORD_REF_SIZE;

		/* Copy out the separator key and its record reference: */
		value->sv_flags = XT_SEARCH_WHOLE_KEY;
		value->sv_length = ilen;
		xt_get_record_ref(bitem + ilen, &value->sv_rec_id, &value->sv_row_id);
		memcpy(value->sv_key, bitem, value->sv_length);
	}
	return OK;
}
 
1527
 
 
1528
static size_t idx_write_branch_item(XTIndexPtr XT_UNUSED(ind), xtWord1 *item, XTIdxKeyValuePtr value)
 
1529
{
 
1530
        memcpy(item, value->sv_key, value->sv_length);
 
1531
        xt_set_val_record_ref(item + value->sv_length, value);
 
1532
        return value->sv_length + XT_RECORD_REF_SIZE;
 
1533
}
 
1534
 
 
1535
/*
 * Replace the key of an item in a node page with a (possibly larger) key.
 * If the replacement causes the page to overflow, the node is split: a new
 * branch is allocated for the upper half, and the middle key is inserted
 * into the parent via idx_insert_node().
 *
 * ot        - Open table context.
 * ind       - The index being modified.
 * item      - Stack entry identifying the node page and item position.
 * stack     - The branch stack (popped back to the parents of this node).
 * item_size - Size of the replacement item.
 * item_buf  - The replacement item data.
 *
 * Returns OK, or FAILED with the page lock released.
 */
static xtBool idx_replace_node_key(XTOpenTablePtr ot, XTIndexPtr ind, IdxStackItemPtr item, IdxBranchStackPtr stack, u_int item_size, xtWord1 *item_buf)
{
	XTIndReferenceRec	iref;
	xtIndexNodeID		new_branch;
	XTIdxResultRec		result;
	xtIndexNodeID		current = item->i_branch;
	u_int				new_size;
	XTIdxBranchDPtr		new_branch_ptr;
	XTIdxKeyValueRec	key_value;
	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE];

#ifdef DEBUG
	/* Sentinel values to catch use of an unset reference in debug builds: */
	iref.ir_xlock = 2;
	iref.ir_updated = 2;
#endif
	if (!xt_ind_fetch(ot, ind, current, XT_LOCK_WRITE, &iref))
		return FAILED;

	/* If the block is being flushed, log its current image first: */
	if (iref.ir_block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
		ASSERT_NS(ot->ot_table->tab_ind_flush_ilog);
		if (!ot->ot_table->tab_ind_flush_ilog->il_write_block(ot, iref.ir_block))
			goto failed_1;
	}

	/* Readers hold handles on this block; clone it before modifying: */
	if (iref.ir_block->cb_handle_count) {
		if (!xt_ind_copy_on_write(&iref))
			goto failed_1;
	}

	if (ind->mi_lazy_delete) {
		ASSERT_NS(item_size == item->i_pos.i_item_size);
		/* Maintain the lazily-deleted count across the replacement: */
		if (idx_is_item_deleted(iref.ir_branch, &item->i_pos))
			iref.ir_block->cp_del_count--;
	}
	/* Shift the tail of the page to make the item's slot exactly
	 * item_size bytes, then copy the new item in: */
	memmove(&iref.ir_branch->tb_data[item->i_pos.i_item_offset + item_size],
		&iref.ir_branch->tb_data[item->i_pos.i_item_offset + item->i_pos.i_item_size],
		item->i_pos.i_total_size - item->i_pos.i_item_offset - item->i_pos.i_item_size);
	memcpy(&iref.ir_branch->tb_data[item->i_pos.i_item_offset],
		item_buf, item_size);
	if (ind->mi_lazy_delete) {
		if (idx_is_item_deleted(iref.ir_branch, &item->i_pos))
			iref.ir_block->cp_del_count++;
	}
	item->i_pos.i_total_size = item->i_pos.i_total_size + item_size - item->i_pos.i_item_size;
	XT_SET_DISK_2(iref.ir_branch->tb_size_2, XT_MAKE_NODE_SIZE(item->i_pos.i_total_size));
	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(iref.ir_branch->tb_size_2));
#ifdef IND_OPT_DATA_WRITTEN
	/* Track the dirty byte range of the page: */
	iref.ir_block->cb_header = TRUE;
	if (item->i_pos.i_item_offset < iref.ir_block->cb_min_pos)
		iref.ir_block->cb_min_pos = item->i_pos.i_item_offset;
	iref.ir_block->cb_max_pos = item->i_pos.i_total_size;
	ASSERT_NS(iref.ir_block->cb_min_pos <= iref.ir_block->cb_max_pos);
#endif
	iref.ir_updated = TRUE;

#ifdef DEBUG
	/* With lazy delete the replacement must never overflow the page: */
	if (ind->mi_lazy_delete)
		ASSERT_NS(item->i_pos.i_total_size <= XT_INDEX_PAGE_DATA_SIZE);
#endif
	if (item->i_pos.i_total_size <= XT_INDEX_PAGE_DATA_SIZE)
		return xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);

	/* The node has overflowed!! */
#ifdef IND_SKEW_SPLIT_ON_APPEND
	result.sr_last_item = FALSE;
#endif
	result.sr_item = item->i_pos;

	/* Adjust the stack (we want the parents of the delete node): */
	for (;;) {
		if (idx_pop(stack) == item)
			break;
	}		

	/* We assume that value can be overwritten (which is the case) */
	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
	key_value.sv_key = key_buf;
	if (!idx_get_middle_branch_item(ot, ind, iref.ir_branch, &key_value, &result))
		goto failed_1;

	if (!idx_new_branch(ot, ind, &new_branch))
		goto failed_1;

	/* Split the node: upper half goes to the new branch. */
	new_size = result.sr_item.i_total_size - result.sr_item.i_item_offset - result.sr_item.i_item_size;
	// TODO: Are 2 buffers now required?
	new_branch_ptr = (XTIdxBranchDPtr) &ot->ot_ind_wbuf.tb_data[XT_INDEX_PAGE_DATA_SIZE];
	memmove(new_branch_ptr->tb_data, &iref.ir_branch->tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size], new_size);

	XT_SET_DISK_2(new_branch_ptr->tb_size_2, XT_MAKE_NODE_SIZE(new_size));
	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(new_branch), (int) XT_GET_DISK_2(new_branch_ptr->tb_size_2));
	if (!xt_ind_write(ot, ind, new_branch, offsetof(XTIdxBranchDRec, tb_data) + new_size, (xtWord1 *) new_branch_ptr))
		goto failed_2;

	/* Change the size of the old branch: */
	XT_SET_DISK_2(iref.ir_branch->tb_size_2, XT_MAKE_NODE_SIZE(result.sr_item.i_item_offset));
	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(iref.ir_branch->tb_size_2));
#ifdef IND_OPT_DATA_WRITTEN
	iref.ir_block->cb_header = TRUE;
	if (result.sr_item.i_item_offset < iref.ir_block->cb_min_pos)
		iref.ir_block->cb_min_pos = result.sr_item.i_item_offset;
	iref.ir_block->cb_max_pos = result.sr_item.i_item_offset;
	ASSERT_NS(iref.ir_block->cb_max_pos <= XT_INDEX_PAGE_DATA_SIZE);
	ASSERT_NS(iref.ir_block->cb_min_pos <= iref.ir_block->cb_max_pos);
#endif
	iref.ir_updated = TRUE;
	xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);

	/* Insert the new branch into the parent node, using the new middle key value: */
	if (!idx_insert_node(ot, ind, stack, FALSE, &key_value, new_branch)) {
		/* 
		 * TODO: Mark the index as corrupt.
		 * This should not fail because everything has been
		 * preallocated.
		 * However, if it does fail the index
		 * will be corrupt.
		 * I could modify and release the branch above,
		 * after this point.
		 * But that would mean holding the lock longer,
		 * and also may not help because idx_insert_node()
		 * is recursive.
		 */
		idx_free_branch(ot, ind, new_branch);
		return FAILED;
	}

	return OK;

	failed_2:
	idx_free_branch(ot, ind, new_branch);

	failed_1:
	xt_ind_release(ot, ind, XT_UNLOCK_WRITE, &iref);

	return FAILED;
}
 
1671
 
 
1672
/*
 * -----------------------------------------------------------------------
 * Standard b-tree insert (uses ot->ot_ind_wbuf as a scratch page buffer)
 */
 
1676
 
 
1677
/*
 
1678
 * Insert the given branch into the node on the top of the stack. If the stack
 
1679
 * is empty we need to add a new root.
 
1680
 */
 
1681
static xtBool idx_insert_node(XTOpenTablePtr ot, XTIndexPtr ind, IdxBranchStackPtr stack, xtBool last_item, XTIdxKeyValuePtr key_value, xtIndexNodeID branch)
 
1682
{
 
1683
        IdxStackItemPtr         stack_item;
 
1684
        xtIndexNodeID           new_branch;
 
1685
        size_t                          size;
 
1686
        xtIndexNodeID           current;
 
1687
        XTIndReferenceRec       iref;
 
1688
        XTIdxResultRec          result;
 
1689
        u_int                           new_size;
 
1690
        XTIdxBranchDPtr         new_branch_ptr;
 
1691
#ifdef IND_OPT_DATA_WRITTEN
 
1692
        u_int                           new_min_pos;
 
1693
#endif
 
1694
 
 
1695
#ifdef DEBUG
 
1696
        iref.ir_xlock = 2;
 
1697
        iref.ir_updated = 2;
 
1698
#endif
 
1699
        /* Insert a new branch (key, data)... */
 
1700
        if (!(stack_item = idx_pop(stack))) {
 
1701
                xtWord1 *ditem;
 
1702
 
 
1703
                /* New root */
 
1704
                if (!idx_new_branch(ot, ind, &new_branch))
 
1705
                        goto failed;
 
1706
 
 
1707
                ditem = ot->ot_ind_wbuf.tb_data;
 
1708
                XT_SET_NODE_REF(ot->ot_table, ditem, ind->mi_root);
 
1709
                ditem += XT_NODE_REF_SIZE;
 
1710
                ditem += idx_write_branch_item(ind, ditem, key_value);
 
1711
                XT_SET_NODE_REF(ot->ot_table, ditem, branch);
 
1712
                ditem += XT_NODE_REF_SIZE;
 
1713
                size = ditem - ot->ot_ind_wbuf.tb_data;
 
1714
                XT_SET_DISK_2(ot->ot_ind_wbuf.tb_size_2, XT_MAKE_NODE_SIZE(size));
 
1715
                IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(new_branch), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
 
1716
                if (!xt_ind_write(ot, ind, new_branch, offsetof(XTIdxBranchDRec, tb_data) + size, (xtWord1 *) &ot->ot_ind_wbuf))
 
1717
                        goto failed_2;
 
1718
                ind->mi_root = new_branch;
 
1719
                goto done_ok;
 
1720
        }
 
1721
 
 
1722
        current = stack_item->i_branch;
 
1723
        /* This read does not count (towards ot_ind_reads), because we are only
 
1724
         * counting each loaded page once. We assume that the page is in
 
1725
         * cache, and will remain in cache when we read again below for the
 
1726
         * purpose of update.
 
1727
         */
 
1728
        if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 
1729
                goto failed;
 
1730
        ASSERT_NS(XT_IS_NODE(XT_GET_DISK_2(iref.ir_branch->tb_size_2)));
 
1731
#ifdef IND_SKEW_SPLIT_ON_APPEND
 
1732
        result.sr_last_item = last_item;
 
1733
#endif
 
1734
        ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, key_value, &result);
 
1735
 
 
1736
        if (result.sr_item.i_total_size + key_value->sv_length + XT_RECORD_REF_SIZE + result.sr_item.i_node_ref_size <= XT_INDEX_PAGE_DATA_SIZE) {
 
1737
                if (iref.ir_block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
 
1738
                        ASSERT_NS(ot->ot_table->tab_ind_flush_ilog);
 
1739
                        if (!ot->ot_table->tab_ind_flush_ilog->il_write_block(ot, iref.ir_block))
 
1740
                                goto failed_1;
 
1741
                }
 
1742
 
 
1743
                if (iref.ir_block->cb_handle_count) {
 
1744
                        if (!xt_ind_copy_on_write(&iref))
 
1745
                                goto failed_1;
 
1746
                }
 
1747
 
 
1748
                idx_insert_node_item(ot->ot_table, ind, iref.ir_branch, key_value, &result, branch);
 
1749
                IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
 
1750
#ifdef IND_OPT_DATA_WRITTEN
 
1751
                iref.ir_block->cb_header = TRUE;
 
1752
                if (result.sr_item.i_item_offset < iref.ir_block->cb_min_pos)
 
1753
                        iref.ir_block->cb_min_pos = result.sr_item.i_item_offset;
 
1754
                iref.ir_block->cb_max_pos = result.sr_item.i_total_size;
 
1755
                ASSERT_NS(iref.ir_block->cb_max_pos <= XT_INDEX_PAGE_DATA_SIZE);
 
1756
        ASSERT_NS(iref.ir_block->cb_min_pos <= iref.ir_block->cb_max_pos);
 
1757
#endif
 
1758
                iref.ir_updated = TRUE;
 
1759
                ASSERT_NS(result.sr_item.i_total_size <= XT_INDEX_PAGE_DATA_SIZE);
 
1760
                xt_ind_release(ot, ind, XT_UNLOCK_R_UPDATE, &iref);
 
1761
                goto done_ok;
 
1762
        }
 
1763
 
 
1764
        memcpy(&ot->ot_ind_wbuf, iref.ir_branch, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_total_size);
 
1765
        idx_insert_node_item(ot->ot_table, ind, &ot->ot_ind_wbuf, key_value, &result, branch);
 
1766
        IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
 
1767
        ASSERT_NS(result.sr_item.i_total_size > XT_INDEX_PAGE_DATA_SIZE);
 
1768
#ifdef IND_OPT_DATA_WRITTEN
 
1769
        new_min_pos = result.sr_item.i_item_offset;
 
1770
#endif
 
1771
 
 
1772
        /* We assume that value can be overwritten (which is the case) */
 
1773
        if (!idx_get_middle_branch_item(ot, ind, &ot->ot_ind_wbuf, key_value, &result))
 
1774
                goto failed_1;
 
1775
 
 
1776
        if (!idx_new_branch(ot, ind, &new_branch))
 
1777
                goto failed_1;
 
1778
 
 
1779
        /* Split the node: */
 
1780
        new_size = result.sr_item.i_total_size - result.sr_item.i_item_offset - result.sr_item.i_item_size;
 
1781
        new_branch_ptr = (XTIdxBranchDPtr) &ot->ot_ind_wbuf.tb_data[XT_INDEX_PAGE_DATA_SIZE];
 
1782
        memmove(new_branch_ptr->tb_data, &ot->ot_ind_wbuf.tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size], new_size);
 
1783
 
 
1784
        XT_SET_DISK_2(new_branch_ptr->tb_size_2, XT_MAKE_NODE_SIZE(new_size));
 
1785
        IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(new_branch), (int) XT_GET_DISK_2(new_branch_ptr->tb_size_2));
 
1786
        if (!xt_ind_write(ot, ind, new_branch, offsetof(XTIdxBranchDRec, tb_data) + new_size, (xtWord1 *) new_branch_ptr))
 
1787
                goto failed_2;
 
1788
 
 
1789
        /* Change the size of the old branch: */
 
1790
        XT_SET_DISK_2(ot->ot_ind_wbuf.tb_size_2, XT_MAKE_NODE_SIZE(result.sr_item.i_item_offset));
 
1791
        IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
 
1792
 
 
1793
        if (iref.ir_block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
 
1794
                ASSERT_NS(ot->ot_table->tab_ind_flush_ilog);
 
1795
                if (!ot->ot_table->tab_ind_flush_ilog->il_write_block(ot, iref.ir_block))
 
1796
                        goto failed_2;
 
1797
        }
 
1798
 
 
1799
        if (iref.ir_block->cb_handle_count) {
 
1800
                if (!xt_ind_copy_on_write(&iref))
 
1801
                        goto failed_2;
 
1802
        }
 
1803
 
 
1804
#ifdef IND_OPT_DATA_WRITTEN
 
1805
        if (result.sr_item.i_item_offset < new_min_pos)
 
1806
                new_min_pos = result.sr_item.i_item_offset;
 
1807
#endif
 
1808
        memcpy(iref.ir_branch, &ot->ot_ind_wbuf, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_item_offset);
 
1809
#ifdef IND_OPT_DATA_WRITTEN
 
1810
        iref.ir_block->cb_header = TRUE;
 
1811
        if (new_min_pos < iref.ir_block->cb_min_pos)
 
1812
                iref.ir_block->cb_min_pos = new_min_pos;
 
1813
        iref.ir_block->cb_max_pos = result.sr_item.i_item_offset;
 
1814
        ASSERT_NS(iref.ir_block->cb_max_pos <= XT_INDEX_PAGE_DATA_SIZE);
 
1815
        ASSERT_NS(iref.ir_block->cb_min_pos <= iref.ir_block->cb_max_pos);
 
1816
#endif
 
1817
        iref.ir_updated = TRUE;
 
1818
        xt_ind_release(ot, ind, XT_UNLOCK_R_UPDATE, &iref);
 
1819
 
 
1820
        /* Insert the new branch into the parent node, using the new middle key value: */
 
1821
        if (!idx_insert_node(ot, ind, stack, last_item, key_value, new_branch)) {
 
1822
                // Index may be inconsistant now...
 
1823
                idx_free_branch(ot, ind, new_branch);
 
1824
                goto failed;
 
1825
        }
 
1826
 
 
1827
        done_ok:
 
1828
        return OK;
 
1829
 
 
1830
        failed_2:
 
1831
        idx_free_branch(ot, ind, new_branch);
 
1832
 
 
1833
        failed_1:
 
1834
        xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
 
1835
 
 
1836
        failed:
 
1837
        return FAILED;
 
1838
}
 
1839
 
 
1840
/* Maximum number of tables tracked as flush candidates when the index
 * cache runs out of memory (see idx_out_of_memory_failure()): */
#define IDX_MAX_INDEX_FLUSH_COUNT		10

/* One entry in the "dirtiest tables" candidate list built by
 * idx_out_of_memory_failure(): */
struct IdxTableItem {
	xtTableID		ti_tab_id;			/* ID of the table. */
	u_int			ti_dirty_blocks;	/* Dirty index block count at scan time. */
};
 
1846
 
 
1847
inline u_int idx_dirty_blocks(XTTableHPtr tab)
 
1848
{
 
1849
        XTIndexPtr      *indp;
 
1850
        XTIndexPtr      ind;
 
1851
        u_int           dirty_blocks;
 
1852
 
 
1853
        dirty_blocks = 0;
 
1854
        indp = tab->tab_dic.dic_keys;
 
1855
        for (int i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
 
1856
                ind = *indp;
 
1857
                dirty_blocks += ind->mi_dirty_blocks;
 
1858
        }
 
1859
        return dirty_blocks;
 
1860
}
 
1861
 
 
1862
static xtBool idx_out_of_memory_failure(XTOpenTablePtr ot)
 
1863
{
 
1864
#ifdef XT_TRACK_INDEX_UPDATES
 
1865
        /* If the index has been changed when we run out of memory, we
 
1866
         * will corrupt the index!
 
1867
         */
 
1868
        ASSERT_NS(ot->ot_ind_changed == 0);
 
1869
#endif
 
1870
        if (ot->ot_thread->t_exception.e_xt_err == XT_ERR_NO_INDEX_CACHE) {
 
1871
                u_int block_total = xt_ind_get_blocks();
 
1872
 
 
1873
                /* Flush index and retry! */
 
1874
                xt_clear_exception(ot->ot_thread);
 
1875
 
 
1876
                if (idx_dirty_blocks(ot->ot_table) >= block_total / 4) {
 
1877
                        if (!xt_async_flush_indices(ot->ot_table, FALSE, TRUE, ot->ot_thread))
 
1878
                                return FAILED;
 
1879
                        if (!xt_wait_for_async_task_results(ot->ot_thread))
 
1880
                                return FAILED;
 
1881
                }
 
1882
                else {
 
1883
                        XTDatabaseHPtr  db = ot->ot_table->tab_db;
 
1884
                        IdxTableItem    table_list[IDX_MAX_INDEX_FLUSH_COUNT];
 
1885
                        int                             item_count = 0;
 
1886
                        int                             i;
 
1887
                        u_int                   edx;
 
1888
                        XTTableEntryPtr tab_ptr;
 
1889
                        u_int                   dirty_blocks;
 
1890
                        u_int                   dirty_total = 0;
 
1891
 
 
1892
                        xt_ht_lock(NULL, db->db_tables);
 
1893
                        xt_enum_tables_init(&edx);
 
1894
                        while ((tab_ptr = xt_enum_tables_next(NULL, db, &edx))) {
 
1895
                                if (tab_ptr->te_table) {
 
1896
                                        if (tab_ptr->te_table->tab_ind_flush_task->tk_is_running()) {
 
1897
                                                if (!(dirty_blocks = tab_ptr->te_table->tab_ind_flush_task->fit_dirty_blocks))
 
1898
                                                        dirty_blocks = idx_dirty_blocks(tab_ptr->te_table);
 
1899
                                        }
 
1900
                                        else
 
1901
                                                dirty_blocks = idx_dirty_blocks(tab_ptr->te_table);
 
1902
                                        dirty_total += dirty_blocks;
 
1903
                                        if (dirty_blocks) {
 
1904
                                                for (i=0; i<item_count; i++) {
 
1905
                                                        if (table_list[i].ti_dirty_blocks < dirty_blocks)
 
1906
                                                                break;
 
1907
                                                }
 
1908
                                                if (i < IDX_MAX_INDEX_FLUSH_COUNT) {
 
1909
                                                        int cnt;
 
1910
                                                        
 
1911
                                                        if (item_count < IDX_MAX_INDEX_FLUSH_COUNT) {
 
1912
                                                                cnt = item_count - i;
 
1913
                                                                item_count++;
 
1914
                                                        }
 
1915
                                                        else
 
1916
                                                                cnt = item_count - i - 1;
 
1917
                                                        memmove(&table_list[i], &table_list[i+1], sizeof(IdxTableItem) * cnt);
 
1918
                                                        table_list[i].ti_tab_id = tab_ptr->te_table->tab_id;
 
1919
                                                        table_list[i].ti_dirty_blocks = dirty_blocks;
 
1920
                                                }
 
1921
                                        }
 
1922
                                        if (dirty_total >= block_total / 4)
 
1923
                                                break;
 
1924
                                }
 
1925
                        }
 
1926
                        xt_ht_unlock(NULL, db->db_tables);
 
1927
                        if (dirty_total >= block_total / 4) {
 
1928
                                for (i=0; i<item_count; i++) {
 
1929
                                        if (table_list[i].ti_tab_id == ot->ot_table->tab_id) {
 
1930
                                                if (!xt_async_flush_indices(ot->ot_table, FALSE, TRUE, ot->ot_thread))
 
1931
                                                        return FAILED;
 
1932
                                        }
 
1933
                                        else {
 
1934
                                                XTTableHPtr tab;
 
1935
                                                xtBool          ok;
 
1936
 
 
1937
                                                if ((tab = xt_use_table_by_id_ns(db, table_list[i].ti_tab_id))) {
 
1938
                                                        ok = xt_async_flush_indices(tab, FALSE, TRUE, ot->ot_thread);
 
1939
                                                        xt_heap_release_ns(tab);
 
1940
                                                }
 
1941
                                        }
 
1942
                                }
 
1943
                                if (!xt_wait_for_async_task_results(ot->ot_thread))
 
1944
                                        return FAILED;
 
1945
                        }
 
1946
                }
 
1947
 
 
1948
                return TRUE;
 
1949
        }
 
1950
        return FALSE;
 
1951
}
 
1952
 
 
1953
/*
 
1954
 * Check all the duplicate variation in an index.
 
1955
 * If one of them is visible, then we have a duplicate key
 
1956
 * error.
 
1957
 *
 
1958
 * GOTCHA: This routine must use the write index buffer!
 
1959
 */
 
1960
/*
 * Walk the index looking at every "duplicate variation" of key_value and
 * decide whether any of them is visible to this transaction.
 *
 * Returns OK when no visible duplicate exists (or the index is empty),
 * and FAILED either on a real error or after registering
 * XT_ERR_DUPLICATE_KEY when a committed/own duplicate is found.
 *
 * Locking protocol: pages are taken with XT_LOCK_READ and every exit path
 * must release the currently held page. When a record's fate is undecided
 * (XT_MAYBE) the whole index lock is dropped, we wait for the other
 * transaction, re-take the index WRITE lock and restart from "retry:".
 */
static xtBool idx_check_duplicates(XTOpenTablePtr ot, XTIndexPtr ind, XTIdxKeyValuePtr key_value)
{
	IdxBranchStackRec	stack;
	xtIndexNodeID		current;
	XTIndReferenceRec	iref;
	XTIdxResultRec		result;
	xtBool				on_key = FALSE;
	xtXactID			xn_id;
	int					save_flags;
	XTXactWaitRec		xw;

#ifdef DEBUG
	/* Poison the lock/update flags so misuse is visible in debug builds. */
	iref.ir_xlock = 2;
	iref.ir_updated = 2;
#endif
	retry:
	idx_newstack(&stack);

	/* No root page means an empty index: trivially no duplicates. */
	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
		return OK;

	/* Search with flags cleared (key-only match, ignoring record ref);
	 * the caller's flags are restored before any return below.
	 */
	save_flags = key_value->sv_flags;
	key_value->sv_flags = 0;

	/* Descend from the root to the leaf that would contain the key. */
	while (XT_NODE_ID(current)) {
		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref)) {
			key_value->sv_flags = save_flags;
			return FAILED;
		}
		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, key_value, &result);
		if (result.sr_found)
			/* If we have found the key in a node: */
			on_key = TRUE;
		if (!result.sr_item.i_node_ref_size)
			break;
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		if (!idx_push(&stack, current, &result.sr_item)) {
			key_value->sv_flags = save_flags;
			return FAILED;
		}
		current = result.sr_branch;
	}

	key_value->sv_flags = save_flags;

	/* Key was never matched on the way down: no duplicates at all. */
	if (!on_key) {
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		return OK;
	}

	/* Scan forward over every item with the same key value. */
	for (;;) {
		if (result.sr_item.i_item_offset == result.sr_item.i_total_size) {
			IdxStackItemPtr node;

			/* We are at the end of a leaf node.
			 * Go up the stack to find the start position of the next key.
			 * If we find none, then we are the end of the index.
			 */
			xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
			while ((node = idx_pop(&stack))) {
				if (node->i_pos.i_item_offset < node->i_pos.i_total_size) {
					current = node->i_branch;
					if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
						return FAILED;
					xt_get_res_record_ref(&iref.ir_branch->tb_data[node->i_pos.i_item_offset + node->i_pos.i_item_size - XT_RECORD_REF_SIZE], &result);
					result.sr_item = node->i_pos;
					goto check_value;
				}
			}
			break;
		}

		check_value:
		/* Quit the loop if the key is no longer matched! */
		if (myxt_compare_key(ind, 0, key_value->sv_length, key_value->sv_key, &iref.ir_branch->tb_data[result.sr_item.i_item_offset]) != 0) {
			xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
			break;
		}

		/* {LAZY-DEL-INDEX-ITEMS}: a row-id of -1 marks a lazily deleted
		 * item; it cannot cause a duplicate, skip it.
		 */
		if (ind->mi_lazy_delete) {
			if (result.sr_row_id == (xtRowID) -1)
				goto next_item;
		}

		switch (xt_tab_maybe_committed(ot, result.sr_rec_id, &xn_id, NULL, NULL)) {
			case XT_MAYBE:
				/* Record is not committed, wait for the transaction. */
				xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
				XT_INDEX_UNLOCK(ind, ot);
				xw.xw_xn_id = xn_id;
				if (!xt_xn_wait_for_xact(ot->ot_thread, &xw, NULL)) {
					/* Caller expects the index lock to be held on return. */
					XT_INDEX_WRITE_LOCK(ind, ot);
					return FAILED;
				}
				/* Re-take the lock and re-run the whole check: the tree
				 * may have changed while we were waiting.
				 */
				XT_INDEX_WRITE_LOCK(ind, ot);
				goto retry;
			case XT_ERR:
				/* Error while reading... */
				goto failed;
			case TRUE:
				/* Record is committed or belongs to me, duplicate key: */
				XT_DEBUG_TRACE(("DUPLICATE KEY tx=%d rec=%d\n", (int) ot->ot_thread->st_xact_data->xd_start_xn_id, (int) result.sr_rec_id));
				xt_register_xterr(XT_REG_CONTEXT, XT_ERR_DUPLICATE_KEY);
				goto failed;
			case FALSE:
				/* Record is deleted or rolled-back: */
				break;
		}

		next_item:
		idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);

		if (result.sr_item.i_node_ref_size) {
			/* Go down to the bottom: */
			while (XT_NODE_ID(current)) {
				xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
				if (!idx_push(&stack, current, &result.sr_item))
					return FAILED;
				current = result.sr_branch;
				if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
					return FAILED;
				idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
				if (!result.sr_item.i_node_ref_size)
					break;
			}
		}
	}

	return OK;

	failed:
	/* Error exits from the switch above still hold the page read lock. */
	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
	return FAILED;
}
 
2094
 
 
2095
inline static void idx_still_on_key(XTIndexPtr ind, register XTIdxSearchKeyPtr search_key, register XTIdxBranchDPtr branch, register XTIdxItemPtr item)
 
2096
{
 
2097
        if (search_key && search_key->sk_on_key) {
 
2098
                search_key->sk_on_key = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length,
 
2099
                        search_key->sk_key_value.sv_key, &branch->tb_data[item->i_item_offset]) == 0;
 
2100
        }
 
2101
}
 
2102
 
 
2103
/*
 
2104
 * Insert a value into the given index. Return FALSE if an error occurs.
 
2105
 */
 
2106
/*
 * Insert a key derived from rec_buf into the given index.
 *
 * ot         - the open table, supplies the thread, write buffer and cache.
 * ind        - the index to insert into.
 * row_id     - normally zero on insert; non-zero only during recovery
 *              (see the {INDEX-RECOV_ROWID} note below).
 * rec_id     - the record the key refers to.
 * rec_buf    - the row image the key is built from.
 * bef_buf    - optional before-image; if its key equals the new key,
 *              the duplicate check can be skipped.
 * allow_dups - suppress the unique check even on a unique index.
 *
 * Returns OK on success, FAILED on error (out-of-memory failures are
 * retried transparently via "retry_after_oom"). Registers
 * XT_ERR_DUPLICATE_KEY via idx_check_duplicates() when a visible
 * duplicate exists on a unique index.
 *
 * Locking: the index is first taken with a READ lock; a WRITE
 * (structural) lock is only acquired when the tree structure must change
 * (empty index, or a page split is needed) — see "lock_and_retry".
 */
xtPublic xtBool xt_idx_insert(XTOpenTablePtr ot, XTIndexPtr ind, xtRowID row_id, xtRecordID rec_id, xtWord1 *rec_buf, xtWord1 *bef_buf, xtBool allow_dups)
{
	XTIdxKeyValueRec	key_value;
	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE];
	IdxBranchStackRec	stack;
	xtIndexNodeID		current;
	XTIndReferenceRec	iref;
	xtIndexNodeID		new_branch;
	XTIdxBranchDPtr		new_branch_ptr;
	size_t				size;
	XTIdxResultRec		result;
	size_t				new_size;
	xtBool				check_for_dups = ind->mi_flags & (HA_UNIQUE_CHECK | HA_NOSAME) && !allow_dups;
	xtBool				lock_structure = FALSE;
	xtBool				updated = FALSE;
#ifdef IND_OPT_DATA_WRITTEN
	u_int				new_min_pos;
#endif

#ifdef DEBUG
	/* Poison the lock/update flags so misuse is visible in debug builds. */
	iref.ir_xlock = 2;
	iref.ir_updated = 2;
#endif
#ifdef CHECK_AND_PRINT
	//idx_check_index(ot, ind, TRUE);
#endif

	retry_after_oom:
#ifdef XT_TRACK_INDEX_UPDATES
	ot->ot_ind_changed = 0;
#endif
	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
	key_value.sv_rec_id = rec_id;
	key_value.sv_row_id = row_id;		/* Should always be zero on insert (will be update by sweeper later). 
										 * Non-zero only during recovery, assuming that sweeper will process such records right after recovery.
										 */
	key_value.sv_key = key_buf;
	key_value.sv_length = myxt_create_key_from_row(ind, key_buf, rec_buf, &check_for_dups);

	if (bef_buf && check_for_dups) {
		/* If we have a before image, and we are required to check for duplicates.
		 * then compare the before image key with the after image key.
		 */
		xtWord1 bef_key_buf[XT_INDEX_MAX_KEY_SIZE];
		u_int	len;
		xtBool	has_no_null = TRUE;

		len = myxt_create_key_from_row(ind, bef_key_buf, bef_buf, &has_no_null);
		if (has_no_null) {
			/* If the before key has no null values, then compare with the after key value.
			 * We only have to check for duplicates if the key has changed!
			 */
			check_for_dups = myxt_compare_key(ind, 0, len, bef_key_buf, key_buf) != 0;
		}
	}

	/* The index appears to have no root: */
	if (!XT_NODE_ID(ind->mi_root))
		lock_structure = TRUE;

	lock_and_retry:
	idx_newstack(&stack);

	/* A write lock is only required if we are going to change the
	 * strcuture of the index!
	 */
	if (lock_structure)
		XT_INDEX_WRITE_LOCK(ind, ot);
	else
		XT_INDEX_READ_LOCK(ind, ot);

	retry:
	/* Create a root node if required: */
	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root))) {
		/* Index is empty, create a new one: */
		ASSERT_NS(lock_structure);
		if (!xt_ind_reserve(ot, 1, NULL))
			goto failed;
		if (!idx_new_branch(ot, ind, &new_branch))
			goto failed;
		size = idx_write_branch_item(ind, ot->ot_ind_wbuf.tb_data, &key_value);
		XT_SET_DISK_2(ot->ot_ind_wbuf.tb_size_2, XT_MAKE_LEAF_SIZE(size));
		IDX_TRACE("%d-> %x\n", (int) new_branch, (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
		if (!xt_ind_write(ot, ind, new_branch, offsetof(XTIdxBranchDRec, tb_data) + size, (xtWord1 *) &ot->ot_ind_wbuf))
			goto failed_2;
		ind->mi_root = new_branch;
		goto done_ok;
	}

	/* Search down the tree for the insertion point. */
#ifdef IND_SKEW_SPLIT_ON_APPEND
	result.sr_last_item = TRUE;
#endif
	while (XT_NODE_ID(current)) {
		/* XT_XLOCK_LEAF: only the leaf page is write-locked; interior
		 * nodes are read-locked.
		 */
		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_LEAF, &iref))
			goto failed;
		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
		if (result.sr_duplicate) {
			if (check_for_dups) {
				/* Duplicates are not allowed, at least one has been
				 * found...
				 */

				/* Leaf nodes (i_node_ref_size == 0) are write locked,
				 * non-leaf nodes are read locked.
				 */
				xt_ind_release(ot, ind, result.sr_item.i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, &iref);

				if (!idx_check_duplicates(ot, ind, &key_value))
					goto failed;
				/* We have checked all the "duplicate" variations. None of them are
				 * relevant. So this will cause a correct insert.
				 */
				check_for_dups = FALSE;
				idx_newstack(&stack);
				goto retry;
			}
		}
		if (result.sr_found) {
			/* Node found, can happen during recovery of indexes! 
			 * We have found an exact match of both key and record.
			 */
			XTPageUnlockType	utype;
			xtBool				overwrite = FALSE;

			/* {LAZY-DEL-INDEX-ITEMS}
			 * If the item has been lazy deleted, then just overwrite!
			 */ 
			if (result.sr_row_id == (xtRowID) -1) {
				xtWord2 del_count;

				/* This is safe because we have an xlock on the leaf. */
				if ((del_count = iref.ir_block->cp_del_count))
					iref.ir_block->cp_del_count = del_count-1;
				overwrite = TRUE;
			}

			if (!result.sr_row_id && row_id) {
				/* {INDEX-RECOV_ROWID} Set the row-id
				 * during recovery, even if the index entry
				 * is not committed.
				 * It will be removed later by the sweeper.
				 */
				overwrite = TRUE;
			}

			if (overwrite) {
				if (!idx_set_item_row_id(ot, ind, &iref, &result.sr_item, row_id))
					goto failed;
				utype = result.sr_item.i_node_ref_size ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_W_UPDATE;
			}
			else
				utype = result.sr_item.i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE;
			xt_ind_release(ot, ind, utype, &iref);
			goto done_ok;
		}
		/* Stop when we get to a leaf: */
		if (!result.sr_item.i_node_ref_size)
			break;
		xt_ind_release(ot, ind, result.sr_item.i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, &iref);
		if (!idx_push(&stack, current, NULL))
			goto failed;
		current = result.sr_branch;
	}
	ASSERT_NS(XT_NODE_ID(current));

	/* Must be a leaf!: */
	ASSERT_NS(!result.sr_item.i_node_ref_size);

	updated = FALSE;
	if (ind->mi_lazy_delete && iref.ir_block->cp_del_count) {
		/* There are a number of possibilities:
		 * - We could just replace a lazy deleted slot.
		 * - We could compact and insert.
		 * - We could just insert
		 */

		if (result.sr_item.i_item_offset > 0) {
			/* Check if it can go into the previous node: */
			XTIdxResultRec	t_res;

			t_res.sr_item = result.sr_item;
			xt_prev_branch_item_fix(ot->ot_table, ind, iref.ir_branch, &t_res);
			if (t_res.sr_row_id != (xtRowID) -1)
				goto try_current;

			/* Yup, it can, but first check to see if it would be 
			 * better to put it in the current node.
			 * This is the case if the previous node key is not the
			 * same as the key we are adding...
			 */
			if (result.sr_item.i_item_offset < result.sr_item.i_total_size &&
				result.sr_row_id == (xtRowID) -1) {
				if (!idx_cmp_item_key_fix(&iref, &t_res.sr_item, &key_value))
					goto try_current;
			}

			idx_set_item_key_fix(&iref, &t_res.sr_item, &key_value);
			iref.ir_block->cp_del_count--;
			xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
			goto done_ok;
		}

		try_current:
		if (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
			if (result.sr_row_id == (xtRowID) -1) {
				idx_set_item_key_fix(&iref, &result.sr_item, &key_value);
				iref.ir_block->cp_del_count--;
				xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
				goto done_ok;
			}
		}

		/* Check if we must compact... 
		 * It makes no sense to split as long as there are lazy deleted items
		 * in the page. So, delete them if a split would otherwise be required!
		 */
		ASSERT_NS(key_value.sv_length + XT_RECORD_REF_SIZE == result.sr_item.i_item_size);
		if (result.sr_item.i_total_size + key_value.sv_length + XT_RECORD_REF_SIZE > XT_INDEX_PAGE_DATA_SIZE) {
			if (!idx_compact_leaf(ot, ind, &iref, &result.sr_item))
				goto failed;
			updated = TRUE;
		}

		/* Fall through to the insert code... */
		/* NOTE: if there were no lazy deleted items in the leaf, then
		 * idx_compact_leaf is a NOP. This is the only case in which it may not
		 * fall through and do the insert below.
		 *
		 * Normally, if the cp_del_count is correct then the insert
		 * will work below, and the assertion here will not fail.
		 *
		 * In this case, the xt_ind_release() will correctly indicate an update.
		 */
		ASSERT_NS(result.sr_item.i_total_size + key_value.sv_length + XT_RECORD_REF_SIZE <= XT_INDEX_PAGE_DATA_SIZE);
	}

	/* Common case: the key fits into the leaf, update it in place. */
	if (result.sr_item.i_total_size + key_value.sv_length + XT_RECORD_REF_SIZE <= XT_INDEX_PAGE_DATA_SIZE) {
		/* If the block is being flushed, log its current image first. */
		if (iref.ir_block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
			ASSERT_NS(ot->ot_table->tab_ind_flush_ilog);
			if (!ot->ot_table->tab_ind_flush_ilog->il_write_block(ot, iref.ir_block))
				goto failed_1;
		}

		/* Copy-on-write when other handles still reference the block. */
		if (iref.ir_block->cb_handle_count) {
			if (!xt_ind_copy_on_write(&iref))
				goto failed_1;
		}

		idx_insert_leaf_item(ind, iref.ir_branch, &key_value, &result);
		IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
		ASSERT_NS(result.sr_item.i_total_size <= XT_INDEX_PAGE_DATA_SIZE);
#ifdef IND_OPT_DATA_WRITTEN
		/* Track the dirty range of the block so only the modified part
		 * needs to be written out.
		 */
		iref.ir_block->cb_header = TRUE;
		if (result.sr_item.i_item_offset < iref.ir_block->cb_min_pos)
			iref.ir_block->cb_min_pos = result.sr_item.i_item_offset;
		iref.ir_block->cb_max_pos = result.sr_item.i_total_size;
		ASSERT_NS(iref.ir_block->cb_max_pos <= XT_INDEX_PAGE_DATA_SIZE);
		ASSERT_NS(iref.ir_block->cb_min_pos <= iref.ir_block->cb_max_pos);
#endif
		iref.ir_updated = TRUE;
		xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
		goto done_ok;
	}

	/* Key does not fit. Must split the node.
	 * Make sure we have a structural lock:
	 */
	if (!lock_structure) {
		xt_ind_release(ot, ind, updated ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_WRITE, &iref);
		XT_INDEX_UNLOCK(ind, ot);
		lock_structure = TRUE;
		goto lock_and_retry;
	}

	/* Build the oversized (up to 2 pages) image in the write buffer. */
	memcpy(&ot->ot_ind_wbuf, iref.ir_branch, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_total_size);
	idx_insert_leaf_item(ind, &ot->ot_ind_wbuf, &key_value, &result);
	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
	ASSERT_NS(result.sr_item.i_total_size > XT_INDEX_PAGE_DATA_SIZE && result.sr_item.i_total_size <= XT_INDEX_PAGE_DATA_SIZE*2);
#ifdef IND_OPT_DATA_WRITTEN
	new_min_pos = result.sr_item.i_item_offset;
#endif

	/* This is the number of potential writes. In other words, the total number
	 * of blocks that may be accessed.
	 *
	 * Note that this assume if a block is read and written soon after that the block
	 * will not be freed in-between (a safe assumption?)
	 */
	if (!xt_ind_reserve(ot, stack.s_top * 2 + 3, iref.ir_branch))
		goto failed_1;

	/* Key does not fit, must split... */
	if (!idx_get_middle_branch_item(ot, ind, &ot->ot_ind_wbuf, &key_value, &result))
		goto failed_1;

	if (!idx_new_branch(ot, ind, &new_branch))
		goto failed_1;

	/* Copy and write the rest of the data to the new node: */
	new_size = result.sr_item.i_total_size - result.sr_item.i_item_offset - result.sr_item.i_item_size;
	new_branch_ptr = (XTIdxBranchDPtr) &ot->ot_ind_wbuf.tb_data[XT_INDEX_PAGE_DATA_SIZE];
	memmove(new_branch_ptr->tb_data, &ot->ot_ind_wbuf.tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size], new_size);

	XT_SET_DISK_2(new_branch_ptr->tb_size_2, XT_MAKE_LEAF_SIZE(new_size));
	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(new_branch), (int) XT_GET_DISK_2(new_branch_ptr->tb_size_2));
	if (!xt_ind_write(ot, ind, new_branch, offsetof(XTIdxBranchDRec, tb_data) + new_size, (xtWord1 *) new_branch_ptr))
		goto failed_2;

	/* Modify the first node: */
	XT_SET_DISK_2(ot->ot_ind_wbuf.tb_size_2, XT_MAKE_LEAF_SIZE(result.sr_item.i_item_offset));
	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));

	if (iref.ir_block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
		ASSERT_NS(ot->ot_table->tab_ind_flush_ilog);
		if (!ot->ot_table->tab_ind_flush_ilog->il_write_block(ot, iref.ir_block))
			goto failed_2;
	}

	if (iref.ir_block->cb_handle_count) {
		if (!xt_ind_copy_on_write(&iref))
			goto failed_2;
	}
#ifdef IND_OPT_DATA_WRITTEN
	if (result.sr_item.i_item_offset < new_min_pos)
		new_min_pos = result.sr_item.i_item_offset;
#endif
	memcpy(iref.ir_branch, &ot->ot_ind_wbuf, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_item_offset);
#ifdef IND_OPT_DATA_WRITTEN
	iref.ir_block->cb_header = TRUE;
	if (new_min_pos < iref.ir_block->cb_min_pos)
		iref.ir_block->cb_min_pos = new_min_pos;
	iref.ir_block->cb_max_pos = result.sr_item.i_item_offset;
	ASSERT_NS(iref.ir_block->cb_max_pos <= XT_INDEX_PAGE_DATA_SIZE);
	ASSERT_NS(iref.ir_block->cb_min_pos <= iref.ir_block->cb_max_pos);
#endif
	iref.ir_updated = TRUE;
	xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);

	/* Insert the new branch into the parent node, using the new middle key value: */
	if (!idx_insert_node(ot, ind, &stack, result.sr_last_item, &key_value, new_branch)) {
		// Index may be inconsistant now...
		idx_free_branch(ot, ind, new_branch);
		goto failed;
	}

#ifdef XT_TRACK_INDEX_UPDATES
	ASSERT_NS(ot->ot_ind_reserved >= ot->ot_ind_reads);
#endif

	done_ok:
	XT_INDEX_UNLOCK(ind, ot);

#ifdef DEBUG
	//printf("INSERT OK\n");
	//idx_check_index(ot, ind, TRUE);
#endif
	xt_ind_unreserve(ot);
	return OK;

	/* Failure ladder: each level releases exactly what is still held. */
	failed_2:
	idx_free_branch(ot, ind, new_branch);

	failed_1:
	xt_ind_release(ot, ind, updated ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_WRITE, &iref);

	failed:
	XT_INDEX_UNLOCK(ind, ot);
	if (idx_out_of_memory_failure(ot))
		goto retry_after_oom;

#ifdef DEBUG
	//printf("INSERT FAILED\n");
	//idx_check_index(ot, ind, TRUE);
#endif
	xt_ind_unreserve(ot);
	return FAILED;
}
 
2484
 
 
2485
 
 
2486
/* Remove the given item in the node.
 
2487
 * This is done by going down the tree to find a replacement
 
2488
 * for the deleted item!
 
2489
 */
 
2490
/* Remove the given item in the node.
 * This is done by going down the tree to find a replacement
 * for the deleted item!
 *
 * On entry the stack top ("delete_node") identifies the node item to
 * delete, and iref holds the page containing it. key_value->sv_key is
 * used as scratch space for the replacement key.
 *
 * Returns OK on success, FAILED on error. NOTE(review): on most FAILED
 * paths the currently fetched page has already been released by the
 * failing callee or an explicit release; confirm against the callees'
 * contracts before changing any error path.
 */
static xtBool idx_remove_item_in_node(XTOpenTablePtr ot, XTIndexPtr ind, IdxBranchStackPtr stack, XTIndReferencePtr iref, XTIdxKeyValuePtr key_value)
{
	IdxStackItemPtr		delete_node;
	XTIdxResultRec		result;
	xtIndexNodeID		current;
	xtBool				lazy_delete_cleanup_required = FALSE;
	IdxStackItemPtr		current_top;

	delete_node = idx_top(stack);
	current = delete_node->i_branch;
	result.sr_item = delete_node->i_pos;

	/* Follow the branch after this item: */
	idx_next_branch_item(ot->ot_table, ind, iref->ir_branch, &result);
	xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_READ, iref);

	/* Go down the left-hand side until we reach a leaf: */
	while (XT_NODE_ID(current)) {
		current = result.sr_branch;
		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_LEAF, iref))
			return FAILED;
		idx_first_branch_item(ot->ot_table, ind, iref->ir_branch, &result);
		if (!result.sr_item.i_node_ref_size)
			break;
		xt_ind_release(ot, ind, XT_UNLOCK_READ, iref);
		if (!idx_push(stack, current, &result.sr_item))
			return FAILED;
	}

	ASSERT_NS(XT_NODE_ID(current));
	ASSERT_NS(!result.sr_item.i_node_ref_size);

	/* Reserve enough cache blocks for the worst-case number of page
	 * writes on the way back up the stack.
	 */
	if (!xt_ind_reserve(ot, stack->s_top + 2, iref->ir_branch)) {
		xt_ind_release(ot, ind, XT_UNLOCK_WRITE, iref);
		return FAILED;
	}

	/* This code removes lazy deleted items from the leaf,
	 * before we promote an item to a leaf.
	 * This is not essential, but prevents lazy deleted
	 * items from being propogated up the tree.
	 */
	if (ind->mi_lazy_delete) {
		if (iref->ir_block->cp_del_count) {
			if (!idx_compact_leaf(ot, ind, iref, &result.sr_item))
				return FAILED;
		}
	}

	/* Crawl back up the stack trace, looking for a key
	 * that can be used to replace the deleted key.
	 *
	 * Any empty nodes on the way up can be removed!
	 */
	if (result.sr_item.i_total_size > 0) {
		/* There is a key in the leaf, extract it, and put it in the node: */
		memcpy(key_value->sv_key, &iref->ir_branch->tb_data[result.sr_item.i_item_offset], result.sr_item.i_item_size);
		/* This call also frees the iref.ir_branch page! */
		if (!idx_remove_branch_item_right(ot, ind, current, iref, &result.sr_item))
			return FAILED;
		if (!idx_replace_node_key(ot, ind, delete_node, stack, result.sr_item.i_item_size, key_value->sv_key))
			return FAILED;
		goto done_ok;
	}

	xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_WRITE, iref);

	for (;;) {
		/* The current node/leaf is empty, remove it: */
		idx_free_branch(ot, ind, current);

		current_top = idx_pop(stack);
		current = current_top->i_branch;
		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_LEAF, iref))
			return FAILED;

		if (current_top == delete_node) {
			/* All children have been removed. Delete the key and done: */
			if (!idx_remove_branch_item_right(ot, ind, current, iref, &current_top->i_pos))
				return FAILED;
			goto done_ok;
		}

		if (current_top->i_pos.i_total_size > current_top->i_pos.i_node_ref_size) {
			/* Save the key: */
			memcpy(key_value->sv_key, &iref->ir_branch->tb_data[current_top->i_pos.i_item_offset], current_top->i_pos.i_item_size);
			/* This function also frees the cache page: */
			if (!idx_remove_branch_item_left(ot, ind, current, iref, &current_top->i_pos, &lazy_delete_cleanup_required))
				return FAILED;
			if (!idx_replace_node_key(ot, ind, delete_node, stack, current_top->i_pos.i_item_size, key_value->sv_key))
				return FAILED;
			/* */
			if (lazy_delete_cleanup_required) {
				if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, iref))
					return FAILED;
				if (!idx_remove_lazy_deleted_item_in_node(ot, ind, current, iref, key_value))
					return FAILED;
			}
			goto done_ok;
		}
		xt_ind_release(ot, ind, current_top->i_pos.i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, iref);
	}

	done_ok:
#ifdef XT_TRACK_INDEX_UPDATES
	ASSERT_NS(ot->ot_ind_reserved >= ot->ot_ind_reads);
#endif
	return OK;
}
 
2599
 
 
2600
/*
 * Remove all lazy-deleted items (items whose row ID is the (xtRowID) -1
 * sentinel) from the given index node.
 *
 * This function assumes we have a lock on the structure of the index.
 * On entry the caller holds a reference (iref) to the node 'current';
 * on every return path that reference has been released (either here,
 * or by one of the helpers called — see notes below).
 *
 * Returns OK on success, FAILED on error.
 */
static xtBool idx_remove_lazy_deleted_item_in_node(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID current, XTIndReferencePtr iref, XTIdxKeyValuePtr key_value)
{
	IdxBranchStackRec	stack;
	XTIdxResultRec		result;

	/* Now remove all lazy deleted items in this node.... */
	idx_first_branch_item(ot->ot_table, ind, (XTIdxBranchDPtr) iref->ir_block->cb_data, &result);

	for (;;) {
		/* Scan forward for the next lazy-deleted item in this node: */
		while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
			if (result.sr_row_id == (xtRowID) -1)
				goto remove_item;
			idx_next_branch_item(ot->ot_table, ind, (XTIdxBranchDPtr) iref->ir_block->cb_data, &result);
		}
		/* No more lazy-deleted items: done. */
		break;

		remove_item:

		idx_newstack(&stack);
		if (!idx_push(&stack, current, &result.sr_item)) {
			xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_READ, iref);
			return FAILED;
		}

		/* NOTE(review): on failure here iref is not released in this
		 * function — presumably idx_remove_item_in_node() consumes the
		 * reference on all paths; confirm against its implementation.
		 */
		if (!idx_remove_item_in_node(ot, ind, &stack, iref, key_value))
			return FAILED;

		/* Go back up to the node we are trying to
		 * free of things.
		 */
		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, iref))
			return FAILED;
		/* Load the data again: */
		idx_reload_item_fix(ind, iref->ir_branch, &result);
	}

	xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_READ, iref);
	return OK;
}
 
2642
 
 
2643
/*
 * Delete the given key from the index.
 *
 * The function first descends optimistically under an index read lock;
 * if the delete turns out to require a structural change (removal of a
 * key from a non-leaf node, or cleanup of lazy-deleted items), it
 * releases everything and retries from the top holding the index
 * write (structural) lock — see the 'lock_and_retry' protocol below.
 *
 * Returns OK on success (including "key not found"), FAILED on error.
 */
static xtBool idx_delete(XTOpenTablePtr ot, XTIndexPtr ind, XTIdxKeyValuePtr key_value)
{
	IdxBranchStackRec	stack;
	xtIndexNodeID		current;
	XTIndReferenceRec	iref;
	XTIdxResultRec		result;
	xtBool				lock_structure = FALSE;

#ifdef DEBUG
	/* Poison values so ASSERT_NS checks elsewhere can detect
	 * use of an unset reference: */
	iref.ir_xlock = 2;
	iref.ir_updated = 2;
#endif
	/* The index appears to have no root: */
	if (!XT_NODE_ID(ind->mi_root))
		lock_structure = TRUE;

	lock_and_retry:
	idx_newstack(&stack);

	/* Take the structural write lock only when we already know a
	 * structure change is (or may be) required: */
	if (lock_structure)
		XT_INDEX_WRITE_LOCK(ind, ot);
	else
		XT_INDEX_READ_LOCK(ind, ot);

	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
		goto done_ok;

	/* Descend from the root towards the leaf containing the key: */
	while (XT_NODE_ID(current)) {
		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_DEL_LEAF, &iref))
			goto failed;
		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, key_value, &result);
		if (!result.sr_item.i_node_ref_size) {
			/* A leaf... */
			if (result.sr_found) {
				if (ind->mi_lazy_delete) {
					/* If we have a W lock, then fetch decided that we
					 * need to compact the page.
					 * The decision is made by xt_idx_lazy_delete_on_leaf() 
					 */
					if (!iref.ir_xlock) {
						if (!idx_lazy_delete_branch_item(ot, ind, &iref, &result.sr_item))
							goto failed;
					}
					else {
						if (!iref.ir_block->cp_del_count) {
							if (!idx_remove_branch_item_right(ot, ind, current, &iref, &result.sr_item))
								goto failed;
						}
						else {
							if (!idx_lazy_remove_leaf_item_right(ot, ind, &iref, &result.sr_item))
								goto failed;
						}
					}
				}
				else {
					if (!idx_remove_branch_item_right(ot, ind, current, &iref, &result.sr_item))
						goto failed;
				}
			}
			else
				/* Key not present in the leaf; just release the page. */
				xt_ind_release(ot, ind, iref.ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, &iref);
			goto done_ok;
		}
		if (!idx_push(&stack, current, &result.sr_item)) {
			xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
			goto failed;
		}
		if (result.sr_found)
			/* If we have found the key in a node: */
			break;
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		current = result.sr_branch;
	}

	/* Must be a non-leaf!: */
	ASSERT_NS(result.sr_item.i_node_ref_size);

	if (ind->mi_lazy_delete) {
		if (!idx_lazy_delete_on_node(ind, iref.ir_block, &result.sr_item)) {
			/* We need to remove some items from this node: */

			/* Structural change ahead: upgrade to the write lock by
			 * releasing everything and restarting the descent. */
			if (!lock_structure) {
				xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
				XT_INDEX_UNLOCK(ind, ot);
				lock_structure = TRUE;
				goto lock_and_retry;
			}

			if (!idx_set_item_deleted(ot, ind, &iref, &result.sr_item))
				goto failed;
			if (!idx_remove_lazy_deleted_item_in_node(ot, ind, current, &iref, key_value))
				goto failed;
			goto done_ok;
		}

		if (!ot->ot_table->tab_dic.dic_no_lazy_delete) {
			/* {LAZY-DEL-INDEX-ITEMS}
			 * We just set item to deleted, this is a significant time
			 * saver.
			 * But this item can only be cleaned up when all
			 * items on the node below are deleted.
			 */
			if (!idx_lazy_delete_branch_item(ot, ind, &iref, &result.sr_item))
				goto failed;
			goto done_ok;
		}
	}

	/* We will have to remove the key from a non-leaf node,
	 * which means we are changing the structure of the index.
	 * Make sure we have a structural lock:
	 */
	if (!lock_structure) {
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		XT_INDEX_UNLOCK(ind, ot);
		lock_structure = TRUE;
		goto lock_and_retry;
	}

	/* This is the item we will have to replace: */
	if (!idx_remove_item_in_node(ot, ind, &stack, &iref, key_value))
		goto failed;

	done_ok:
	XT_INDEX_UNLOCK(ind, ot);

#ifdef DEBUG
	//printf("DELETE OK\n");
	//idx_check_index(ot, ind, TRUE);
#endif
	xt_ind_unreserve(ot);
	return OK;

	failed:
	XT_INDEX_UNLOCK(ind, ot);
	xt_ind_unreserve(ot);
	return FAILED;
}
 
2781
 
 
2782
xtPublic xtBool xt_idx_delete(XTOpenTablePtr ot, XTIndexPtr ind, xtRecordID rec_id, xtWord1 *rec_buf)
 
2783
{
 
2784
        XTIdxKeyValueRec        key_value;
 
2785
        xtWord1                         key_buf[XT_INDEX_MAX_KEY_SIZE + XT_MAX_RECORD_REF_SIZE];
 
2786
 
 
2787
        retry_after_oom:
 
2788
#ifdef XT_TRACK_INDEX_UPDATES
 
2789
        ot->ot_ind_changed = 0;
 
2790
#endif
 
2791
 
 
2792
        key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
 
2793
        key_value.sv_rec_id = rec_id;
 
2794
        key_value.sv_row_id = 0;
 
2795
        key_value.sv_key = key_buf;
 
2796
        key_value.sv_length = myxt_create_key_from_row(ind, key_buf, rec_buf, NULL);
 
2797
 
 
2798
        if (!idx_delete(ot, ind, &key_value)) {
 
2799
                if (idx_out_of_memory_failure(ot))
 
2800
                        goto retry_after_oom;
 
2801
                return FAILED;
 
2802
        }
 
2803
        return OK;
 
2804
}
 
2805
 
 
2806
/*
 * Update the row ID stored in an existing index entry for the given
 * record, without changing the structure of the index.
 *
 * The key is rebuilt from the record image, the tree is descended under
 * a read lock, and if the entry is found its row ID is rewritten in
 * place via idx_set_item_row_id(). On out-of-memory failure the whole
 * operation is retried.
 *
 * Returns OK on success (also when the entry was not found),
 * FAILED on a non-recoverable error.
 */
xtPublic xtBool xt_idx_update_row_id(XTOpenTablePtr ot, XTIndexPtr ind, xtRecordID rec_id, xtRowID row_id, xtWord1 *rec_buf)
{
	xtIndexNodeID		current;
	XTIndReferenceRec	iref;
	XTIdxResultRec		result;
	XTIdxKeyValueRec	key_value;
	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE + XT_MAX_RECORD_REF_SIZE];

#ifdef DEBUG
	/* Poison values for debug assertions on unset references: */
	iref.ir_xlock = 2;
	iref.ir_updated = 2;
#endif
#ifdef CHECK_AND_PRINT
	idx_check_index(ot, ind, TRUE);
#endif
	retry_after_oom:
#ifdef XT_TRACK_INDEX_UPDATES
	ot->ot_ind_changed = 0;
#endif
	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
	key_value.sv_rec_id = rec_id;
	key_value.sv_row_id = 0;
	key_value.sv_key = key_buf;
	key_value.sv_length = myxt_create_key_from_row(ind, key_buf, rec_buf, NULL);

	/* NOTE: Only a read lock is required for this!!
	 *
	 * 09.05.2008 - This has changed because the dirty list now
	 * hangs on the index. And the dirty list may be updated
	 * by any change of the index.
	 * However, the advantage is that I should be able to read
	 * lock in the first phase of the flush.
	 *
	 * 18.02.2009 - This has changed again.
	 * I am now using a read lock, because this update does not
	 * require a structural change. In fact, it does not even
	 * need a WRITE LOCK on the page affected, because there
	 * is only ONE thread that can do this (the sweeper).
	 *
	 * This has the advantage that the sweeper (which uses this
	 * function) causes less conflicts.
	 *
	 * However, it does mean that the dirty list must be otherwise
	 * protected (which it now is, by a spin lock - mi_dirty_lock).
	 *
	 * It also has the disadvantage that I am going to have to
	 * take an xlock in the first phase of the flush.
	 */
	XT_INDEX_READ_LOCK(ind, ot);

	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
		goto done_ok;

	/* Descend until the key is found or we hit a leaf: */
	while (XT_NODE_ID(current)) {
		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
			goto failed;
		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
		if (result.sr_found || !result.sr_item.i_node_ref_size)
			break;
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		current = result.sr_branch;
	}

	if (result.sr_found) {
		/* TODO: Check that concurrent reads can handle this!
		 * assuming the write is not atomic.
		 */
		if (!idx_set_item_row_id(ot, ind, &iref, &result.sr_item, row_id))
			goto failed;
		xt_ind_release(ot, ind, XT_UNLOCK_R_UPDATE, &iref);
	}
	else
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);

	done_ok:
	XT_INDEX_UNLOCK(ind, ot);

#ifdef DEBUG
	//idx_check_index(ot, ind, TRUE);
	//idx_check_on_key(ot);
#endif
	return OK;

	failed:
	XT_INDEX_UNLOCK(ind, ot);
	if (idx_out_of_memory_failure(ot))
		goto retry_after_oom;
	return FAILED;
}
 
2895
 
 
2896
xtPublic void xt_idx_prep_key(XTIndexPtr ind, register XTIdxSearchKeyPtr search_key, int flags, xtWord1 *in_key_buf, size_t in_key_length)
 
2897
{
 
2898
        search_key->sk_key_value.sv_flags = flags;
 
2899
        search_key->sk_key_value.sv_rec_id = 0;
 
2900
        search_key->sk_key_value.sv_row_id = 0;
 
2901
        search_key->sk_key_value.sv_key = search_key->sk_key_buf;
 
2902
        search_key->sk_key_value.sv_length = myxt_create_key_from_key(ind, search_key->sk_key_buf, in_key_buf, in_key_length);
 
2903
        search_key->sk_on_key = FALSE;
 
2904
}
 
2905
 
 
2906
/*
 * Re-run the current index search from the saved position.
 *
 * Rebuilds a whole-key search from the item currently referenced by
 * the open table's read handle (ot_ind_rhandle / ot_ind_state), then
 * calls xt_idx_search() with it. The handle is locked only while the
 * key bytes and record reference are copied out of the cached branch.
 */
xtPublic xtBool xt_idx_research(XTOpenTablePtr ot, XTIndexPtr ind)
{
	XTIdxSearchKeyRec search_key;

	xt_ind_lock_handle(ot->ot_ind_rhandle);
	search_key.sk_key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
	/* The record reference is stored in the last XT_RECORD_REF_SIZE
	 * bytes of the item: */
	xt_get_record_ref(&ot->ot_ind_rhandle->ih_branch->tb_data[ot->ot_ind_state.i_item_offset + ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE],
		&search_key.sk_key_value.sv_rec_id, &search_key.sk_key_value.sv_row_id);
	search_key.sk_key_value.sv_key = search_key.sk_key_buf;
	search_key.sk_key_value.sv_length = ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
	search_key.sk_on_key = FALSE;
	/* Copy the key bytes while the handle is still locked: */
	memcpy(search_key.sk_key_buf, &ot->ot_ind_rhandle->ih_branch->tb_data[ot->ot_ind_state.i_item_offset], search_key.sk_key_value.sv_length);
	xt_ind_unlock_handle(ot->ot_ind_rhandle);
	return xt_idx_search(ot, ind, &search_key);
}
 
2921
 
 
2922
/*
 * Search for a given key and position the current pointer on the first
 * key in the list of duplicates. If the key is not found the current
 * pointer is placed at the first position after the key.
 *
 * On success the current position is recorded in ot->ot_curr_rec_id,
 * ot->ot_curr_row_id and ot->ot_ind_state, and ot->ot_ind_rhandle
 * holds a handle on the leaf page for subsequent operations
 * (xt_idx_read/xt_idx_next/...). search_key->sk_on_key is set TRUE
 * if the key was found exactly.
 */
xtPublic xtBool xt_idx_search(XTOpenTablePtr ot, XTIndexPtr ind, register XTIdxSearchKeyPtr search_key)
{
	IdxBranchStackRec	stack;
	xtIndexNodeID		current;
	XTIndReferenceRec	iref;
	XTIdxResultRec		result;

#ifdef DEBUG
	/* Poison values; checked by the ASSERT_NS calls before return: */
	iref.ir_xlock = 2;
	iref.ir_updated = 2;
#endif
	/* Drop any handle left over from a previous search: */
	if (ot->ot_ind_rhandle) {
		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, ot->ot_thread);
		ot->ot_ind_rhandle = NULL;
	}
#ifdef DEBUG
	//idx_check_index(ot, ind, TRUE);
#endif

	/* Calling from recovery, this is not the case.
	 * But the index read does not require a transaction!
	 * Only insert requires this to check for duplicates.
	if (!ot->ot_thread->st_xact_data) {
		xt_register_xterr(XT_REG_CONTEXT, XT_ERR_NO_TRANSACTION);
		return FAILED;
	}
	*/

	retry_after_oom:
#ifdef XT_TRACK_INDEX_UPDATES
	ot->ot_ind_changed = 0;
#endif
	idx_newstack(&stack);

	ot->ot_curr_rec_id = 0;
	ot->ot_curr_row_id = 0;

	XT_INDEX_READ_LOCK(ind, ot);

	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
		goto done_ok;

	/* Descend to the leaf, remembering the path on the stack: */
	while (XT_NODE_ID(current)) {
		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
			goto failed;
		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &search_key->sk_key_value, &result);
		if (result.sr_found)
			/* If we have found the key in a node: */
			search_key->sk_on_key = TRUE;
		if (!result.sr_item.i_node_ref_size)
			break;
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		if (!idx_push(&stack, current, &result.sr_item))
			goto failed;
		current = result.sr_branch;
	}

	if (ind->mi_lazy_delete) {
		ignore_lazy_deleted_items:
		/* Skip over items marked lazy-deleted (row ID == (xtRowID) -1): */
		while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
			if (result.sr_row_id != (xtRowID) -1) {
				idx_still_on_key(ind, search_key, iref.ir_branch, &result.sr_item);
				break;
			}
			idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
		}
	}

	if (result.sr_item.i_item_offset == result.sr_item.i_total_size) {
		IdxStackItemPtr node;

		/* We are at the end of a leaf node.
		 * Go up the stack to find the start position of the next key.
		 * If we find none, then we are the end of the index.
		 */
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		while ((node = idx_pop(&stack))) {
			if (node->i_pos.i_item_offset < node->i_pos.i_total_size) {
				if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
					goto failed;
				xt_get_res_record_ref(&iref.ir_branch->tb_data[node->i_pos.i_item_offset + node->i_pos.i_item_size - XT_RECORD_REF_SIZE], &result);

				if (ind->mi_lazy_delete) {
					result.sr_item = node->i_pos;
					if (result.sr_row_id == (xtRowID) -1) {
						/* If this node pointer is lazy deleted, then
						 * go down the next branch...
						 */
						idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);

						/* Go down to the bottom: */
						current = node->i_branch;
						while (XT_NODE_ID(current)) {
							xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
							if (!idx_push(&stack, current, &result.sr_item))
								goto failed;
							current = result.sr_branch;
							if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
								goto failed;
							idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
							if (!result.sr_item.i_node_ref_size)
								break;
						}

						goto ignore_lazy_deleted_items;
					}
					idx_still_on_key(ind, search_key, iref.ir_branch, &result.sr_item);
				}

				ot->ot_curr_rec_id = result.sr_rec_id;
				ot->ot_curr_row_id = result.sr_row_id;
				ot->ot_ind_state = node->i_pos;

				/* Convert the pointer to a handle which can be used in later operations: */
				ASSERT_NS(!ot->ot_ind_rhandle);
				if (!(ot->ot_ind_rhandle = xt_ind_get_handle(ot, ind, &iref)))
					goto failed;
				/* Keep the node for next operations: */
				/*
				branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref.ir_branch->tb_size_2));
				memcpy(&ot->ot_ind_rbuf, iref.ir_branch, branch_size);
				xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
				*/
				break;
			}
		}
	}
	else {
		ot->ot_curr_rec_id = result.sr_rec_id;
		ot->ot_curr_row_id = result.sr_row_id;
		ot->ot_ind_state = result.sr_item;

		/* Convert the pointer to a handle which can be used in later operations: */
		ASSERT_NS(!ot->ot_ind_rhandle);
		if (!(ot->ot_ind_rhandle = xt_ind_get_handle(ot, ind, &iref)))
			goto failed;
		/* Keep the node for next operations: */
		/*
		branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref.ir_branch->tb_size_2));
		memcpy(&ot->ot_ind_rbuf, iref.ir_branch, branch_size);
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		*/
	}

	done_ok:
	XT_INDEX_UNLOCK(ind, ot);

#ifdef DEBUG
	//idx_check_index(ot, ind, TRUE);
	//idx_check_on_key(ot);
#endif
	ASSERT_NS(iref.ir_xlock == 2);
	ASSERT_NS(iref.ir_updated == 2);
	return OK;

	failed:
	XT_INDEX_UNLOCK(ind, ot);
	if (idx_out_of_memory_failure(ot))
		goto retry_after_oom;
	ASSERT_NS(iref.ir_xlock == 2);
	ASSERT_NS(iref.ir_updated == 2);
	return FAILED;
}
 
3090
 
 
3091
/*
 * Search for a given key and position the current pointer on the entry
 * immediately BEFORE it (the reverse-direction counterpart of
 * xt_idx_search()). If the key is not found, the pointer ends up on
 * the last entry before the insert position.
 *
 * On success the position is recorded in ot->ot_curr_rec_id,
 * ot->ot_curr_row_id and ot->ot_ind_state, and ot->ot_ind_rhandle
 * holds a handle on the page for later operations.
 */
xtPublic xtBool xt_idx_search_prev(XTOpenTablePtr ot, XTIndexPtr ind, register XTIdxSearchKeyPtr search_key)
{
	IdxBranchStackRec	stack;
	xtIndexNodeID		current;
	XTIndReferenceRec	iref;
	XTIdxResultRec		result;

#ifdef DEBUG
	/* Poison values for debug assertions: */
	iref.ir_xlock = 2;
	iref.ir_updated = 2;
#endif
	/* Drop any handle left over from a previous search: */
	if (ot->ot_ind_rhandle) {
		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, ot->ot_thread);
		ot->ot_ind_rhandle = NULL;
	}
#ifdef DEBUG
	//idx_check_index(ot, ind, TRUE);
#endif

	/* see the comment above in xt_idx_search */
	/*
	if (!ot->ot_thread->st_xact_data) {
		xt_register_xterr(XT_REG_CONTEXT, XT_ERR_NO_TRANSACTION);
		return FAILED;
	}
	*/

	retry_after_oom:
#ifdef XT_TRACK_INDEX_UPDATES
	ot->ot_ind_changed = 0;
#endif
	idx_newstack(&stack);

	ot->ot_curr_rec_id = 0;
	ot->ot_curr_row_id = 0;

	XT_INDEX_READ_LOCK(ind, ot);

	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
		goto done_ok;

	/* Descend to the leaf, remembering the path on the stack: */
	while (XT_NODE_ID(current)) {
		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
			goto failed;
		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &search_key->sk_key_value, &result);
		if (result.sr_found)
			/* If we have found the key in a node: */
			search_key->sk_on_key = TRUE;
		if (!result.sr_item.i_node_ref_size)
			break;
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		if (!idx_push(&stack, current, &result.sr_item))
			goto failed;
		current = result.sr_branch;
	}

	if (result.sr_item.i_item_offset == 0) {
		IdxStackItemPtr node;

		search_up_stack:
		/* We are at the start of a leaf node.
		 * Go up the stack to find the start position of the next key.
		 * If we find none, then we are the end of the index.
		 */
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		while ((node = idx_pop(&stack))) {
			if (node->i_pos.i_item_offset > node->i_pos.i_node_ref_size) {
				if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
					goto failed;
				result.sr_item = node->i_pos;
				ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);

				if (ind->mi_lazy_delete) {
					if (result.sr_row_id == (xtRowID) -1) {
						/* Go down to the bottom, in order to scan the leaf backwards: */
						current = node->i_branch;
						while (XT_NODE_ID(current)) {
							xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
							if (!idx_push(&stack, current, &result.sr_item))
								goto failed;
							current = result.sr_branch;
							if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
								goto failed;
							ind->mi_last_item(ot->ot_table, ind, iref.ir_branch, &result);
							if (!result.sr_item.i_node_ref_size)
								break;
						}

						/* If the leaf is empty we have to go up the stack again... */
						if (result.sr_item.i_total_size == 0)
							goto search_up_stack;

						goto scan_back_in_leaf;
					}
				}

				goto record_found;
			}
		}
		goto done_ok;
	}

	/* We must just step once to the left in this leaf node... */
	ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);

	if (ind->mi_lazy_delete) {
		scan_back_in_leaf:
		/* Step backwards over lazy-deleted items: */
		while (result.sr_row_id == (xtRowID) -1) {
			if (result.sr_item.i_item_offset == 0)
				goto search_up_stack;
			ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
		}
		idx_still_on_key(ind, search_key, iref.ir_branch, &result.sr_item);
	}

	record_found:
	ot->ot_curr_rec_id = result.sr_rec_id;
	ot->ot_curr_row_id = result.sr_row_id;
	ot->ot_ind_state = result.sr_item;

	/* Convert to handle for later operations: */
	ASSERT_NS(!ot->ot_ind_rhandle);
	if (!(ot->ot_ind_rhandle = xt_ind_get_handle(ot, ind, &iref)))
		goto failed;
	/* Keep a copy of the node for previous operations... */
	/*
	u_int branch_size;

	branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref.ir_branch->tb_size_2));
	memcpy(&ot->ot_ind_rbuf, iref.ir_branch, branch_size);
	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
	*/

	done_ok:
	XT_INDEX_UNLOCK(ind, ot);

#ifdef DEBUG
	//idx_check_index(ot, ind, TRUE);
	//idx_check_on_key(ot);
#endif
	return OK;

	failed:
	XT_INDEX_UNLOCK(ind, ot);
	if (idx_out_of_memory_failure(ot))
		goto retry_after_oom;
	return FAILED;
}
 
3239
 
 
3240
/*
 
3241
 * Copy the current index value to the record.
 
3242
 */
 
3243
xtPublic xtBool xt_idx_read(XTOpenTablePtr ot, XTIndexPtr ind, xtWord1 *rec_buf)
 
3244
{
 
3245
        xtWord1 *bitem;
 
3246
 
 
3247
#ifdef DEBUG
 
3248
        //idx_check_on_key(ot);
 
3249
#endif
 
3250
        xt_ind_lock_handle(ot->ot_ind_rhandle);
 
3251
        bitem = ot->ot_ind_rhandle->ih_branch->tb_data + ot->ot_ind_state.i_item_offset;
 
3252
        myxt_create_row_from_key(ot, ind, bitem, ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE, rec_buf);
 
3253
        xt_ind_unlock_handle(ot->ot_ind_rhandle);
 
3254
        return OK;
 
3255
}
 
3256
 
 
3257
/*
 * Move the index cursor to the next entry in index order.
 *
 * Fast path: if the current position (ot->ot_ind_state) is in a leaf and
 * the cached handle still references the block, just step to the next item
 * in the same leaf. Otherwise the current key is copied out and the
 * B-tree is re-searched from the root, descending while pushing the path
 * onto a local stack so we can move up when a leaf is exhausted.
 *
 * On success the new position is stored in ot->ot_curr_rec_id,
 * ot->ot_curr_row_id and ot->ot_ind_state (rec_id == 0 means end of
 * index). Returns OK or FAILED; out-of-memory failures are retried.
 */
xtPublic xtBool xt_idx_next(register XTOpenTablePtr ot, register XTIndexPtr ind, register XTIdxSearchKeyPtr search_key)
{
	XTIdxKeyValueRec	key_value;
	xtWord1			key_buf[XT_INDEX_MAX_KEY_SIZE];
	XTIdxResultRec		result;
	IdxBranchStackRec	stack;
	xtIndexNodeID		current;
	XTIndReferenceRec	iref;

#ifdef DEBUG
	/* Poison the lock/update flags so misuse is visible in debug builds: */
	iref.ir_xlock = 2;
	iref.ir_updated = 2;
#endif
	ASSERT_NS(ot->ot_ind_rhandle);
	xt_ind_lock_handle(ot->ot_ind_rhandle);
	result.sr_item = ot->ot_ind_state;
	/* Fast path: still inside a cached leaf block? */
	if (!result.sr_item.i_node_ref_size && 
		result.sr_item.i_item_offset < result.sr_item.i_total_size && 
		ot->ot_ind_rhandle->ih_cache_reference) {
		XTIdxItemRec prev_item;

		key_value.sv_key = &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset];
		key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;

		prev_item = result.sr_item;
		idx_next_branch_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);

		if (ind->mi_lazy_delete) {
			/* Skip over lazily deleted items (row id == -1): */
			while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
				if (result.sr_row_id != (xtRowID) -1)
					break;
				prev_item = result.sr_item;
				idx_next_branch_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);
			}
		}

		if (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
			/* Still on key? */
			idx_still_on_key(ind, search_key, ot->ot_ind_rhandle->ih_branch, &result.sr_item);
			xt_ind_unlock_handle(ot->ot_ind_rhandle);
			goto checked_on_key;
		}

		/* End of leaf reached; fall back to a full re-search from
		 * the last valid position: */
		result.sr_item = prev_item;
	}

	/* Slow path: remember the current key, release the handle, and
	 * re-search the tree from the root. */
	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
	xt_get_record_ref(&ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size - XT_RECORD_REF_SIZE], &key_value.sv_rec_id, &key_value.sv_row_id);
	key_value.sv_key = key_buf;
	key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;
	memcpy(key_buf, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset], key_value.sv_length);
	xt_ind_release_handle(ot->ot_ind_rhandle, TRUE, ot->ot_thread);
	ot->ot_ind_rhandle = NULL;

	retry_after_oom:
#ifdef XT_TRACK_INDEX_UPDATES
	ot->ot_ind_changed = 0;
#endif
	idx_newstack(&stack);

	XT_INDEX_READ_LOCK(ind, ot);

	/* An empty index has no next key: */
	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root))) {
		XT_INDEX_UNLOCK(ind, ot);
		return OK;
	}

	while (XT_NODE_ID(current)) {
		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
			goto failed;
		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
		if (result.sr_item.i_node_ref_size) {
			if (result.sr_found) {
				/* If we have found the key in a node: */
				idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);

				/* Go down to the bottom: */
				while (XT_NODE_ID(current)) {
					xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
					if (!idx_push(&stack, current, &result.sr_item))
						goto failed;
					current = result.sr_branch;
					if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
						goto failed;
					idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
					if (!result.sr_item.i_node_ref_size)
						break;
				}

				/* Is the leaf not empty, then we are done... */
				break;
			}
		}
		else {
			/* We have reached the leaf. */
			if (result.sr_found)
				/* If we have found the key in a leaf: */
				idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
			/* If we did not find the key (although we should have). Our
			 * position is automatically the next one.
			 */
			break;
		}
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		if (!idx_push(&stack, current, &result.sr_item))
			goto failed;
		current = result.sr_branch;
	}

	if (ind->mi_lazy_delete) {
		ignore_lazy_deleted_items:
		/* Skip items marked deleted (row id == -1) in this leaf: */
		while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
			if (result.sr_row_id != (xtRowID) -1)
				break;
			idx_next_branch_item(NULL, ind, iref.ir_branch, &result);
		}
	}

	/* Check the current position in a leaf: */
	if (result.sr_item.i_item_offset == result.sr_item.i_total_size) {
		/* At the end: */
		IdxStackItemPtr node;

		/* We are at the end of a leaf node.
		 * Go up the stack to find the start position of the next key.
		 * If we find none, then we are the end of the index.
		 */
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		while ((node = idx_pop(&stack))) {
			if (node->i_pos.i_item_offset < node->i_pos.i_total_size) {
				if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
					goto failed;
				result.sr_item = node->i_pos;
				xt_get_res_record_ref(&iref.ir_branch->tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size - XT_RECORD_REF_SIZE], &result);

				if (ind->mi_lazy_delete) {
					if (result.sr_row_id == (xtRowID) -1) {
						/* If this node pointer is lazy deleted, then
						 * go down the next branch...
						 */
						idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);

						/* Go down to the bottom: */
						current = node->i_branch;
						while (XT_NODE_ID(current)) {
							xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
							if (!idx_push(&stack, current, &result.sr_item))
								goto failed;
							current = result.sr_branch;
							if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
								goto failed;
							idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
							if (!result.sr_item.i_node_ref_size)
								break;
						}

						/* And scan the leaf... */
						goto ignore_lazy_deleted_items;
					}
				}

				goto unlock_check_on_key;
			}
		}

		/* No more keys: */
		if (search_key)
			search_key->sk_on_key = FALSE;
		ot->ot_curr_rec_id = 0;
		ot->ot_curr_row_id = 0;
		XT_INDEX_UNLOCK(ind, ot);
		return OK;
	}

	unlock_check_on_key:

	/* Convert the cache reference into a handle for later operations: */
	ASSERT_NS(!ot->ot_ind_rhandle);
	if (!(ot->ot_ind_rhandle = xt_ind_get_handle(ot, ind, &iref)))
		goto failed;
	/*
	u_int branch_size;

	branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref.ir_branch->tb_size_2));
	memcpy(&ot->ot_ind_rbuf, iref.ir_branch, branch_size);
	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
	*/

	XT_INDEX_UNLOCK(ind, ot);

	/* Still on key? */
	if (search_key && search_key->sk_on_key) {
		/* GOTCHA: As a short-cut I was using a length compare
		 * and a memcmp() here to check whether we are still on
		 * the original search key.
		 * This does not work because it does not take into account
		 * trailing spaces (which are ignored in comparison).
		 * So lengths can be different, but values still equal.
		 * 
		 * NOTE: We have to use the original search flags for
		 * this compare.
		 */
		xt_ind_lock_handle(ot->ot_ind_rhandle);
		search_key->sk_on_key = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length,
			search_key->sk_key_value.sv_key, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset]) == 0;
		xt_ind_unlock_handle(ot->ot_ind_rhandle);
	}

	checked_on_key:
	ot->ot_curr_rec_id = result.sr_rec_id;
	ot->ot_curr_row_id = result.sr_row_id;
	ot->ot_ind_state = result.sr_item;

	return OK;

	failed:
	XT_INDEX_UNLOCK(ind, ot);
	if (idx_out_of_memory_failure(ot))
		goto retry_after_oom;
	return FAILED;
}
 
3477
 
 
3478
/*
 * Move the index cursor to the previous entry in index order.
 *
 * Mirror image of xt_idx_next(): first try to step backward within the
 * currently cached leaf; otherwise save the current key, release the
 * handle and re-search from the root, descending to the right-most item
 * of each branch (mi_last_item) while pushing the path on a stack so we
 * can move up when the start of a leaf is reached.
 *
 * On success the new position is stored in ot->ot_curr_rec_id,
 * ot->ot_curr_row_id and ot->ot_ind_state (rec_id == 0 means start of
 * index reached). Returns OK or FAILED; out-of-memory failures retry.
 */
xtPublic xtBool xt_idx_prev(register XTOpenTablePtr ot, register XTIndexPtr ind, register XTIdxSearchKeyPtr search_key)
{
	XTIdxKeyValueRec	key_value;
	xtWord1			key_buf[XT_INDEX_MAX_KEY_SIZE];
	XTIdxResultRec		result;
	IdxBranchStackRec	stack;
	xtIndexNodeID		current;
	XTIndReferenceRec	iref;
	IdxStackItemPtr		node;

#ifdef DEBUG
	/* Poison the lock/update flags so misuse is visible in debug builds: */
	iref.ir_xlock = 2;
	iref.ir_updated = 2;
#endif
	ASSERT_NS(ot->ot_ind_rhandle);
	xt_ind_lock_handle(ot->ot_ind_rhandle);
	result.sr_item = ot->ot_ind_state;
	/* Fast path: in a leaf, and not at the first item of the block: */
	if (!result.sr_item.i_node_ref_size && result.sr_item.i_item_offset > 0) {
		key_value.sv_key = &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset];
		key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;

		ind->mi_prev_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);

		if (ind->mi_lazy_delete) {
			/* Skip backward over lazily deleted items (row id == -1): */
			while (result.sr_row_id == (xtRowID) -1) {
				if (result.sr_item.i_item_offset == 0)
					goto research;
				ind->mi_prev_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);
			}
		}

		idx_still_on_key(ind, search_key, ot->ot_ind_rhandle->ih_branch, &result.sr_item);

		xt_ind_unlock_handle(ot->ot_ind_rhandle);
		goto checked_on_key;
	}

	research:
	/* Slow path: remember the current key, release the handle, and
	 * re-search the tree from the root. */
	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
	key_value.sv_rec_id = ot->ot_curr_rec_id;
	key_value.sv_row_id = 0;
	key_value.sv_key = key_buf;
	key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;
	memcpy(key_buf, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset], key_value.sv_length);
	xt_ind_release_handle(ot->ot_ind_rhandle, TRUE, ot->ot_thread);
	ot->ot_ind_rhandle = NULL;

	retry_after_oom:
#ifdef XT_TRACK_INDEX_UPDATES
	ot->ot_ind_changed = 0;
#endif
	idx_newstack(&stack);

	XT_INDEX_READ_LOCK(ind, ot);

	/* An empty index has no previous key: */
	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root))) {
		XT_INDEX_UNLOCK(ind, ot);
		return OK;
	}

	while (XT_NODE_ID(current)) {
		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
			goto failed;
		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
		if (result.sr_item.i_node_ref_size) {
			if (result.sr_found) {
				/* If we have found the key in a node: */

				search_down_stack:
				/* Go down to the bottom: */
				while (XT_NODE_ID(current)) {
					xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
					if (!idx_push(&stack, current, &result.sr_item))
						goto failed;
					current = result.sr_branch;
					if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
						goto failed;
					ind->mi_last_item(ot->ot_table, ind, iref.ir_branch, &result);
					if (!result.sr_item.i_node_ref_size)
						break;
				}

				/* If the leaf empty we have to go up the stack again... */
				if (result.sr_item.i_total_size == 0)
					break;

				if (ind->mi_lazy_delete) {
					/* Skip backward over lazily deleted items: */
					while (result.sr_row_id == (xtRowID) -1) {
						if (result.sr_item.i_item_offset == 0)
							goto search_up_stack;
						ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
					}
				}

				goto unlock_check_on_key;
			}
		}
		else {
			/* We have reached the leaf.
			 * Whether we found the key or not, we have
			 * to move one to the left.
			 */
			if (result.sr_item.i_item_offset == 0)
				break;
			ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);

			if (ind->mi_lazy_delete) {
				/* Skip backward over lazily deleted items: */
				while (result.sr_row_id == (xtRowID) -1) {
					if (result.sr_item.i_item_offset == 0)
						goto search_up_stack;
					ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
				}
			}

			goto unlock_check_on_key;
		}
		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
		if (!idx_push(&stack, current, &result.sr_item))
			goto failed;
		current = result.sr_branch;
	}

	search_up_stack:
	/* We are at the start of a leaf node.
	 * Go up the stack to find the start position of the next key.
	 * If we find none, then we are the end of the index.
	 */
	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
	while ((node = idx_pop(&stack))) {
		if (node->i_pos.i_item_offset > node->i_pos.i_node_ref_size) {
			if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
				goto failed;
			result.sr_item = node->i_pos;
			ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);

			if (ind->mi_lazy_delete) {
				if (result.sr_row_id == (xtRowID) -1) {
					/* This node pointer is lazy deleted; descend
					 * the branch again to find a live item: */
					current = node->i_branch;
					goto search_down_stack;
				}
			}

			goto unlock_check_on_key;
		}
	}

	/* No more keys: */
	if (search_key)
		search_key->sk_on_key = FALSE;
	ot->ot_curr_rec_id = 0;
	ot->ot_curr_row_id = 0;

	XT_INDEX_UNLOCK(ind, ot);
	return OK;

	unlock_check_on_key:
	/* Convert the cache reference into a handle for later operations: */
	ASSERT_NS(!ot->ot_ind_rhandle);
	if (!(ot->ot_ind_rhandle = xt_ind_get_handle(ot, ind, &iref)))
		goto failed;
	/*
	u_int branch_size;

	branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref.ir_branch->tb_size_2));
	memcpy(&ot->ot_ind_rbuf, iref.ir_branch, branch_size);
	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
	*/

	XT_INDEX_UNLOCK(ind, ot);

	/* Still on key? */
	if (search_key && search_key->sk_on_key) {
		xt_ind_lock_handle(ot->ot_ind_rhandle);
		search_key->sk_on_key = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length,
			search_key->sk_key_value.sv_key, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset]) == 0;
		xt_ind_unlock_handle(ot->ot_ind_rhandle);
	}

	checked_on_key:
	ot->ot_curr_rec_id = result.sr_rec_id;
	ot->ot_curr_row_id = result.sr_row_id;
	ot->ot_ind_state = result.sr_item;
	return OK;

	failed:
	XT_INDEX_UNLOCK(ind, ot);
	if (idx_out_of_memory_failure(ot))
		goto retry_after_oom;
	return FAILED;
}
 
3667
 
 
3668
/* Return TRUE if the record matches the current index search! */
 
3669
xtPublic xtBool xt_idx_match_search(register XTOpenTablePtr XT_UNUSED(ot), register XTIndexPtr ind, register XTIdxSearchKeyPtr search_key, xtWord1 *buf, int mode)
 
3670
{
 
3671
        int             r;
 
3672
        xtWord1 key_buf[XT_INDEX_MAX_KEY_SIZE];
 
3673
 
 
3674
        myxt_create_key_from_row(ind, key_buf, (xtWord1 *) buf, NULL);
 
3675
        r = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length, search_key->sk_key_value.sv_key, key_buf);
 
3676
        switch (mode) {
 
3677
                case XT_S_MODE_MATCH:
 
3678
                        return r == 0;
 
3679
                case XT_S_MODE_NEXT:
 
3680
                        return r <= 0;
 
3681
                case XT_S_MODE_PREV:
 
3682
                        return r >= 0;
 
3683
        }
 
3684
        return FALSE;
 
3685
}
 
3686
 
 
3687
/*
 * Estimate the selectivity of each segment of the given index.
 *
 * Samples up to MAX_RECORDS committed entries scanning forward from the
 * first key (j == 0) and up to MAX_RECORDS scanning backward from the
 * last key (j == 1). For each pair of consecutive sampled keys, the
 * first segment prefix at which they differ is found, and the
 * is_selectivity counter of that segment and all following segments is
 * incremented. Finally is_recs_in_range for each segment is derived as
 * sampled-total / selectivity (rounded, minimum 1).
 *
 * last_rec/last_iter_rec guard against double-counting when the forward
 * and backward scans overlap (fewer than 2 * MAX_RECORDS entries).
 *
 * On any iterator failure the index is disabled as corrupted and the
 * exception is logged; the function returns no value.
 */
static void idx_set_index_selectivity(XTOpenTablePtr ot, XTIndexPtr ind, XTThreadPtr thread)
{
	static const xtRecordID MAX_RECORDS = 100;

	XTIdxSearchKeyRec	search_key;
	XTIndexSegPtr		key_seg;
	u_int				select_count[2] = {0, 0};
	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE];
	u_int				key_len;
	xtWord1				*next_key_buf;
	u_int				next_key_len;
	u_int				curr_len;
	u_int				diff;
	u_int				j, i;
	/* these 2 vars are used to check the overlapping if we have < 200 records */
	xtRecordID			last_rec = 0;		/* last record accounted in this iteration */
	xtRecordID			last_iter_rec = 0;	/* last record accounted in the previous iteration */

	/* Direction j == 0 scans forward, j == 1 scans backward: */
	xtBool	(* xt_idx_iterator[2])(
		register struct XTOpenTable *ot, register struct XTIndex *ind, register XTIdxSearchKeyPtr search_key) = {

		xt_idx_next,
		xt_idx_prev
	};

	xtBool	(* xt_idx_begin[2])(
		struct XTOpenTable *ot, struct XTIndex *ind, register XTIdxSearchKeyPtr search_key) = {
	
		xt_idx_search,
		xt_idx_search_prev
	};

	/* Reset the statistics before sampling: */
	ind->mi_select_total = 0;
	key_seg = ind->mi_seg;
	for (i=0; i < ind->mi_seg_count; key_seg++, i++) {
		key_seg->is_selectivity = 1;
		key_seg->is_recs_in_range = 1;
	}

	for (j=0; j < 2; j++) {
		xt_idx_prep_key(ind, &search_key, j == 0 ? XT_SEARCH_FIRST_FLAG : XT_SEARCH_AFTER_LAST_FLAG, NULL, 0);
		if (!(xt_idx_begin[j])(ot, ind, &search_key))
			goto failed;

		/* Initialize the buffer with the first index valid index entry: */
		while (!select_count[j] && ot->ot_curr_rec_id != last_iter_rec) {
			if (ot->ot_curr_row_id) {
				select_count[j]++;
				last_rec = ot->ot_curr_rec_id;

				key_len = ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
				xt_ind_lock_handle(ot->ot_ind_rhandle);
				memcpy(key_buf, ot->ot_ind_rhandle->ih_branch->tb_data + ot->ot_ind_state.i_item_offset, key_len);
				xt_ind_unlock_handle(ot->ot_ind_rhandle);
			}
			if (!(xt_idx_iterator[j])(ot, ind, &search_key))
				goto failed_1;
		}

		while (select_count[j] < MAX_RECORDS && ot->ot_curr_rec_id != last_iter_rec) {
			/* Check if the index entry is committed: */
			if (ot->ot_curr_row_id) {
				xt_ind_lock_handle(ot->ot_ind_rhandle);
				select_count[j]++;
				last_rec = ot->ot_curr_rec_id;

				next_key_len = ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
				next_key_buf = ot->ot_ind_rhandle->ih_branch->tb_data + ot->ot_ind_state.i_item_offset;
			
				/* Find the first segment prefix at which the previous
				 * and current keys differ; from that segment on, count
				 * the keys as distinct: */
				curr_len = 0;
				diff = FALSE;
				key_seg = ind->mi_seg;
				for (i=0; i < ind->mi_seg_count; key_seg++, i++) {
					curr_len += myxt_key_seg_length(key_seg, curr_len, key_buf);
					if (!diff && myxt_compare_key(ind, 0, curr_len, key_buf, next_key_buf) != 0)
						diff = i+1;
					if (diff)
						key_seg->is_selectivity++;
				}

				/* Store the key for the next comparison: */
				key_len = next_key_len;
				memcpy(key_buf, next_key_buf, key_len);
				xt_ind_unlock_handle(ot->ot_ind_rhandle);
			}

			if (!(xt_idx_iterator[j])(ot, ind, &search_key))
				goto failed_1;
		}

		last_iter_rec = last_rec;

		if (ot->ot_ind_rhandle) {
			xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
			ot->ot_ind_rhandle = NULL;
		}
	}

	u_int select_total;

	select_total = select_count[0] + select_count[1];
	if (select_total) {
		u_int recs;

		ind->mi_select_total = select_total;
		key_seg = ind->mi_seg;
		for (i=0; i < ind->mi_seg_count; key_seg++, i++) {
			/* Rounded average number of records per distinct prefix: */
			recs = (u_int) ((double) select_total / (double) key_seg->is_selectivity + (double) 0.5);
			key_seg->is_recs_in_range = recs ? recs : 1;
		}
	}
	return;

	failed_1:
	if (ot->ot_ind_rhandle) {
		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
		ot->ot_ind_rhandle = NULL;
	}

	failed:
	xt_tab_disable_index(ot->ot_table, XT_INDEX_CORRUPTED);
	xt_log_and_clear_exception_ns();
	return;
}
 
3811
 
 
3812
xtPublic void xt_ind_set_index_selectivity(XTOpenTablePtr ot, XTThreadPtr thread)
 
3813
{
 
3814
        XTTableHPtr             tab = ot->ot_table;
 
3815
        XTIndexPtr              *ind;
 
3816
        u_int                   i;
 
3817
        time_t                  now;
 
3818
 
 
3819
        now = time(NULL);
 
3820
        xt_lock_mutex_ns(&tab->tab_ind_stat_lock);
 
3821
        if (tab->tab_ind_stat_calc_time < now) {
 
3822
                if (!tab->tab_dic.dic_disable_index) {
 
3823
                        for (i=0, ind=tab->tab_dic.dic_keys; i<tab->tab_dic.dic_key_count; i++, ind++)
 
3824
                                idx_set_index_selectivity(ot, *ind, thread);
 
3825
                }
 
3826
                tab->tab_ind_stat_calc_time = time(NULL);
 
3827
        }
 
3828
        xt_unlock_mutex_ns(&tab->tab_ind_stat_lock);
 
3829
}
 
3830
 
 
3831
/*
 
3832
 * -----------------------------------------------------------------------
 
3833
 * Print a b-tree
 
3834
 */
 
3835
 
 
3836
#ifdef TEST_CODE
 
3837
static void idx_check_on_key(XTOpenTablePtr ot)
 
3838
{
 
3839
        u_int           offs = ot->ot_ind_state.i_item_offset + ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
 
3840
        xtRecordID      rec_id;
 
3841
        xtRowID         row_id;
 
3842
        
 
3843
        if (ot->ot_curr_rec_id && ot->ot_ind_state.i_item_offset < ot->ot_ind_state.i_total_size) {
 
3844
                xt_get_record_ref(&ot->ot_ind_rbuf.tb_data[offs], &rec_id, &row_id);
 
3845
                
 
3846
                ASSERT_NS(rec_id == ot->ot_curr_rec_id);
 
3847
        }
 
3848
}
 
3849
#endif
 
3850
 
 
3851
/* Print "depth" levels of indentation when dumping the index tree.
 * The parameter name is compiled out when DUMP_INDEX is not defined,
 * so the otherwise-unused argument produces no compiler warning.
 */
static void idx_check_space(int 
#ifdef DUMP_INDEX
depth
#endif
)
{
#ifdef DUMP_INDEX
	for (int i=0; i<depth; i++)
		printf(". ");
#endif
}
 
3862
 
 
3863
#ifdef DO_COMP_TEST

/* Compression experiment scaffolding: instrumentation counters and
 * work buffers used by the index block compression test code below.
 */
//#define FILL_COMPRESS_BLOCKS

#ifdef FILL_COMPRESS_BLOCKS
#define COMPRESS_BLOCK_SIZE			(1024 * 128)
#else
#define COMPRESS_BLOCK_SIZE			16384
#endif

/* Counters accumulated across all compressed blocks: */
int blocks;
int usage_total;

/* Histogram of zlib-compressed sizes, plus totals and timings: */
int zlib_1024;
int zlib_2048;
int zlib_4096;
int zlib_8192;
int zlib_16384;
int zlib_total;
int zlib_time;
int read_time;
int uncomp_time;
int fill_size;
int filled_size;
/* Work buffers: compressed output, decompressed check buffer, and the
 * pre-compression (prefix-compressed) staging buffer: */
unsigned char out[COMPRESS_BLOCK_SIZE];
unsigned char uncomp[COMPRESS_BLOCK_SIZE];
xtWord1 precomp[COMPRESS_BLOCK_SIZE+16000];
 
3890
 
 
3891
/*
 * Experimental front-compression of an index branch: for each item after
 * the first, the count of leading bytes shared with the previous item is
 * emitted as a single byte, followed by the differing tail.
 *
 * node_ref_size  - size of a node reference (0 for leaf pages)
 * item_size      - fixed size of one index item
 * insize         - number of bytes available in in_data
 * Returns the number of bytes written to out_data.
 *
 * NOTE(review): 'ind' is currently unused.  Also, 'prev_item' is advanced
 * past the shared prefix and then by the tail size, so it appears to end
 * up at the start of the item just copied — presumably intentional, but
 * verify before reusing outside this test code.
 */
u_int idx_precompress(XTIndexPtr ind, u_int node_ref_size, u_int item_size, u_int insize, xtWord1 *in_data, xtWord1 *out_data)
{
	xtWord1 *prev_item = NULL;
	xtWord1 *in_ptr = in_data;
	xtWord1 *out_ptr = out_data;
	u_int	out_size = 0;
	u_int	same_size;

	/* The leading node reference of a node page is copied verbatim: */
	if (insize >= node_ref_size) {
		memcpy(out_ptr, in_ptr, node_ref_size);
		insize -= node_ref_size;
		out_size += node_ref_size;
		in_ptr += node_ref_size;
		out_ptr += node_ref_size;
	}

	while (insize >= item_size + node_ref_size) {
		if (prev_item) {
			/* Count the prefix bytes shared with the previous item: */
			same_size = 0;
			while (same_size < item_size + node_ref_size && *prev_item == *in_ptr) {
				same_size++;
				prev_item++;
				in_ptr++;
			}
			/* The shared-prefix length must fit in one byte: */
			ASSERT_NS(same_size < 256);
			*out_ptr = (xtWord1) same_size;
			out_size++;
			out_ptr++;
			/* Copy only the differing tail of the item: */
			same_size = item_size + node_ref_size - same_size;
			memcpy(out_ptr, in_ptr, same_size);
			out_size += same_size;
			out_ptr += same_size;
			in_ptr += same_size;
			prev_item += same_size;
		}
		else {
			/* First item: copied in full, becomes the reference item. */
			prev_item = in_ptr;
			memcpy(out_ptr, in_ptr, item_size + node_ref_size);
			out_size += item_size + node_ref_size;
			out_ptr += item_size + node_ref_size;
			in_ptr += item_size + node_ref_size;
		}
		insize -= (item_size + node_ref_size);
	}
	return out_size;
}
 
3937
 
 
3938
u_int idx_compress(u_int insize, xtWord1 *in_data, u_int outsize, xtWord1 *out_data)
 
3939
{
 
3940
        z_stream strm;
 
3941
        int ret;
 
3942
 
 
3943
        strm.zalloc = Z_NULL;
 
3944
        strm.zfree = Z_NULL;
 
3945
        strm.opaque = Z_NULL;
 
3946
        strm.avail_in = 0;
 
3947
        strm.next_in = Z_NULL;
 
3948
        ret = deflateInit(&strm, Z_DEFAULT_COMPRESSION);
 
3949
        strm.avail_out = outsize;
 
3950
        strm.next_out = out_data;
 
3951
        strm.avail_in = insize;
 
3952
        strm.next_in = in_data;
 
3953
        ret = deflate(&strm, Z_FINISH);
 
3954
        deflateEnd(&strm);
 
3955
        return outsize - strm.avail_out;
 
3956
 
 
3957
/*
 
3958
        bz_stream strm;
 
3959
        int ret;
 
3960
 
 
3961
        memset(&strm, 0, sizeof(strm));
 
3962
 
 
3963
        ret = BZ2_bzCompressInit(&strm, 1, 0, 0);
 
3964
        strm.avail_out = outsize;
 
3965
        strm.next_out = (char *) out_data;
 
3966
        strm.avail_in = insize;
 
3967
        strm.next_in = (char *) in_data;
 
3968
        ret = BZ2_bzCompress(&strm, BZ_FINISH);
 
3969
 
 
3970
        BZ2_bzCompressEnd(&strm);
 
3971
        return outsize - strm.avail_out;
 
3972
*/
 
3973
}
 
3974
 
 
3975
u_int idx_decompress(u_int insize, xtWord1 *in_data, u_int outsize, xtWord1 *out_data)
 
3976
{
 
3977
        z_stream strm;
 
3978
        int ret;
 
3979
 
 
3980
        strm.zalloc = Z_NULL;
 
3981
        strm.zfree = Z_NULL;
 
3982
        strm.opaque = Z_NULL;
 
3983
        strm.avail_in = 0;
 
3984
        strm.next_in = Z_NULL;
 
3985
        ret = inflateInit(&strm);
 
3986
        strm.avail_out = outsize;
 
3987
        strm.next_out = out_data;
 
3988
        strm.avail_in = insize;
 
3989
        strm.next_in = in_data;
 
3990
        ret = inflate(&strm, Z_FINISH);
 
3991
        inflateEnd(&strm);
 
3992
        return outsize - strm.avail_out;
 
3993
 
 
3994
/*
 
3995
        bz_stream strm;
 
3996
        int ret;
 
3997
 
 
3998
        memset(&strm, 0, sizeof(strm));
 
3999
 
 
4000
        ret = BZ2_bzDecompressInit(&strm, 0, 0);
 
4001
        strm.avail_out = outsize;
 
4002
        strm.next_out = (char *) out_data;
 
4003
        strm.avail_in = insize;
 
4004
        strm.next_in = (char *) in_data;
 
4005
        ret = BZ2_bzDecompress(&strm);
 
4006
 
 
4007
        BZ2_bzDecompressEnd(&strm);
 
4008
        return outsize - strm.avail_out;
 
4009
*/
 
4010
}
 
4011
#endif // DO_COMP_TEST
 
4012
 
 
4013
/*
 * Recursively check (and optionally dump) one b-tree node and all nodes
 * below it.  Returns the number of index blocks visited (including this
 * one), or 0 if the node could not be fetched.
 *
 * Under DO_COMP_TEST each visited page is also run through the zlib
 * round-trip experiment and the global statistics are updated.
 */
static u_int idx_check_node(XTOpenTablePtr ot, XTIndexPtr ind, int depth, xtIndexNodeID node)
{
	XTIdxResultRec		result;
	u_int				block_count = 1;
	XTIndReferenceRec	iref;

#ifdef DO_COMP_TEST
	unsigned comp_size;
	unsigned uncomp_size;
	xtWord8 now;
	xtWord8 time;
#endif

#ifdef DEBUG
	/* Poison values so the release code can detect an uninitialized iref: */
	iref.ir_xlock = 2;
	iref.ir_updated = 2;
#endif
	ASSERT_NS(XT_NODE_ID(node) <= XT_NODE_ID(ot->ot_table->tab_ind_eof));
#ifdef DO_COMP_TEST
	now = xt_trace_clock();
#endif
	/* A deadlock can occur when taking a read lock
	 * because the XT_IPAGE_WRITE_TRY_LOCK(&block->cb_lock, ot->ot_thread->t_id)
	 * only takes into account WRITE locks.
	 * So, if we hold a READ lock on a page, and ind_free_block() trys to
	 * free the block, it hangs on its own read lock!
	 *
	 * So we change from READ lock to a WRITE lock.
	 * If too restrictive then locks need to handle TRY on a
	 * read lock as well.
	 *
	 * #3   0x00e576b6 in xt_yield at thread_xt.cc:1351
	 * #4   0x00e7218e in xt_spinxslock_xlock at lock_xt.cc:1467
	 * #5   0x00dee1a9 in ind_free_block at cache_xt.cc:901
	 * #6   0x00dee500 in ind_cac_free_lru_blocks at cache_xt.cc:1054
	 * #7   0x00dee88c in ind_cac_fetch at cache_xt.cc:1151
	 * #8   0x00def6d4 in xt_ind_fetch at cache_xt.cc:1480
	 * #9   0x00e1ce2e in idx_check_node at index_xt.cc:3996
	 * #10  0x00e1cf2b in idx_check_node at index_xt.cc:4106
	 * #11  0x00e1cf2b in idx_check_node at index_xt.cc:4106
	 * #12  0x00e1cfdc in idx_check_index at index_xt.cc:4130
	 * #13  0x00e1d11c in xt_check_indices at index_xt.cc:4181
	 * #14  0x00e4aa82 in xt_check_table at table_xt.cc:2363
	 */
	if (!xt_ind_fetch(ot, ind, node, XT_LOCK_WRITE, &iref))
		return 0;
#ifdef DO_COMP_TEST
	time = xt_trace_clock() - now;
	read_time += time;
#endif

	idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
	/* The branch must fit in an index page: */
	ASSERT_NS(result.sr_item.i_total_size + offsetof(XTIdxBranchDRec, tb_data) <= XT_INDEX_PAGE_SIZE);

#ifdef DO_COMP_TEST
	u_int size = result.sr_item.i_total_size;
	xtWord1 *data = iref.ir_branch->tb_data;

/* Optional front-compression pass before zlib (disabled):
	size = idx_precompress(ind, result.sr_item.i_node_ref_size, result.sr_item.i_item_size, size, data, precomp);
	if (size > result.sr_item.i_total_size)
		size = result.sr_item.i_total_size;
	else
		data = precomp;
*/

	blocks++;
	usage_total += result.sr_item.i_total_size;

#ifdef FILL_COMPRESS_BLOCKS
	/* Accumulate pages into a large buffer; compress when it is full: */
	if (fill_size + size > COMPRESS_BLOCK_SIZE) {
		now = xt_trace_clock();
		comp_size = idx_compress(fill_size, precomp, COMPRESS_BLOCK_SIZE, out);
		time = xt_trace_clock() - now;
		zlib_time += time;

		zlib_total += comp_size;
		filled_size += fill_size;

		now = xt_trace_clock();
		uncomp_size = idx_decompress(comp_size, out, COMPRESS_BLOCK_SIZE, uncomp);
		time = xt_trace_clock() - now;
		uncomp_time += time;

		/* Round-trip must reproduce the original size: */
		if (uncomp_size != fill_size)
			printf("what?\n");

		fill_size = 0;
	}
	memcpy(precomp + fill_size, data, size);
	fill_size += size;
#else
	/* Compress and decompress this single page, timing both: */
	now = xt_trace_clock();
	comp_size = idx_compress(size, data, COMPRESS_BLOCK_SIZE, out);
	time = xt_trace_clock() - now;
	zlib_time += time;
	zlib_total += comp_size;

	now = xt_trace_clock();
	uncomp_size = idx_decompress(comp_size, out, COMPRESS_BLOCK_SIZE, uncomp);
	time = xt_trace_clock() - now;
	uncomp_time += time;
	/* Round-trip must reproduce the original size: */
	if (uncomp_size != size)
		printf("what?\n");
#endif

	/* Histogram of compressed sizes: */
	if (comp_size <= 1024)
		zlib_1024++;
	else if (comp_size <= 2048)
		zlib_2048++;
	else if (comp_size <= 4096)
		zlib_4096++;
	else if (comp_size <= 8192)
		zlib_8192++;
	else
		zlib_16384++;

#endif // DO_COMP_TEST

	/* Descend into the leading node reference, if this is a node page: */
	if (result.sr_item.i_node_ref_size) {
		idx_check_space(depth);
#ifdef DUMP_INDEX
		printf("%04d -->\n", (int) XT_NODE_ID(result.sr_branch));
#endif
#ifdef TRACK_ACTIVITY
		track_block_exists(result.sr_branch);
#endif
		block_count += idx_check_node(ot, ind, depth+1, result.sr_branch);
	}

	/* Walk the remaining items, recursing into each child reference: */
	while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
#ifdef CHECK_PRINTS_RECORD_REFERENCES
		idx_check_space(depth);
		if (result.sr_item.i_item_size == 12) {
			/* Assume this is a NOT-NULL INT!: */
			xtWord4 val = XT_GET_DISK_4(&iref.ir_branch->tb_data[result.sr_item.i_item_offset]);
#ifdef DUMP_INDEX
			printf("(%6d) ", (int) val);
#endif
		}
#ifdef DUMP_INDEX
		printf("rec=%d row=%d ", (int) result.sr_rec_id, (int) result.sr_row_id);
		printf("\n");
#endif
#endif
		idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
		if (result.sr_item.i_node_ref_size) {
			idx_check_space(depth);
#ifdef DUMP_INDEX
			printf("%04d -->\n", (int) XT_NODE_ID(result.sr_branch));
#endif
#ifdef TRACK_ACTIVITY
			track_block_exists(result.sr_branch);
#endif
			block_count += idx_check_node(ot, ind, depth+1, result.sr_branch);
		}
	}

	xt_ind_release(ot, ind, XT_UNLOCK_WRITE, &iref);
	return block_count;
}
 
4174
 
 
4175
/*
 * Check one index: walk the whole b-tree from the root and count its
 * blocks, then count the blocks on the index's in-memory free list.
 * Returns the total number of blocks accounted for.
 *
 * with_lock - take the index write lock for the duration of the check.
 */
static u_int idx_check_index(XTOpenTablePtr ot, XTIndexPtr ind, xtBool with_lock)
{
	xtIndexNodeID			current;
	u_int					block_count = 0;
	u_int					i;

	if (with_lock)
		XT_INDEX_WRITE_LOCK(ind, ot);

#ifdef DUMP_INDEX
	printf("INDEX (%d) %04d ---------------------------------------\n", (int) ind->mi_index_no, (int) XT_NODE_ID(ind->mi_root));
#endif
	/* Walk the tree, if the index has a root: */
	if ((XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root))) {
#ifdef TRACK_ACTIVITY
		track_block_exists(ind->mi_root);
#endif
		block_count = idx_check_node(ot, ind, 0, current);
	}

	/* Count the pages on the in-memory free list: */
	if (ind->mi_free_list && ind->mi_free_list->fl_free_count) {
#ifdef DUMP_INDEX
		printf("INDEX (%d) FREE ---------------------------------------", (int) ind->mi_index_no);
#endif
		ASSERT_NS(ind->mi_free_list->fl_start == 0);
		for (i=0; i<ind->mi_free_list->fl_free_count; i++) {
#ifdef DUMP_INDEX
			if ((i % 40) == 0)
				printf("\n");
#endif
			block_count++;
#ifdef TRACK_ACTIVITY
			track_block_exists(ind->mi_free_list->fl_page_id[i]);
#endif
#ifdef DUMP_INDEX
			printf("%2d ", (int) XT_NODE_ID(ind->mi_free_list->fl_page_id[i]));
#endif
		}
#ifdef DUMP_INDEX
		if ((i % 40) != 0)
			printf("\n");
#endif
	}

	if (with_lock)
		XT_INDEX_UNLOCK(ind, ot);
	return block_count;

}
 
4223
 
 
4224
xtPublic void xt_check_indices(XTOpenTablePtr ot)
 
4225
{
 
4226
        register XTTableHPtr    tab = ot->ot_table;
 
4227
        XTIndexPtr                              *ind;
 
4228
        xtIndexNodeID                   current;
 
4229
        XTIndFreeBlockRec               free_block;
 
4230
        u_int                                   ind_count, block_count = 0;
 
4231
        u_int                                   free_count = 0;
 
4232
        u_int                                   i, j;
 
4233
 
 
4234
        xt_lock_mutex_ns(&tab->tab_ind_flush_lock);
 
4235
        printf("CHECK INDICES %s ==============================\n", tab->tab_name->ps_path);
 
4236
#ifdef TRACK_ACTIVITY
 
4237
        track_reset_missing();
 
4238
#endif
 
4239
 
 
4240
        ind = tab->tab_dic.dic_keys;
 
4241
        for (u_int k=0; k<tab->tab_dic.dic_key_count; k++, ind++) {
 
4242
                ind_count = idx_check_index(ot, *ind, TRUE);
 
4243
                block_count += ind_count;
 
4244
        }
 
4245
 
 
4246
#ifdef DO_COMP_TEST
 
4247
        int block_total;
 
4248
 
 
4249
#ifdef FILL_COMPRESS_BLOCKS
 
4250
        if (fill_size > 0) {
 
4251
                unsigned        comp_size;
 
4252
                unsigned        uncomp_size;
 
4253
                xtWord8         now;
 
4254
                xtWord8         time;
 
4255
 
 
4256
                now = xt_trace_clock();
 
4257
                comp_size = idx_compress(fill_size, precomp, COMPRESS_BLOCK_SIZE, out);
 
4258
                time = xt_trace_clock() - now;
 
4259
                zlib_time += time;
 
4260
                zlib_total += comp_size;
 
4261
                filled_size += fill_size;
 
4262
 
 
4263
                now = xt_trace_clock();
 
4264
                uncomp_size = idx_decompress(comp_size, out, COMPRESS_BLOCK_SIZE, uncomp);
 
4265
                time = xt_trace_clock() - now;
 
4266
                uncomp_time += time;
 
4267
        }
 
4268
        if (filled_size != usage_total)
 
4269
                printf("What?\n");
 
4270
#endif
 
4271
 
 
4272
        printf("Total blocks  = %d\n", blocks);
 
4273
        printf("zlib <=  1024 = %d\n", zlib_1024);
 
4274
        printf("zlib <=  2048 = %d\n", zlib_2048);
 
4275
        printf("zlib <=  4096 = %d\n", zlib_4096);
 
4276
        printf("zlib <=  8192 = %d\n", zlib_8192);
 
4277
        printf("zlib <= 16384 = %d\n", zlib_16384);
 
4278
        printf("zlib average size = %.2f\n", (double) zlib_total / (double) blocks);
 
4279
        printf("zlib average time = %.2f\n", (double) zlib_time / (double) blocks);
 
4280
        printf("read average time = %.2f\n", (double) read_time / (double) blocks);
 
4281
        printf("uncompress time   = %.2f\n", (double) uncomp_time / (double) blocks);
 
4282
        block_total = (zlib_1024 + zlib_2048) * 8192;
 
4283
        block_total += zlib_4096 * 8192;
 
4284
        block_total += zlib_8192 * 8192;
 
4285
        block_total += zlib_16384 * 16384;
 
4286
        printf("block total       = %d\n", block_total);
 
4287
        printf("block %% compress  = %.2f\n", ((double) block_total * (double) 100) / ((double) blocks * (double) 16384));
 
4288
        printf("Total size        = %d\n", blocks * 16384);
 
4289
        printf("total before zlib = %d\n", usage_total);
 
4290
        printf("total after zlib  = %d\n", zlib_total);
 
4291
        printf("zlib %% compress   = %.2f\n", ((double) zlib_total * (double) 100) / (double) usage_total);
 
4292
        printf("total %% compress  = %.2f\n", ((double) zlib_total * (double) 100) / (double) (blocks * 16384));
 
4293
#endif
 
4294
 
 
4295
        xt_lock_mutex_ns(&tab->tab_ind_lock);
 
4296
#ifdef DUMP_INDEX
 
4297
        printf("\nFREE: ---------------------------------------\n");
 
4298
#endif
 
4299
        if (tab->tab_ind_free_list) {
 
4300
                XTIndFreeListPtr        ptr;
 
4301
 
 
4302
                ptr = tab->tab_ind_free_list;
 
4303
                while (ptr) {
 
4304
#ifdef DUMP_INDEX
 
4305
                        printf("Memory List:");
 
4306
#endif
 
4307
                        i = 0;
 
4308
                        for (j=ptr->fl_start; j<ptr->fl_free_count; j++, i++) {
 
4309
#ifdef DUMP_INDEX
 
4310
                                if ((i % 40) == 0)
 
4311
                                        printf("\n");
 
4312
#endif
 
4313
                                free_count++;
 
4314
#ifdef TRACK_ACTIVITY
 
4315
                                track_block_exists(ptr->fl_page_id[j]);
 
4316
#endif
 
4317
#ifdef DUMP_INDEX
 
4318
                                printf("%2d ", (int) XT_NODE_ID(ptr->fl_page_id[j]));
 
4319
#endif
 
4320
                        }
 
4321
#ifdef DUMP_INDEX
 
4322
                        if ((i % 40) != 0)
 
4323
                                printf("\n");
 
4324
#endif
 
4325
                        ptr = ptr->fl_next_list;
 
4326
                }
 
4327
        }
 
4328
 
 
4329
        current = tab->tab_ind_free;
 
4330
        if (XT_NODE_ID(current)) {
 
4331
                u_int k = 0;
 
4332
#ifdef DUMP_INDEX
 
4333
                printf("Disk List:");
 
4334
#endif
 
4335
                while (XT_NODE_ID(current)) {
 
4336
#ifdef DUMP_INDEX
 
4337
                        if ((k % 40) == 0)
 
4338
                                printf("\n");
 
4339
#endif
 
4340
                        free_count++;
 
4341
#ifdef TRACK_ACTIVITY
 
4342
                        track_block_exists(current);
 
4343
#endif
 
4344
#ifdef DUMP_INDEX
 
4345
                        printf("%d ", (int) XT_NODE_ID(current));
 
4346
#endif
 
4347
                        if (!xt_ind_read_bytes(ot, *ind, current, sizeof(XTIndFreeBlockRec), (xtWord1 *) &free_block)) {
 
4348
                                xt_log_and_clear_exception_ns();
 
4349
                                break;
 
4350
                        }
 
4351
                        XT_NODE_ID(current) = (xtIndexNodeID) XT_GET_DISK_8(free_block.if_next_block_8);
 
4352
                        k++;
 
4353
                }
 
4354
#ifdef DUMP_INDEX
 
4355
                if ((k % 40) != 0)
 
4356
                        printf("\n");
 
4357
#endif
 
4358
        }
 
4359
#ifdef DUMP_INDEX
 
4360
        printf("\n-----------------------------\n");
 
4361
        printf("used blocks %d + free blocks %d = %d\n", block_count, free_count, block_count + free_count);
 
4362
        printf("EOF = %"PRIu64", total blocks = %d\n", (xtWord8) xt_ind_node_to_offset(tab, tab->tab_ind_eof), (int) (XT_NODE_ID(tab->tab_ind_eof) - 1));
 
4363
        printf("-----------------------------\n");
 
4364
#endif
 
4365
        xt_unlock_mutex_ns(&tab->tab_ind_lock);
 
4366
#ifdef TRACK_ACTIVITY
 
4367
        track_dump_missing(tab->tab_ind_eof);
 
4368
        printf("===================================================\n");
 
4369
        track_dump_all((u_int) (XT_NODE_ID(tab->tab_ind_eof) - 1));
 
4370
#endif
 
4371
        printf("===================================================\n");
 
4372
        xt_unlock_mutex_ns(&tab->tab_ind_flush_lock);
 
4373
}
 
4374
 
 
4375
/*
 
4376
 * -----------------------------------------------------------------------
 
4377
 * Load index
 
4378
 */
 
4379
 
 
4380
static void idx_load_node(XTThreadPtr self, XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID node)
 
4381
{
 
4382
        XTIdxResultRec          result;
 
4383
        XTIndReferenceRec       iref;
 
4384
 
 
4385
        ASSERT_NS(XT_NODE_ID(node) <= XT_NODE_ID(ot->ot_table->tab_ind_eof));
 
4386
        if (!xt_ind_fetch(ot, ind, node, XT_LOCK_READ, &iref))
 
4387
                xt_throw(self);
 
4388
 
 
4389
        idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
 
4390
        if (result.sr_item.i_node_ref_size)
 
4391
                idx_load_node(self, ot, ind, result.sr_branch);
 
4392
        while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
 
4393
                idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
 
4394
                if (result.sr_item.i_node_ref_size)
 
4395
                        idx_load_node(self, ot, ind, result.sr_branch);
 
4396
        }
 
4397
 
 
4398
        xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
 
4399
}
 
4400
 
 
4401
/*
 * Pre-load all indices of a table into the index cache by walking every
 * b-tree from its root.  Each index is walked under its write lock; the
 * table's index flush lock is held (and released via the cleanup stack)
 * for the whole operation.
 */
xtPublic void xt_load_indices(XTThreadPtr self, XTOpenTablePtr ot)
{
	register XTTableHPtr	tab = ot->ot_table;
	XTIndexPtr				*ind_ptr;
	XTIndexPtr				ind;
	xtIndexNodeID			current;

	xt_lock_mutex(self, &tab->tab_ind_flush_lock);
	pushr_(xt_unlock_mutex, &tab->tab_ind_flush_lock);

	ind_ptr = tab->tab_dic.dic_keys;
	for (u_int k=0; k<tab->tab_dic.dic_key_count; k++, ind_ptr++) {
		ind = *ind_ptr;
		XT_INDEX_WRITE_LOCK(ind, ot);
		/* Only walk indices that actually have a root: */
		if ((XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
			idx_load_node(self, ot, ind, current);
		XT_INDEX_UNLOCK(ind, ot);
	}

	freer_(); // xt_unlock_mutex(&tab->tab_ind_flush_lock)
}
 
4422
 
 
4423
/*
 
4424
 * -----------------------------------------------------------------------
 
4425
 * Count the number of deleted entries in a node:
 
4426
 */
 
4427
 
 
4428
/*
 
4429
 * {LAZY-DEL-INDEX-ITEMS}
 
4430
 *
 
4431
 * Use this function to count the number of deleted items 
 
4432
 * in a node when it is loaded.
 
4433
 *
 
4434
 * The count helps us decide of the node should be "packed".
 
4435
 */
 
4436
/*
 * Count the lazily-deleted items in an index block and store the result
 * in block->cp_del_count.  An item is considered deleted when its row ID
 * is the sentinel (xtRowID) -1.  See {LAZY-DEL-INDEX-ITEMS}.
 */
xtPublic void xt_ind_count_deleted_items(XTTableHPtr tab, XTIndexPtr ind, XTIndBlockPtr block)
{
	XTIdxResultRec		result;
	int					del_count = 0;
	xtWord2				branch_size;

	branch_size = XT_GET_DISK_2(((XTIdxBranchDPtr) block->cb_data)->tb_size_2);

	/* This is possible when reading free pages. */
	if (XT_GET_INDEX_BLOCK_LEN(branch_size) < 2 || XT_GET_INDEX_BLOCK_LEN(branch_size) > XT_INDEX_PAGE_SIZE)
		return;

	/* Scan every item in the branch and count the deleted ones: */
	idx_first_branch_item(tab, ind, (XTIdxBranchDPtr) block->cb_data, &result);
	while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
		if (result.sr_row_id == (xtRowID) -1)
			del_count++;
		idx_next_branch_item(tab, ind, (XTIdxBranchDPtr) block->cb_data, &result);
	}
	block->cp_del_count = del_count;
}
 
4456
 
 
4457
/*
 
4458
 * -----------------------------------------------------------------------
 
4459
 * Dirty list
 
4460
 */
 
4461
 
 
4462
/*
 * Append a dirty index block to the dirty list.  Storage is a chain of
 * fixed-size XTIndDirtyBlocksRec chunks; a new chunk is allocated when
 * the current one is full (or none exists yet).
 * Returns OK, or FAILED on allocation failure.
 */
xtBool XTIndDirtyList::dl_add_block(XTIndBlockPtr block)
{
	XTIndDirtyBlocksPtr blocks;

	blocks = dl_block_lists;
	if (dl_list_usage == XT_DIRTY_BLOCK_LIST_SIZE || !blocks) {
		/* Current chunk full (or no chunk yet): prepend a fresh one. */
		if (!(blocks = (XTIndDirtyBlocksPtr) xt_malloc_ns(sizeof(XTIndDirtyBlocksRec))))
			return FAILED;
		dl_list_usage = 0;
		blocks->db_next = dl_block_lists;
		dl_block_lists = blocks;
	}
	blocks->db_blocks[dl_list_usage] = block;
	dl_list_usage++;
	dl_total_blocks++;
	return OK;
}
 
4479
 
 
4480
static int idx_compare_blocks(const void *a, const void *b)
 
4481
{
 
4482
        XTIndBlockPtr b_a = *((XTIndBlockPtr *) a);
 
4483
        XTIndBlockPtr b_b = *((XTIndBlockPtr *) b);
 
4484
 
 
4485
        if (b_a->cb_address == b_b->cb_address)
 
4486
                return 0;
 
4487
        if (b_a->cb_address < b_b->cb_address)
 
4488
                return -1;
 
4489
        return 1;
 
4490
}
 
4491
 
 
4492
/*
 * Sort every chunk of the dirty list by block file address (for
 * sequential write order during flush).  Only the newest (head) chunk is
 * partially filled with dl_list_usage entries; all older chunks are full.
 */
void XTIndDirtyList::dl_sort_blocks()
{
	XTIndDirtyBlocksPtr blocks;
	size_t				size;

	size = dl_list_usage;
	blocks = dl_block_lists;
	while (blocks) {
		qsort(blocks->db_blocks, size, sizeof(XTIndBlockPtr), idx_compare_blocks);
		blocks = blocks->db_next;
		/* Chunks after the head are always completely full: */
		size = XT_DIRTY_BLOCK_LIST_SIZE;
	}
}
 
4505
 
 
4506
void XTIndDirtyList::dl_free_all()
 
4507
{
 
4508
        XTIndDirtyBlocksPtr blocks, n_blocks;
 
4509
 
 
4510
        blocks = dl_block_lists;
 
4511
        dl_block_lists = NULL;
 
4512
        dl_total_blocks = 0;
 
4513
        dl_list_usage = 0;
 
4514
        while (blocks) {
 
4515
                n_blocks = blocks->db_next;
 
4516
                xt_free_ns(blocks);
 
4517
                blocks = n_blocks;
 
4518
        }
 
4519
}
 
4520
 
 
4521
/*
 
4522
 * -----------------------------------------------------------------------
 
4523
 * Index consistent flush
 
4524
 */
 
4525
 
 
4526
/*
 * Asynchronous index-flush task body.  Opens the target table from the
 * pool and flushes its indices.  Gracefully skips tables that have been
 * dropped or renamed since the task was queued (marking the checkpoint
 * state done), and returns FAILED only on real errors.
 */
xtBool XTFlushIndexTask::tk_task(XTThreadPtr thread)
{
	XTOpenTablePtr		ot;

	fit_dirty_blocks = 0;
	fit_blocks_flushed = 0;

	/* See {TASK-TABLE-GONE} */
	if (!(xt_db_open_pool_table_ns(&ot, fit_table->tab_db, fit_table->tab_id)))
		return FAILED;

	if (!ot) {
		/* Can happen if the table has been dropped: */
		if (thread->t_exception.e_xt_err)
			xt_log_and_clear_exception(thread);
		xt_logf(XT_NT_WARNING, "Checkpoint skipping table (ID) %lu: table was not found\n", (u_long) fit_table->tab_id);
		xt_checkpoint_set_flush_state(fit_table->tab_db, fit_table->tab_id, XT_CPT_STATE_DONE_ALL);
		return OK;
	}

	if (ot->ot_table != fit_table) {
		/* Can happen if the table has been renamed: */
		if (thread->t_exception.e_xt_err)
			xt_log_and_clear_exception(thread);
		xt_logf(XT_NT_WARNING, "Checkpoint skipping table (ID) %lu: table has been renamed\n", (u_long) fit_table->tab_id);
		xt_checkpoint_set_flush_state(fit_table->tab_db, fit_table->tab_id, XT_CPT_STATE_DONE_ALL);
		goto table_gone;
	}

	if (!xt_flush_indices(ot, NULL, FALSE, this)) {
		xt_db_return_table_to_pool_ns(ot);
		return FAILED;
	}

	/* Success path falls through here as well: */
	table_gone:
	xt_db_return_table_to_pool_ns(ot);
	return OK;
}
 
4564
 
 
4565
/* Take a heap reference on the target table so it cannot be freed while
 * the flush task is queued or running. */
void XTFlushIndexTask::tk_reference()
{
	xt_heap_reference_ns(fit_table);
}
 
4569
 
 
4570
/* Release the heap reference taken by tk_reference(). */
void XTFlushIndexTask::tk_release()
{
	xt_heap_release_ns(fit_table);
}
 
4574
 
 
4575
/*
 
4576
 * Set notify_before_write to TRUE if the caller requires
 
4577
 * notification before the index file is written.
 
4578
 *
 
4579
 * This is used if the index is flushed due to lock of index cache.
 
4580
 */
 
4581
/*
 * Queue the table's index-flush task for asynchronous execution.
 *
 * notify_complete     - caller wants notification when the flush is done.
 * notify_before_write - caller wants notification before the index file
 *                       is written (used when flushing due to index-cache
 *                       pressure, see comment above).
 * Returns the result of scheduling the task.
 */
xtPublic xtBool xt_async_flush_indices(XTTableHPtr tab, xtBool notify_complete, xtBool notify_before_write, XTThreadPtr thread)
{
	/* Run the task: */
	return xt_run_async_task(tab->tab_ind_flush_task, notify_complete, notify_before_write, thread, tab->tab_db);
}
 
4586
 
 
4587
#if defined(PRINT_IND_FLUSH_STATS) || defined(TRACE_FLUSH_TIMES)
 
4588
 
 
4589
/*
 * Format a statistic value right-aligned in a 9-character field with two
 * decimals.  Zero and NaN both render as nine blanks.  Returns buffer.
 */
static char *idx_format(char *buffer, double v)
{
	if (v == 0.0) {
		strcpy(buffer, "         ");
		return buffer;
	}
	sprintf(buffer, "%9.2f", v);
	/* NaN would print as "      nan"; blank it out instead: */
	if (strcmp(buffer, "      nan") == 0)
		strcpy(buffer, "         ");
	return buffer;
}
 
4600
 
 
4601
/*
 * Format a KB statistic as MB, right-aligned in a 7-character field with
 * three decimals.  Zero and NaN both render as seven blanks.  Returns
 * buffer.
 */
static char *idx_format_mb(char *buffer, double v)
{
	if (v == 0.0) {
		strcpy(buffer, "       ");
		return buffer;
	}
	sprintf(buffer, "%7.3f", v / (double) 1024);
	/* NaN would print as "    nan"; blank it out instead: */
	if (strcmp(buffer, "    nan") == 0)
		strcpy(buffer, "       ");
	return buffer;
}
 
4612
#endif
 
4613
 
 
4614
#ifdef TRACE_FLUSH_TIMES
 
4615
 
 
4616
/* Which file was flushed, for idx_print() reporting: */
#define ILOG_FLUSH	1
#define INDEX_FLUSH	2

/* Snapshot of per-thread I/O statistics, used to compute deltas between
 * successive idx_print() calls (TRACE_FLUSH_TIMES builds only): */
struct idxstats {
	u_int i_log_flush;	/* index-log bytes at last flush */
	u_int i_log_write;	/* index-log bytes written */
	u_int idx_flush;	/* index-file bytes at last flush */
	u_int idx_write;	/* index-file bytes written */
};
 
4625
 
 
4626
static void idx_print(char *msg, XTThreadPtr thread, struct idxstats *st, xtWord8 *now, int flush)
 
4627
{
 
4628
        xtWord8 then, t;
 
4629
        double  ilogw, idxw;
 
4630
        double  dilogw, didxw;
 
4631
        char    buf1[30];
 
4632
        char    buf2[30];
 
4633
        char    buf3[30];
 
4634
        char    buf4[30];
 
4635
 
 
4636
        then = xt_trace_clock();
 
4637
        t = then - *now;
 
4638
        ilogw = (double) (thread->st_statistics.st_ilog.ts_write - st->i_log_write) / (double) 1024;
 
4639
        dilogw = ((double) ilogw * (double) 1000000) / (double) t;
 
4640
        idxw = (double) (thread->st_statistics.st_ind.ts_write - st->idx_write) / (double) 1024;
 
4641
        didxw = ((double) idxw * (double) 1000000) / (double) t;
 
4642
 
 
4643
        printf("%26s | TIME: %7d ", msg, (int) t);
 
4644
        printf("ILOG: %s - %s INDX: %s - %s\n", 
 
4645
                idx_format_mb(buf1, dilogw), idx_format(buf2, ilogw),
 
4646
                idx_format_mb(buf3, didxw), idx_format(buf4, idxw));
 
4647
        st->i_log_write = thread->st_statistics.st_ilog.ts_write;
 
4648
        st->idx_write = thread->st_statistics.st_ind.ts_write;
 
4649
 
 
4650
        switch (flush) {
 
4651
                case ILOG_FLUSH:
 
4652
                        ilogw = (double) (thread->st_statistics.st_ilog.ts_write - st->i_log_flush) / (double) 1024;
 
4653
                        dilogw = ((double) ilogw * (double) 1000000) / (double) t;
 
4654
                        printf("%26s | TIME: %7s ", " ", " ");
 
4655
                        printf("ILOG: %s - %s INDX: %s - %s\n", 
 
4656
                                idx_format_mb(buf1, dilogw), idx_format(buf2, ilogw),
 
4657
                                idx_format_mb(buf3, 0.0), idx_format(buf4, 0.0));
 
4658
                        st->i_log_flush = thread->st_statistics.st_ilog.ts_write;
 
4659
                        break;
 
4660
                case INDEX_FLUSH:
 
4661
                        idxw = (double) (thread->st_statistics.st_ind.ts_write - st->idx_flush) / (double) 1024;
 
4662
                        didxw = ((double) idxw * (double) 1000000) / (double) t;
 
4663
                        printf("%26s | TIME: %7s ", " ", " ");
 
4664
                        printf("ILOG: %s - %s INDX: %s - %s\n", 
 
4665
                                idx_format_mb(buf1, 0.0), idx_format(buf2, 0.0),
 
4666
                                idx_format_mb(buf3, didxw), idx_format(buf4, idxw));
 
4667
                        st->idx_flush = thread->st_statistics.st_ind.ts_write;
 
4668
                        break;
 
4669
        }
 
4670
 
 
4671
        *now = xt_trace_clock();
 
4672
}
 
4673
 
 
4674
#define TRACE_FLUSH(a, b, c, d, e)              idx_print(a, b, c, d, e)
 
4675
 
 
4676
#else // TRACE_FLUSH_TIMES
 
4677
 
 
4678
#define TRACE_FLUSH(a, b, c, d, e)
 
4679
 
 
4680
#endif // TRACE_FLUSH_TIMES
 
4681
 
 
4682
/* Flush the indexes of a table.
 
4683
 * If a ft is given, then this means this is an asynchronous flush.
 
4684
 */
 
4685
xtPublic xtBool xt_flush_indices(XTOpenTablePtr ot, off_t *bytes_flushed, xtBool have_table_lock, XTFlushIndexTask *fit)
 
4686
{
 
4687
        register XTTableHPtr    tab = ot->ot_table;
 
4688
        XTIndexLogPtr                   il;
 
4689
        XTIndexPtr                              *indp;
 
4690
        XTIndexPtr                              ind;
 
4691
        u_int                                   i, j;
 
4692
        XTIndBlockPtr                   block, fblock;
 
4693
        xtWord1                                 *data;
 
4694
        xtIndexNodeID                   ind_free;
 
4695
        xtBool                                  block_on_free_list = FALSE;
 
4696
        xtIndexNodeID                   last_address, next_address;
 
4697
        XTIndFreeListPtr                list_ptr;
 
4698
        u_int                                   dirty_blocks;
 
4699
        XTIndDirtyListItorRec   it;
 
4700
        //u_int                                 dirty_count;
 
4701
#ifdef TRACE_FLUSH_INDEX
 
4702
        time_t                                  tnow = 0;
 
4703
#endif
 
4704
 
 
4705
#ifdef TRACE_FLUSH_TIMES
 
4706
        XTThreadPtr thread = ot->ot_thread;
 
4707
        struct idxstats st;
 
4708
        xtWord8 now;
 
4709
        st.i_log_flush = thread->st_statistics.st_ilog.ts_write;
 
4710
        st.i_log_write = thread->st_statistics.st_ilog.ts_write;
 
4711
        st.idx_flush = thread->st_statistics.st_ind.ts_write;
 
4712
        st.idx_write = thread->st_statistics.st_ind.ts_write;
 
4713
        now = xt_trace_clock();
 
4714
#endif
 
4715
 
 
4716
#ifdef DEBUG_CHECK_IND_CACHE
 
4717
        xt_ind_check_cache(NULL);
 
4718
#endif
 
4719
        xt_lock_mutex_ns(&tab->tab_ind_flush_lock);
 
4720
        TRACE_FLUSH("LOCKED flush index lock", thread, &st, &now, 0);
 
4721
 
 
4722
        if (!xt_begin_checkpoint(tab->tab_db, have_table_lock, ot->ot_thread))
 
4723
                return FAILED;
 
4724
 
 
4725
        ASSERT_NS(!tab->tab_ind_flush_ilog);
 
4726
        if (!tab->tab_db->db_indlogs.ilp_get_log(&tab->tab_ind_flush_ilog, ot->ot_thread))
 
4727
                goto failed_3;
 
4728
        il = tab->tab_ind_flush_ilog;
 
4729
 
 
4730
        if (!il->il_reset(ot))
 
4731
                goto failed_2;
 
4732
        if (!il->il_write_byte(ot, XT_DT_LOG_HEAD))
 
4733
                goto failed_2;
 
4734
        if (!il->il_write_word4(ot, tab->tab_id))
 
4735
                goto failed_2;
 
4736
        if (!il->il_write_word4(ot, 0))
 
4737
                goto failed_2;
 
4738
        TRACE_FLUSH("reset ilog", thread, &st, &now, 0);
 
4739
 
 
4740
        /* Lock all: */
 
4741
        dirty_blocks = 0;
 
4742
        indp = tab->tab_dic.dic_keys;
 
4743
        for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
 
4744
                ind = *indp;
 
4745
                XT_INDEX_WRITE_LOCK(ind, ot);
 
4746
                if (ind->mi_free_list && ind->mi_free_list->fl_free_count)
 
4747
                        block_on_free_list = TRUE;
 
4748
                dirty_blocks += ind->mi_dirty_blocks;
 
4749
        }
 
4750
        TRACE_FLUSH("LOCKED all indexes", thread, &st, &now, 0);
 
4751
 
 
4752
        if (!dirty_blocks && !block_on_free_list) {
 
4753
                /* Nothing to flush... */
 
4754
                indp = tab->tab_dic.dic_keys;
 
4755
                for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
 
4756
                        ind = *indp;
 
4757
                        XT_INDEX_UNLOCK(ind, ot);
 
4758
                }
 
4759
                goto flush_done;
 
4760
        }
 
4761
 
 
4762
#ifdef TRACE_FLUSH_INDEX
 
4763
        tnow = time(NULL);
 
4764
        printf("FLUSH INDEX pages=%lu %s\n", (u_long) dirty_blocks, tab->tab_name->ps_path);
 
4765
#endif
 
4766
 
 
4767
        if (fit)
 
4768
                fit->fit_dirty_blocks = dirty_blocks;
 
4769
 
 
4770
        // 128 dirty blocks == 2MB
 
4771
        if (bytes_flushed)
 
4772
                *bytes_flushed += (dirty_blocks * XT_INDEX_PAGE_SIZE);
 
4773
 
 
4774
        /* Collect the index roots: */
 
4775
        data = tab->tab_index_head->tp_data;
 
4776
 
 
4777
        /* Collect a complete list of all dirty blocks: */
 
4778
        indp = tab->tab_dic.dic_keys;
 
4779
        for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
 
4780
                ind = *indp;
 
4781
                xt_spinlock_lock(&ind->mi_dirty_lock);
 
4782
                if ((block = ind->mi_dirty_list)) {
 
4783
                        while (block) {
 
4784
                                ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_DIRTY);
 
4785
#ifdef IND_OPT_DATA_WRITTEN
 
4786
                                ASSERT_NS(block->cb_max_pos <= XT_INDEX_PAGE_SIZE-2);
 
4787
#endif
 
4788
                                tab->tab_ind_dirty_list.dl_add_block(block);
 
4789
                                fblock = block->cb_dirty_next;
 
4790
                                block->cb_dirty_next = NULL;
 
4791
                                block->cb_dirty_prev = NULL;
 
4792
                                block->cb_state = IDX_CAC_BLOCK_FLUSHING;
 
4793
                                block = fblock;
 
4794
                        }
 
4795
                }
 
4796
                //dirty_count = ind->mi_dirty_blocks;
 
4797
                ind->mi_dirty_blocks = 0;
 
4798
                ind->mi_dirty_list = NULL;
 
4799
                xt_spinlock_unlock(&ind->mi_dirty_lock);
 
4800
                //ot->ot_thread->st_statistics.st_ind_cache_dirty -= dirty_count;
 
4801
                XT_SET_NODE_REF(tab, data, ind->mi_root);
 
4802
                data += XT_NODE_REF_SIZE;
 
4803
        }
 
4804
 
 
4805
        TRACE_FLUSH("Collected all blocks", thread, &st, &now, 0);
 
4806
 
 
4807
        xt_lock_mutex_ns(&tab->tab_ind_lock);
 
4808
        TRACE_FLUSH("LOCKED table index lock", thread, &st, &now, 0);
 
4809
 
 
4810
        /* Write the free list: */
 
4811
        if (block_on_free_list) {
 
4812
                union {
 
4813
                        xtWord1                         buffer[XT_BLOCK_SIZE_FOR_DIRECT_IO];
 
4814
                        XTIndFreeBlockRec       free_block;
 
4815
                } x;
 
4816
                memset(x.buffer, 0, sizeof(XTIndFreeBlockRec));
 
4817
 
 
4818
                /* The old start of the free list: */
 
4819
                XT_NODE_ID(ind_free) = 0;
 
4820
                /* This is a list of lists: */
 
4821
                while ((list_ptr = tab->tab_ind_free_list)) {
 
4822
                        /* If this free list still has unused blocks,
 
4823
                         * pick the first. That is the front of
 
4824
                         * the list of free blocks.
 
4825
                         */
 
4826
                        if (list_ptr->fl_start < list_ptr->fl_free_count) {
 
4827
                                ind_free = list_ptr->fl_page_id[list_ptr->fl_start];
 
4828
                                break;
 
4829
                        }
 
4830
                        /* This list is empty, free it: */
 
4831
                        tab->tab_ind_free_list = list_ptr->fl_next_list;
 
4832
                        xt_free_ns(list_ptr);
 
4833
                }
 
4834
                /* If nothing is on any list, then
 
4835
                 * take the value stored in the index header.
 
4836
                 * It is the from of the list on disk.
 
4837
                 */
 
4838
                if (!XT_NODE_ID(ind_free))
 
4839
                        ind_free = tab->tab_ind_free;
 
4840
 
 
4841
                if (!il->il_write_byte(ot, XT_DT_FREE_LIST))
 
4842
                        goto failed;
 
4843
                indp = tab->tab_dic.dic_keys;
 
4844
                XT_NODE_ID(last_address) = 0;
 
4845
                for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
 
4846
                        ind = *indp;
 
4847
                        if (ind->mi_free_list && ind->mi_free_list->fl_free_count) {
 
4848
                                for (j=0; j<ind->mi_free_list->fl_free_count; j++) {
 
4849
                                        next_address = ind->mi_free_list->fl_page_id[j];
 
4850
                                        /* Write out the IDs of the free blocks. */
 
4851
                                        if (!il->il_write_word4(ot, XT_NODE_ID(ind->mi_free_list->fl_page_id[j])))
 
4852
                                                goto failed;
 
4853
                                        if (XT_NODE_ID(last_address)) {
 
4854
                                                /* Update the page in cache, if it is in the cache! */
 
4855
                                                XT_SET_DISK_8(x.free_block.if_next_block_8, XT_NODE_ID(next_address));
 
4856
                                                if (!xt_ind_write_cache(ot, last_address, 8, x.buffer))
 
4857
                                                        goto failed;
 
4858
                                        }
 
4859
                                        last_address = next_address;
 
4860
                                }
 
4861
                        }
 
4862
                }
 
4863
                if (!il->il_write_word4(ot, XT_NODE_ID(ind_free)))
 
4864
                        goto failed;
 
4865
                if (XT_NODE_ID(last_address)) {
 
4866
                        XT_SET_DISK_8(x.free_block.if_next_block_8, XT_NODE_ID(tab->tab_ind_free));
 
4867
                        if (!xt_ind_write_cache(ot, last_address, 8, x.buffer))
 
4868
                                goto failed;
 
4869
                }
 
4870
                if (!il->il_write_word4(ot, 0xFFFFFFFF))
 
4871
                        goto failed;
 
4872
        }
 
4873
 
 
4874
        /*
 
4875
         * Add the free list caches to the global free list cache.
 
4876
         * Added backwards to match the write order.
 
4877
         */
 
4878
        indp = tab->tab_dic.dic_keys + tab->tab_dic.dic_key_count-1;
 
4879
        for (i=0; i<tab->tab_dic.dic_key_count; i++, indp--) {
 
4880
                ind = *indp;
 
4881
                if (ind->mi_free_list) {
 
4882
                        ind->mi_free_list->fl_next_list = tab->tab_ind_free_list;
 
4883
                        tab->tab_ind_free_list = ind->mi_free_list;
 
4884
                }
 
4885
                ind->mi_free_list = NULL;
 
4886
        }
 
4887
 
 
4888
        /*
 
4889
         * The new start of the free list is the first
 
4890
         * item on the table free list:
 
4891
         */
 
4892
        XT_NODE_ID(ind_free) = 0;
 
4893
        while ((list_ptr = tab->tab_ind_free_list)) {
 
4894
                if (list_ptr->fl_start < list_ptr->fl_free_count) {
 
4895
                        ind_free = list_ptr->fl_page_id[list_ptr->fl_start];
 
4896
                        break;
 
4897
                }
 
4898
                tab->tab_ind_free_list = list_ptr->fl_next_list;
 
4899
                xt_free_ns(list_ptr);
 
4900
        }
 
4901
        if (!XT_NODE_ID(ind_free))
 
4902
                ind_free = tab->tab_ind_free;
 
4903
        TRACE_FLUSH("did free block stuff", thread, &st, &now, 0);
 
4904
        xt_unlock_mutex_ns(&tab->tab_ind_lock);
 
4905
 
 
4906
        XT_SET_DISK_6(tab->tab_index_head->tp_ind_eof_6, XT_NODE_ID(tab->tab_ind_eof));
 
4907
        XT_SET_DISK_6(tab->tab_index_head->tp_ind_free_6, XT_NODE_ID(ind_free));
 
4908
 
 
4909
        TRACE_FLUSH("UN-LOCKED table index lock", thread, &st, &now, 0);
 
4910
        if (!il->il_write_header(ot, XT_INDEX_HEAD_SIZE, (xtWord1 *) tab->tab_index_head))
 
4911
                goto failed;
 
4912
 
 
4913
        TRACE_FLUSH("wrote ilog header", thread, &st, &now, 0);
 
4914
        indp = tab->tab_dic.dic_keys;
 
4915
        for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
 
4916
                ind = *indp;
 
4917
                XT_INDEX_UNLOCK(ind, ot);
 
4918
        }
 
4919
 
 
4920
        TRACE_FLUSH("UN-LOCKED all indexes", thread, &st, &now, 0);
 
4921
 
 
4922
        /* Write all blocks to the index log: */
 
4923
        if (tab->tab_ind_dirty_list.dl_total_blocks) {
 
4924
                tab->tab_ind_dirty_list.dl_sort_blocks();
 
4925
                it.dli_reset();
 
4926
                while ((block = tab->tab_ind_dirty_list.dl_next_block(&it))) {
 
4927
                        XT_IPAGE_WRITE_LOCK(&block->cb_lock, ot->ot_thread->t_id);
 
4928
                        if (block->cb_state == IDX_CAC_BLOCK_FLUSHING) {
 
4929
                                if (!il->il_write_block(ot, block)) {
 
4930
                                        XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
 
4931
                                        goto failed_2;
 
4932
                                }
 
4933
                        }
 
4934
                        XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
 
4935
                }
 
4936
        }
 
4937
 
 
4938
        /* {PAGE-NO-IN-INDEX-FILE}
 
4939
         * At this point all blocks have been written to the index file.
 
4940
         * It is not safe to release them from the cache.
 
4941
         * It was not safe to do this before this point because
 
4942
         * read would read the wrong data.
 
4943
         *
 
4944
         * The exception to this is freed blocks.
 
4945
         * These are cached separately in the free block list.
 
4946
         */
 
4947
        TRACE_FLUSH("Wrote all blocks to the ilog", thread, &st, &now, 0);
 
4948
 
 
4949
        if (il->il_data_written()) {
 
4950
                /* Flush the log before we flush the index.
 
4951
                 *
 
4952
                 * The reason is, we must make sure that changes that
 
4953
                 * will be in the index are already in the transaction
 
4954
                 * log.
 
4955
                 *
 
4956
                 * Only then are we able to undo those changes on
 
4957
                 * recovery.
 
4958
                 *
 
4959
                 * Simple example:
 
4960
                 * CREATE TABLE t1 (s1 INT PRIMARY KEY);
 
4961
                 * INSERT INTO t1 VALUES (1);
 
4962
                 *
 
4963
                 * BEGIN;
 
4964
                 * INSERT INTO t1 VALUES (2);
 
4965
                 *
 
4966
                 * --- INDEX IS FLUSHED HERE ---
 
4967
                 *
 
4968
                 * --- SERVER CRASH HERE ---
 
4969
                 *
 
4970
                 *
 
4971
                 * The INSERT VALUES (2) has been written
 
4972
                 * to the log, but not flushed.
 
4973
                 * But the index has been updated.
 
4974
                 * If the index is flushed it will contain
 
4975
                 * the entry for record with s1=2.
 
4976
                 * 
 
4977
                 * This entry must be removed on recovery.
 
4978
                 *
 
4979
                 * To prevent this situation I flush the log
 
4980
                 * here.
 
4981
                 */
 
4982
                if (!XT_IS_TEMP_TABLE(tab->tab_dic.dic_tab_flags)) {
 
4983
                        /* Note, thread->st_database may not be set here:
 
4984
                         * #0   0x00defba3 in xt_spinlock_set at lock_xt.h:249
 
4985
                         * #1   0x00defbfd in xt_spinlock_lock at lock_xt.h:299
 
4986
                         * #2   0x00e65a98 in XTDatabaseLog::xlog_flush_pending at xactlog_xt.cc:737
 
4987
                         * #3   0x00e6a27f in XTDatabaseLog::xlog_flush at xactlog_xt.cc:727
 
4988
                         * #4   0x00e6a308 in xt_xlog_flush_log at xactlog_xt.cc:1476
 
4989
                         * #5   0x00e22fee in xt_flush_indices at index_xt.cc:4599
 
4990
                         * #6   0x00e49fe0 in xt_close_table at table_xt.cc:2678
 
4991
                         * #7   0x00df0f10 in xt_db_pool_exit at database_xt.cc:758
 
4992
                         * #8   0x00df3ca2 in db_finalize at database_xt.cc:342
 
4993
                         * #9   0x00e17037 in xt_heap_release at heap_xt.cc:110
 
4994
                         * #10  0x00df07c0 in db_hash_free at database_xt.cc:245
 
4995
                         * #11  0x00e0b145 in xt_free_hashtable at hashtab_xt.cc:84
 
4996
                         * #12  0x00df093e in xt_exit_databases at database_xt.cc:303
 
4997
                         * #13  0x00e0d3cf in pbxt_call_exit at ha_pbxt.cc:946
 
4998
                         * #14  0x00e0d498 in ha_exit at ha_pbxt.cc:975
 
4999
                         * #15  0x00e0dce6 in pbxt_end at ha_pbxt.cc:1274
 
5000
                         * #16  0x00e0dd03 in pbxt_panic at ha_pbxt.cc:1287
 
5001
                         * #17  0x002321a1 in ha_finalize_handlerton at handler.cc:392
 
5002
                         * #18  0x002f0567 in plugin_deinitialize at sql_plugin.cc:815
 
5003
                         * #19  0x002f370e in reap_plugins at sql_plugin.cc:903
 
5004
                         * #20  0x002f3eac in plugin_shutdown at sql_plugin.cc:1512
 
5005
                         * #21  0x000f3ba6 in clean_up at mysqld.cc:1238
 
5006
                         * #22  0x000f4046 in unireg_end at mysqld.cc:1166
 
5007
                         * #23  0x000fc2c5 in kill_server at mysqld.cc:1108
 
5008
                         * #24  0x000fd0d5 in kill_server_thread at mysqld.cc:1129
 
5009
                         */
 
5010
                        if (!tab->tab_db->db_xlog.xlog_flush(ot->ot_thread))
 
5011
                                goto failed_2;
 
5012
                        TRACE_FLUSH("FLUSHED xlog", thread, &st, &now, 0);
 
5013
                }
 
5014
 
 
5015
                if (!il->il_flush(ot))
 
5016
                        goto failed_2;
 
5017
                TRACE_FLUSH("FLUSHED ilog", thread, &st, &now, ILOG_FLUSH);
 
5018
 
 
5019
                if (!il->il_apply_log_write(ot))
 
5020
                        goto failed_2;
 
5021
 
 
5022
                TRACE_FLUSH("wrote all blocks to index", thread, &st, &now, 0);
 
5023
                /*
 
5024
                 * {NOT-IN-IND-FILE}
 
5025
                 * 1.0.01 - I have split apply log into flush and write
 
5026
                 * parts.
 
5027
                 *
 
5028
                 * I have to write before waiting threads can continue
 
5029
                 * because otherwise incorrect data will be read
 
5030
                 * when the cache block is freed before it is written
 
5031
                 * here.
 
5032
                 *
 
5033
                 * Risk: (see below).
 
5034
                 */
 
5035
                it.dli_reset();
 
5036
                while ((block = tab->tab_ind_dirty_list.dl_next_block(&it))) {
 
5037
                        XT_IPAGE_WRITE_LOCK(&block->cb_lock, ot->ot_thread->t_id);
 
5038
                        if (block->cb_state == IDX_CAC_BLOCK_LOGGED) {
 
5039
                                block->cb_state = IDX_CAC_BLOCK_CLEAN;
 
5040
                                ot->ot_thread->st_statistics.st_ind_cache_dirty--;
 
5041
                        }
 
5042
                        XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
 
5043
                }
 
5044
 
 
5045
                /* This will early notification for threads waiting for this operation
 
5046
                 * to get to this point.
 
5047
                 */
 
5048
                if (fit) {
 
5049
                        fit->fit_blocks_flushed = fit->fit_dirty_blocks;
 
5050
                        fit->fit_dirty_blocks = 0;
 
5051
                        xt_async_task_notify(fit);
 
5052
                }
 
5053
 
 
5054
                /*
 
5055
                 * 1.0.01 - Note: I have moved the flush to here.
 
5056
                 * It allows the calling thread to continue during the
 
5057
                 * flush, but:
 
5058
                 *
 
5059
                 * The  problem is, if the ilog flush fails then we have
 
5060
                 * lost the information to re-create a consistent flush again!
 
5061
                 */
 
5062
                if (!il->il_apply_log_flush(ot))
 
5063
                        goto failed_2;
 
5064
                TRACE_FLUSH("FLUSHED index file", thread, &st, &now, INDEX_FLUSH);
 
5065
        }
 
5066
 
 
5067
        flush_done:
 
5068
        tab->tab_ind_dirty_list.dl_free_all();
 
5069
        tab->tab_ind_flush_ilog = NULL;
 
5070
        il->il_release();
 
5071
 
 
5072
        /* Mark this table as index flushed: */
 
5073
        xt_checkpoint_set_flush_state(tab->tab_db, tab->tab_id, XT_CPT_STATE_DONE_INDEX);
 
5074
        TRACE_FLUSH("set index state", thread, &st, &now, 0);
 
5075
 
 
5076
#ifdef TRACE_FLUSH_INDEX
 
5077
        if (tnow) {
 
5078
                printf("flush index (%d) %s DONE\n", (int) (time(NULL) - tnow), tab->tab_name->ps_path);
 
5079
                fflush(stdout);
 
5080
        }
 
5081
#endif
 
5082
 
 
5083
        xt_unlock_mutex_ns(&tab->tab_ind_flush_lock);
 
5084
        TRACE_FLUSH("UN-LOCKED flush index lock", thread, &st, &now, 0);
 
5085
 
 
5086
        if (!xt_end_checkpoint(tab->tab_db, ot->ot_thread, NULL))
 
5087
                return FAILED;
 
5088
 
 
5089
#ifdef DEBUG_CHECK_IND_CACHE
 
5090
        xt_ind_check_cache((XTIndex *) 1);
 
5091
#endif
 
5092
        return OK;
 
5093
 
 
5094
        failed:
 
5095
        indp = tab->tab_dic.dic_keys;
 
5096
        for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
 
5097
                ind = *indp;
 
5098
                XT_INDEX_UNLOCK(ind, ot);
 
5099
        }
 
5100
 
 
5101
        failed_2:
 
5102
        tab->tab_ind_dirty_list.dl_free_all();
 
5103
        tab->tab_ind_flush_ilog = NULL;
 
5104
        il->il_release();
 
5105
 
 
5106
        failed_3:
 
5107
        xt_checkpoint_set_flush_state(tab->tab_db, tab->tab_id, XT_CPT_STATE_STOP_INDEX);
 
5108
 
 
5109
#ifdef TRACE_FLUSH_INDEX
 
5110
        if (tnow) {
 
5111
                printf("flush index (%d) %s FAILED\n", (int) (time(NULL) - tnow), tab->tab_name->ps_path);
 
5112
                fflush(stdout);
 
5113
        }
 
5114
#endif
 
5115
 
 
5116
        xt_unlock_mutex_ns(&tab->tab_ind_flush_lock);
 
5117
#ifdef DEBUG_CHECK_IND_CACHE
 
5118
        xt_ind_check_cache(NULL);
 
5119
#endif
 
5120
        return FAILED;
 
5121
}
 
5122
 
 
5123
/*
 * Initialize the index log pool for a database, and recover any index
 * logs left behind by a crash: scan the database's system directory for
 * "ilog-*" files, and for each log that has a table ID and a non-zero
 * EOF, re-apply its contents to the table's index file.
 *
 * self:            the initializing thread (errors are thrown on it).
 * db:              the owning database.
 * log_buffer_size: buffer size used when logs from this pool are opened.
 *
 * Throws (via xt_throw) on failure; the partially opened log/table are
 * closed first.
 */
void XTIndexLogPool::ilp_init(struct XTThread *self, struct XTDatabase *db, size_t log_buffer_size)
{
	char			path[PATH_MAX];
	XTOpenDirPtr	od;
	xtLogID			log_id;
	char			*file;
	XTIndexLogPtr	il = NULL;
	XTOpenTablePtr	ot = NULL;

	ilp_db = db;
	ilp_log_buffer_size = log_buffer_size;
	xt_init_mutex_with_autoname(self, &ilp_lock);

	xt_strcpy(PATH_MAX, path, db->db_main_path);
	xt_add_system_dir(PATH_MAX, path);
	if (xt_fs_exists(path)) {
		/* pushsr_/freer_ are PBXT cleanup-stack macros: the directory
		 * handle is closed automatically if anything below throws. */
		pushsr_(od, xt_dir_close, xt_dir_open(self, path, NULL));
		while (xt_dir_next(self, od)) {
			file = xt_dir_name(self, od);
			if (xt_starts_with(file, "ilog")) {
				if ((log_id = (xtLogID) xt_file_name_to_id(file))) {
					if (!ilp_open_log(&il, log_id, FALSE, self))
						goto failed;
					if (il->il_tab_id && il->il_log_eof) {
						/* The log has unapplied data: replay it into the
						 * table's index file, then flush. */
						if (!il->il_open_table(&ot))
							goto failed;
						if (ot) {
							if (!il->il_apply_log_write(ot))
								goto failed;
							if (!il->il_apply_log_flush(ot))
								goto failed;
							ot->ot_thread = self;
							il->il_close_table(ot);
						}
					}
					il->il_close(TRUE);
				}
			}
		}
		freer_(); // xt_dir_close(od)
	}
	return;

	failed:
	/* TODO: Mark index as corrupted: */
	if (ot && il)
		il->il_close_table(ot);
	if (il)
		il->il_close(FALSE);
	xt_throw(self);
}
 
5174
 
 
5175
void XTIndexLogPool::ilp_close(struct XTThread *XT_UNUSED(self), xtBool lock)
 
5176
{
 
5177
        XTIndexLogPtr   il;
 
5178
 
 
5179
        if (lock)
 
5180
                xt_lock_mutex_ns(&ilp_lock);
 
5181
        while ((il = ilp_log_pool)) {
 
5182
                ilp_log_pool = il->il_next_in_pool;
 
5183
                il_pool_count--;
 
5184
                il->il_close(TRUE);
 
5185
        }
 
5186
        if (lock)
 
5187
                xt_unlock_mutex_ns(&ilp_lock);
 
5188
}
 
5189
 
 
5190
/*
 * Shut down the index log pool: close all pooled logs (without taking
 * the pool lock — callers are single-threaded at this point) and free
 * the pool mutex.
 */
void XTIndexLogPool::ilp_exit(struct XTThread *self)
{
	ilp_close(self, FALSE);
	/* ilp_close() must have emptied the pool: */
	ASSERT_NS(il_pool_count == 0);
	xt_free_mutex(&ilp_lock);
}
 
5196
 
 
5197
void XTIndexLogPool::ilp_name(size_t size, char *path, xtLogID log_id)
 
5198
{
 
5199
        char name[50];
 
5200
 
 
5201
        sprintf(name, "ilog-%lu.xt", (u_long) log_id);
 
5202
        xt_strcpy(size, path, ilp_db->db_main_path);
 
5203
        xt_add_system_dir(size, path);
 
5204
        xt_add_dir_char(size, path);
 
5205
        xt_strcat(size, path, name);
 
5206
}
 
5207
 
 
5208
/*
 * Allocate and open one index log file.
 *
 * ret_il: receives the opened log on success.
 * log_id: determines the file name (see ilp_name()).
 * excl:   open the file exclusively (used when creating a fresh log;
 *         recovery opens non-exclusively).
 * thread: used for I/O statistics and error context.
 *
 * If the file already contains a valid header, il_tab_id/il_log_eof are
 * loaded from it (non-zero values mean the log has unapplied data —
 * see ilp_init()); otherwise both are set to 0.
 *
 * Returns OK/FAILED; on failure the partially constructed log is closed
 * and freed via il_close(FALSE).
 */
xtBool XTIndexLogPool::ilp_open_log(XTIndexLogPtr *ret_il, xtLogID log_id, xtBool excl, XTThreadPtr thread)
{
	char				log_path[PATH_MAX];
	XTIndexLogPtr		il;
	XTIndLogHeadDRec	log_head;
	size_t				read_size;

	ilp_name(PATH_MAX, log_path, log_id);
	if (!(il = (XTIndexLogPtr) xt_calloc_ns(sizeof(XTIndexLogRec))))
		return FAILED;
	xt_spinlock_init_with_autoname(NULL, &il->il_write_lock);
	il->il_log_id = log_id;
	il->il_pool = this;

	/* Writes will be rounded up to the nearest direct write block size (see {WRITE-IN-BLOCKS}),
	 * so make sure we have space in the buffer for that:
	 */
#ifdef DEBUG
	if (IND_WRITE_BLOCK_SIZE < XT_BLOCK_SIZE_FOR_DIRECT_IO)
		ASSERT_NS(FALSE);
#ifdef IND_WRITE_IN_BLOCK_SIZES
	if (XT_INDEX_PAGE_SIZE < IND_WRITE_BLOCK_SIZE)
		ASSERT_NS(FALSE);
#endif
#endif
#ifdef IND_FILL_BLOCK_TO_NEXT
	if (!(il->il_buffer = (xtWord1 *) xt_malloc_ns(ilp_log_buffer_size + XT_INDEX_PAGE_SIZE)))
		goto failed;
#else
	if (!(il->il_buffer = (xtWord1 *) xt_malloc_ns(ilp_log_buffer_size + IND_WRITE_BLOCK_SIZE)))
		goto failed;
#endif
	il->il_buffer_size = ilp_log_buffer_size;

	/* Create the file (and any missing path components) if necessary: */
	if (!(il->il_of = xt_open_file_ns(log_path, XT_FT_STANDARD, (excl ? XT_FS_EXCLUSIVE : 0) | XT_FS_CREATE | XT_FS_MAKE_PATH, 0)))
		goto failed;

	if (!xt_pread_file(il->il_of, 0, sizeof(XTIndLogHeadDRec), 0, &log_head, &read_size, &thread->st_statistics.st_ilog, thread))
		goto failed;

	/* A short read means the file is new/empty — no header yet: */
	if (read_size == sizeof(XTIndLogHeadDRec)) {
		il->il_tab_id = XT_GET_DISK_4(log_head.ilh_tab_id_4);
		il->il_log_eof = XT_GET_DISK_4(log_head.ilh_log_eof_4);
	}
	else {
		il->il_tab_id = 0;
		il->il_log_eof = 0;
	}

	*ret_il = il;
	return OK;

	failed:
	il->il_close(FALSE);
	return FAILED;
}
 
5264
 
 
5265
/*
 * Get an index log for use: take one from the pool if available,
 * otherwise open a new log under the next free log ID.
 *
 * Returns OK and sets *ret_il, or FAILED if a new log could not be
 * opened.
 */
xtBool XTIndexLogPool::ilp_get_log(XTIndexLogPtr *ret_il, XTThreadPtr thread)
{
	XTIndexLogPtr	il;
	xtLogID			log_id = 0;

	xt_lock_mutex_ns(&ilp_lock);
	if ((il = ilp_log_pool)) {
		/* Reuse a pooled log: unlink it from the free list. */
		ilp_log_pool = il->il_next_in_pool;
		il_pool_count--;
	}
	else {
		/* Pool is empty: reserve a fresh log ID. */
		ilp_next_log_id++;
		log_id = ilp_next_log_id;
	}
	xt_unlock_mutex_ns(&ilp_lock);
	if (!il) {
		/* The open (file I/O) is done after releasing the mutex. */
		if (!ilp_open_log(&il, log_id, TRUE, thread))
			return FAILED;
	}
	*ret_il= il;
	return OK;
}
 
5287
 
 
5288
void XTIndexLogPool::ilp_release_log(XTIndexLogPtr il)
 
5289
{
 
5290
        xt_lock_mutex_ns(&ilp_lock);
 
5291
        if (il_pool_count == 5)
 
5292
                il->il_close(TRUE);
 
5293
        else {
 
5294
                il_pool_count++;
 
5295
                il->il_next_in_pool = ilp_log_pool;
 
5296
                ilp_log_pool = il;
 
5297
        }
 
5298
        xt_unlock_mutex_ns(&ilp_lock);
 
5299
}
 
5300
 
 
5301
/*
 * Reset this log for reuse by the given table: clear all buffer state
 * and write (and flush) a fresh header with a zero EOF.
 *
 * Returns OK, or FAILED if the header write or flush fails.
 */
xtBool XTIndexLog::il_reset(struct XTOpenTable *ot)
{
	XTIndLogHeadDRec	log_head;
	xtTableID			tab_id = ot->ot_table->tab_id;

	il_tab_id = tab_id;
	il_log_eof = 0;
	il_buffer_len = 0;
	il_buffer_offset = 0;

	/* We must write the header and flush here or the "previous" status (from the
	 * last flush run) could remain. Failure to write the file completely leave the
	 * old header in place, and other parts of the file changed.
	 * This would lead to index corruption.
	 */
	log_head.ilh_data_type = XT_DT_LOG_HEAD;
	XT_SET_DISK_4(log_head.ilh_tab_id_4, tab_id);
	XT_SET_DISK_4(log_head.ilh_log_eof_4, 0);

	if (!xt_pwrite_file(il_of, 0, sizeof(XTIndLogHeadDRec), (xtWord1 *) &log_head, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
		return FAILED;

	if (!xt_flush_file(il_of, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
		return FAILED;

	return OK;
}
 
5328
 
 
5329
xtBool XTIndexLog::il_data_written()
 
5330
{
 
5331
        return il_buffer_offset != 0 || il_buffer_len != 0;
 
5332
}
 
5333
 
 
5334
/*
 * Close the index log: close the file handle, optionally delete the
 * log file on disk, free the buffer and finally free the log object
 * itself.
 *
 * delete_it - if TRUE (and the log has an ID) the on-disk file is
 *             removed as well.
 *
 * NOTE: 'this' is freed at the end; the object must not be used after
 * this call returns.
 */
void XTIndexLog::il_close(xtBool delete_it)
{
	xtLogID log_id = il_log_id;

	if (il_of) {
		xt_close_file_ns(il_of);
		il_of = NULL;
	}
	
	if (delete_it && log_id) {
		char	log_path[PATH_MAX];

		il_pool->ilp_name(PATH_MAX, log_path, log_id);
		xt_fs_delete(NULL, log_path);
	}

	if (il_buffer) {
		xt_free_ns(il_buffer);
		il_buffer = NULL;
	}

	xt_spinlock_free(NULL, &il_write_lock);
	xt_free_ns(this);
}
 
5358
 
 
5359
/*
 * Hand this log back to its owning pool
 * (see XTIndexLogPool::ilp_release_log).
 */
void XTIndexLog::il_release()
{
	il_pool->ilp_db->db_indlogs.ilp_release_log(this);
}
 
5363
 
 
5364
xtBool XTIndexLog::il_require_space(size_t bytes, XTThreadPtr thread)
 
5365
{
 
5366
        if (il_buffer_len + bytes > il_buffer_size) {
 
5367
                if (!xt_pwrite_file(il_of, il_buffer_offset, il_buffer_len, il_buffer, &thread->st_statistics.st_ilog, thread))
 
5368
                        return FAILED;
 
5369
                il_buffer_offset += il_buffer_len;
 
5370
                il_buffer_len = 0;
 
5371
        }
 
5372
 
 
5373
        return OK;
 
5374
}
 
5375
 
 
5376
xtBool XTIndexLog::il_write_byte(struct XTOpenTable *ot, xtWord1 byte)
 
5377
{
 
5378
        if (!il_require_space(1, ot->ot_thread))
 
5379
                return FAILED;
 
5380
        *(il_buffer + il_buffer_len) = byte;
 
5381
        il_buffer_len++;
 
5382
        return OK;
 
5383
}
 
5384
 
 
5385
xtBool XTIndexLog::il_write_word4(struct XTOpenTable *ot, xtWord4 value)
 
5386
{
 
5387
        xtWord1 *buffer;
 
5388
 
 
5389
        if (!il_require_space(4, ot->ot_thread))
 
5390
                return FAILED;
 
5391
        buffer = il_buffer + il_buffer_len;
 
5392
        XT_SET_DISK_4(buffer, value);
 
5393
        il_buffer_len += 4;
 
5394
        return OK;
 
5395
}
 
5396
 
 
5397
/*
 * Log the modified portion of an index cache block so it can be
 * re-applied during recovery.
 *
 * This function assumes that the block is xlocked!
 *
 * With IND_WRITE_MIN_DATA only the changed byte range
 * (cb_min_pos..cb_max_pos, plus the page header if cb_header is set)
 * is logged, choosing the smallest matching record type; otherwise the
 * whole block is logged as a full index page. On success the block's
 * dirty-range tracking is reset and its state advances from
 * IDX_CAC_BLOCK_FLUSHING to IDX_CAC_BLOCK_LOGGED.
 *
 * Returns OK or FAILED (buffer flush failure).
 */
xtBool XTIndexLog::il_write_block(struct XTOpenTable *ot, XTIndBlockPtr block)
{
	xtIndexNodeID		node_id;
	XTIdxBranchDPtr		node;
	u_int				block_len;

	node_id = block->cb_address;
	node = (XTIdxBranchDPtr) block->cb_data;
	block_len = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(node->tb_size_2));
	
	//il_min_byte_count += (block->cb_max_pos - block->cb_min_pos) + (block->cb_header ? XT_INDEX_PAGE_HEAD_SIZE : 0);
#ifdef IND_WRITE_MIN_DATA
	u_int				max_pos;
	u_int				min_pos;
	xtBool				eo_block = FALSE;

#ifdef IND_WRITE_IN_BLOCK_SIZES
	/* Round up to block boundary: */
	max_pos = ((block->cb_max_pos + XT_INDEX_PAGE_HEAD_SIZE + IND_WRITE_BLOCK_SIZE - 1) / IND_WRITE_BLOCK_SIZE) * IND_WRITE_BLOCK_SIZE;
	if (max_pos > block_len)
		max_pos = block_len;
	max_pos -= XT_INDEX_PAGE_HEAD_SIZE;
	/* Round down to block boundary: */
	min_pos = ((block->cb_min_pos + XT_INDEX_PAGE_HEAD_SIZE) / IND_WRITE_BLOCK_SIZE) * IND_WRITE_BLOCK_SIZE;
	if (min_pos > 0)
		min_pos -= XT_INDEX_PAGE_HEAD_SIZE;
#else
	max_pos = block->cb_max_pos;
	min_pos = block->cb_min_pos;
#endif

	/* Does the modified range reach the end of the block? */
	if (block_len == max_pos + XT_INDEX_PAGE_HEAD_SIZE)
		eo_block = TRUE;
		
	ASSERT_NS(max_pos <= XT_INDEX_PAGE_SIZE-XT_INDEX_PAGE_HEAD_SIZE);
	ASSERT_NS(min_pos <= block_len-XT_INDEX_PAGE_HEAD_SIZE);
	ASSERT_NS(max_pos <= block_len-XT_INDEX_PAGE_HEAD_SIZE);
	ASSERT_NS(min_pos <= max_pos);

	xt_spinlock_lock(&il_write_lock);
	if (block->cb_min_pos == block->cb_max_pos) {
		/* This means just the header was changed. */
#ifdef IND_WRITE_IN_BLOCK_SIZES
		XTIndShortPageDataDPtr		sh_data;

		if (!il_require_space(offsetof(XTIndShortPageDataDRec, ild_data) + IND_WRITE_BLOCK_SIZE, ot->ot_thread))
			goto failed;

		sh_data = (XTIndShortPageDataDPtr) (il_buffer + il_buffer_len);
		sh_data->ild_data_type = XT_DT_SHORT_IND_PAGE;
		XT_SET_DISK_4(sh_data->ild_page_id_4, XT_NODE_ID(node_id));
		XT_SET_DISK_2(sh_data->ild_size_2, IND_WRITE_BLOCK_SIZE);
		memcpy(sh_data->ild_data, block->cb_data, IND_WRITE_BLOCK_SIZE);
		il_buffer_len += offsetof(XTIndShortPageDataDRec, ild_data) + IND_WRITE_BLOCK_SIZE;
#else
		XTIndSetPageHeadDataDPtr	sph_data;

		if (!il_require_space(sizeof(XTIndSetPageHeadDataDRec), ot->ot_thread))
			goto failed;

		ASSERT_NS(sizeof(XTIndSetPageHeadDataDRec) <= il_buffer_size);

		sph_data = (XTIndSetPageHeadDataDPtr) (il_buffer + il_buffer_len);
		sph_data->ild_data_type = XT_DT_SET_PAGE_HEAD;
		XT_SET_DISK_4(sph_data->ild_page_id_4, XT_NODE_ID(node_id));
		XT_COPY_DISK_2(sph_data->ild_page_head_2, block->cb_data);
		il_buffer_len += sizeof(XTIndSetPageHeadDataDRec);
#endif
	}
#ifdef IND_WRITE_IN_BLOCK_SIZES
	else if (min_pos == 0 || (block->cb_header && min_pos == IND_WRITE_BLOCK_SIZE - XT_INDEX_PAGE_HEAD_SIZE))
#else
	else if (min_pos < 16 - XT_INDEX_PAGE_HEAD_SIZE)
#endif
	{
		/* Fuse, and write the whole block: */
		if (eo_block) {
			XTIndPageDataDPtr	p_data;

			if (!il_require_space(offsetof(XTIndPageDataDRec, ild_data) + block_len, ot->ot_thread))
				goto failed;

			ASSERT_NS(offsetof(XTIndPageDataDRec, ild_data) + block_len <= il_buffer_size);

			p_data = (XTIndPageDataDPtr) (il_buffer + il_buffer_len);
			p_data->ild_data_type = XT_DT_INDEX_PAGE;
			XT_SET_DISK_4(p_data->ild_page_id_4, XT_NODE_ID(node_id));
			memcpy(p_data->ild_data, block->cb_data, block_len);
			il_buffer_len += offsetof(XTIndPageDataDRec, ild_data) + block_len;
		}
		else {
			XTIndShortPageDataDPtr	sp_data;

			/* Log only up to the end of the modified range: */
			block_len = max_pos + XT_INDEX_PAGE_HEAD_SIZE;

			if (!il_require_space(offsetof(XTIndShortPageDataDRec, ild_data) + block_len, ot->ot_thread))
				goto failed;

			ASSERT_NS(offsetof(XTIndShortPageDataDRec, ild_data) + block_len <= il_buffer_size);

			sp_data = (XTIndShortPageDataDPtr) (il_buffer + il_buffer_len);
			sp_data->ild_data_type = XT_DT_SHORT_IND_PAGE;
			XT_SET_DISK_4(sp_data->ild_page_id_4, XT_NODE_ID(node_id));
			XT_SET_DISK_2(sp_data->ild_size_2, block_len);
			memcpy(sp_data->ild_data, block->cb_data, block_len);
			il_buffer_len += offsetof(XTIndShortPageDataDRec, ild_data) + block_len;
		}
	}
	else {
		/* Log only the modified middle range (plus the header, when it
		 * changed too): */
		block_len = max_pos - min_pos;

		if (block->cb_header) {
#ifdef IND_WRITE_IN_BLOCK_SIZES
			XTIndDoubleModPageDataDPtr	dd_data;

			ASSERT_NS(min_pos + XT_INDEX_PAGE_HEAD_SIZE >= 2*IND_WRITE_BLOCK_SIZE);
			ASSERT_NS((min_pos + XT_INDEX_PAGE_HEAD_SIZE) % IND_WRITE_BLOCK_SIZE == 0);
			if (!il_require_space(offsetof(XTIndDoubleModPageDataDRec, dld_data) + IND_WRITE_BLOCK_SIZE + block_len, ot->ot_thread))
				goto failed;

			dd_data = (XTIndDoubleModPageDataDPtr) (il_buffer + il_buffer_len);
			dd_data->dld_data_type = eo_block ? XT_DT_2_MOD_IND_PAGE_EOB : XT_DT_2_MOD_IND_PAGE;
			XT_SET_DISK_4(dd_data->dld_page_id_4, XT_NODE_ID(node_id));
			XT_SET_DISK_2(dd_data->dld_size1_2, IND_WRITE_BLOCK_SIZE);
			XT_SET_DISK_2(dd_data->dld_offset2_2, min_pos);
			XT_SET_DISK_2(dd_data->dld_size2_2, block_len);
			memcpy(dd_data->dld_data, block->cb_data, IND_WRITE_BLOCK_SIZE);
			memcpy(dd_data->dld_data + IND_WRITE_BLOCK_SIZE, block->cb_data + XT_INDEX_PAGE_HEAD_SIZE + min_pos, block_len);
			il_buffer_len += offsetof(XTIndDoubleModPageDataDRec, dld_data) + IND_WRITE_BLOCK_SIZE + block_len;
#else
			XTIndModPageHeadDataDPtr	mph_data;

			if (!il_require_space(offsetof(XTIndModPageHeadDataDRec, ild_data) + block_len, ot->ot_thread))
				goto failed;

			mph_data = (XTIndModPageHeadDataDPtr) (il_buffer + il_buffer_len);
			mph_data->ild_data_type = eo_block ? XT_DT_MOD_IND_PAGE_HEAD_EOB : XT_DT_MOD_IND_PAGE_HEAD;
			XT_SET_DISK_4(mph_data->ild_page_id_4, XT_NODE_ID(node_id));
			XT_SET_DISK_2(mph_data->ild_size_2, block_len);
			XT_SET_DISK_2(mph_data->ild_offset_2, min_pos);
			XT_COPY_DISK_2(mph_data->ild_page_head_2, block->cb_data);
			memcpy(mph_data->ild_data, block->cb_data + XT_INDEX_PAGE_HEAD_SIZE + min_pos, block_len);
			il_buffer_len += offsetof(XTIndModPageHeadDataDRec, ild_data) + block_len;
#endif
		}
		else {
			XTIndModPageDataDPtr	mp_data;

			if (!il_require_space(offsetof(XTIndModPageDataDRec, ild_data) + block_len, ot->ot_thread))
				goto failed;

			mp_data = (XTIndModPageDataDPtr) (il_buffer + il_buffer_len);
			mp_data->ild_data_type = eo_block ? XT_DT_MOD_IND_PAGE_EOB : XT_DT_MOD_IND_PAGE;
			XT_SET_DISK_4(mp_data->ild_page_id_4, XT_NODE_ID(node_id));
			XT_SET_DISK_2(mp_data->ild_size_2, block_len);
			XT_SET_DISK_2(mp_data->ild_offset_2, min_pos);
			memcpy(mp_data->ild_data, block->cb_data + XT_INDEX_PAGE_HEAD_SIZE + min_pos, block_len);
			il_buffer_len += offsetof(XTIndModPageDataDRec, ild_data) + block_len;
		}
	}

	/* Reset the dirty-range tracking of the block: */
	block->cb_header = FALSE;
	block->cb_min_pos = 0xFFFF;
	block->cb_max_pos = 0;

#else // IND_WRITE_MIN_DATA
	/* Minimal-data logging disabled: always log the complete page. */
	XTIndPageDataDPtr	page_data;

	if (!il_require_space(offsetof(XTIndPageDataDRec, ild_data) + block_len, ot->ot_thread))
		goto failed;

	ASSERT_NS(offsetof(XTIndPageDataDRec, ild_data) + XT_INDEX_PAGE_SIZE <= il_buffer_size);

	page_data = (XTIndPageDataDPtr) (il_buffer + il_buffer_len);
	TRACK_BLOCK_TO_FLUSH(node_id);
	page_data->ild_data_type = XT_DT_INDEX_PAGE;
	XT_SET_DISK_4(page_data->ild_page_id_4, XT_NODE_ID(node_id));
	memcpy(page_data->ild_data, block->cb_data, block_len);

	il_buffer_len += offsetof(XTIndPageDataDRec, ild_data) + block_len;

#endif // IND_WRITE_MIN_DATA
	xt_spinlock_unlock(&il_write_lock);

	ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_FLUSHING);
	block->cb_state = IDX_CAC_BLOCK_LOGGED;

	TRACK_BLOCK_TO_FLUSH(node_id);
	return OK;

	failed:
	xt_spinlock_unlock(&il_write_lock);
	return FAILED;
}
 
5594
 
 
5595
xtBool XTIndexLog::il_write_header(struct XTOpenTable *ot, size_t head_size, xtWord1 *head_buf)
 
5596
{
 
5597
        XTIndHeadDataDPtr       head_data;
 
5598
 
 
5599
        if (!il_require_space(offsetof(XTIndHeadDataDRec, ilh_data) + head_size, ot->ot_thread))
 
5600
                return FAILED;
 
5601
 
 
5602
        head_data = (XTIndHeadDataDPtr) (il_buffer + il_buffer_len);
 
5603
        head_data->ilh_data_type = XT_DT_HEADER;
 
5604
        XT_SET_DISK_2(head_data->ilh_head_size_2, head_size);
 
5605
        memcpy(head_data->ilh_data, head_buf, head_size);
 
5606
 
 
5607
        il_buffer_len += offsetof(XTIndHeadDataDRec, ilh_data) + head_size;
 
5608
 
 
5609
        return OK;
 
5610
}
 
5611
 
 
5612
/*
 * Flush any buffered log data to the index log file and, if the log
 * end-of-file has moved, update the header with the new EOF and
 * table ID.
 *
 * The data is flushed to disk BEFORE the header is written, and the
 * header is flushed afterwards, so a recovered header never points
 * past data that was not fully written. The flushes are skipped for
 * temporary tables.
 *
 * Returns OK, or FAILED on any write/flush error.
 */
xtBool XTIndexLog::il_flush(struct XTOpenTable *ot)
{
	XTIndLogHeadDRec	log_head;
	xtTableID			tab_id = ot->ot_table->tab_id;

	/* Write out any data still held in the buffer: */
	if (il_buffer_len) {
		if (!xt_pwrite_file(il_of, il_buffer_offset, il_buffer_len, il_buffer, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
			return FAILED;
		il_buffer_offset += il_buffer_len;
		il_buffer_len = 0;
	}

	/* Only touch the header if the log actually grew: */
	if (il_log_eof != il_buffer_offset) {
		log_head.ilh_data_type = XT_DT_LOG_HEAD;
		XT_SET_DISK_4(log_head.ilh_tab_id_4, tab_id);
		XT_SET_DISK_4(log_head.ilh_log_eof_4, il_buffer_offset);

		/* Make the data durable before the header refers to it: */
		if (!XT_IS_TEMP_TABLE(ot->ot_table->tab_dic.dic_tab_flags)) {
			if (!xt_flush_file(il_of, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
				return FAILED;
		}

		if (!xt_pwrite_file(il_of, 0, sizeof(XTIndLogHeadDRec), (xtWord1 *) &log_head, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
			return FAILED;

		/* And make the header itself durable: */
		if (!XT_IS_TEMP_TABLE(ot->ot_table->tab_dic.dic_tab_flags)) {
			if (!xt_flush_file(il_of, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
				return FAILED;
		}

		il_tab_id = tab_id;
		il_log_eof = il_buffer_offset;
	}
	return OK;
}
 
5647
 
 
5648
#ifdef CHECK_IF_WRITE_WAS_OK
/*
 * Debug helper: byte-wise compare two buffers of length 'len' and
 * assert on the first difference, printing its offset and the two
 * differing byte values.
 */
static void check_buff(void *in_a, void *in_b, int len)
{
	xtWord1 *a = (xtWord1 *) in_a;
	xtWord1 *b = (xtWord1 *) in_b;
	int offset = 0;

	while (offset < len) {
		if (*a != *b) {
			/* Typo fixed in diagnostic: "Missmatch" -> "Mismatch" */
			printf("Mismatch at offset = %d %x != %x\n", offset, (int) *a, (int) *b);
			//xt_dump_trace();
			ASSERT_NS(FALSE);
		}
		offset++;
		a++;
		b++;
	}
}
#endif
 
5667
 
 
5668
xtBool XTIndexLog::il_apply_log_write(struct XTOpenTable *ot)
 
5669
{
 
5670
        XT_NODE_TEMP;
 
5671
        register XTTableHPtr    tab = ot->ot_table;
 
5672
        off_t                                   offset;
 
5673
        size_t                                  pos;
 
5674
        xtWord1                                 *buffer;
 
5675
        off_t                                   address;
 
5676
        xtIndexNodeID                   node_id;
 
5677
        size_t                                  req_size = 0;
 
5678
        XTIdxBranchDPtr                 node;
 
5679
        u_int                                   block_len;
 
5680
        u_int                                   block_offset;
 
5681
        xtWord1                                 *block_header;
 
5682
        xtWord1                                 *block_data;
 
5683
#ifdef CHECK_IF_WRITE_WAS_OK
 
5684
        XTIndReferenceRec               c_iref;
 
5685
        XTIdxBranchDPtr                 c_node;
 
5686
        u_int                                   c_block_len;
 
5687
#endif
 
5688
 
 
5689
        offset = 0;
 
5690
        while (offset < il_log_eof) {
 
5691
                if (offset < il_buffer_offset ||
 
5692
                        offset >= il_buffer_offset + (off_t) il_buffer_len) {
 
5693
                        il_buffer_len = il_buffer_size;
 
5694
                        if (il_log_eof - offset < (off_t) il_buffer_len)
 
5695
                                il_buffer_len = (size_t) (il_log_eof - offset);
 
5696
 
 
5697
                        /* Corrupt log?! */
 
5698
                        if (il_buffer_len < req_size) {
 
5699
                                xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_INDEX_LOG_CORRUPT, xt_file_path(il_of));
 
5700
                                xt_log_and_clear_exception_ns();
 
5701
                                return OK;
 
5702
                        }
 
5703
                        if (!xt_pread_file(il_of, offset, il_buffer_len, il_buffer_len, il_buffer, NULL, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
 
5704
                                return FAILED;
 
5705
                        il_buffer_offset = offset;
 
5706
                }
 
5707
                pos = (size_t) (offset - il_buffer_offset);
 
5708
                ASSERT_NS(pos < il_buffer_len);
 
5709
#ifdef CHECK_IF_WRITE_WAS_OK
 
5710
                node_id = 0;
 
5711
#endif
 
5712
                buffer = il_buffer + pos;
 
5713
                switch (*buffer) {
 
5714
                        case XT_DT_LOG_HEAD:
 
5715
                                req_size = sizeof(XTIndLogHeadDRec);
 
5716
                                if (il_buffer_len - pos < req_size) {
 
5717
                                        il_buffer_len = 0;
 
5718
                                        continue;
 
5719
                                }
 
5720
                                offset += req_size;
 
5721
                                req_size = 0;
 
5722
                                break;
 
5723
                        case XT_DT_SHORT_IND_PAGE: {
 
5724
                                XTIndShortPageDataDPtr          sp_data;
 
5725
 
 
5726
                                req_size = offsetof(XTIndShortPageDataDRec, ild_data);
 
5727
                                if (il_buffer_len - pos < req_size) {
 
5728
                                        il_buffer_len = 0;
 
5729
                                        continue;
 
5730
                                }
 
5731
 
 
5732
                                sp_data = (XTIndShortPageDataDPtr) buffer;
 
5733
                                node_id = XT_RET_NODE_ID(XT_GET_DISK_4(sp_data->ild_page_id_4));
 
5734
                                block_len = XT_GET_DISK_2(sp_data->ild_size_2);
 
5735
                                block_data = sp_data->ild_data;
 
5736
                                goto do_ind_page;
 
5737
                        }
 
5738
                        case XT_DT_INDEX_PAGE:
 
5739
                                XTIndPageDataDPtr       p_data;
 
5740
 
 
5741
                                req_size = offsetof(XTIndPageDataDRec, ild_data);
 
5742
                                if (il_buffer_len - pos < req_size + 2) {
 
5743
                                        il_buffer_len = 0;
 
5744
                                        continue;
 
5745
                                }
 
5746
 
 
5747
                                p_data = (XTIndPageDataDPtr) buffer;
 
5748
                                node_id = XT_RET_NODE_ID(XT_GET_DISK_4(p_data->ild_page_id_4));
 
5749
                                node = (XTIdxBranchDPtr) p_data->ild_data;
 
5750
                                block_len = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(node->tb_size_2));
 
5751
                                block_data = p_data->ild_data;
 
5752
 
 
5753
                                do_ind_page:
 
5754
                                if (block_len < 2 || block_len > XT_INDEX_PAGE_SIZE) {
 
5755
                                        xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, tab->tab_name);
 
5756
                                        return FAILED;
 
5757
                                }
 
5758
 
 
5759
                                req_size += block_len;
 
5760
#ifdef IND_FILL_BLOCK_TO_NEXT
 
5761
                                /* Make sure we have the start of the next block in the buffer: 
 
5762
                                 * Should always work because a XT_DT_INDEX_PAGE is never the last
 
5763
                                 * block.
 
5764
                                 */
 
5765
                                if (il_buffer_len - pos < req_size + offsetof(XTIndPageDataDRec, ild_data))
 
5766
#else
 
5767
                                if (il_buffer_len - pos < req_size)
 
5768
#endif
 
5769
                                {
 
5770
                                        il_buffer_len = 0;
 
5771
                                        continue;
 
5772
                                }
 
5773
 
 
5774
                                TRACK_BLOCK_FLUSH_N(node_id);
 
5775
                                address = xt_ind_node_to_offset(tab, node_id);
 
5776
#ifdef IND_WRITE_IN_BLOCK_SIZES
 
5777
                                /* {WRITE-IN-BLOCKS} Round up the block size. Space has been provided. */
 
5778
                                block_len = ((block_len + IND_WRITE_BLOCK_SIZE - 1) / IND_WRITE_BLOCK_SIZE) * IND_WRITE_BLOCK_SIZE;
 
5779
#endif
 
5780
                                IDX_TRACE("%d- W%x\n", (int) XT_NODE_ID(node_id), (int) XT_GET_DISK_2(block_data));
 
5781
#ifdef IND_FILL_BLOCK_TO_NEXT
 
5782
                                if (block_len < XT_INDEX_PAGE_SIZE) {
 
5783
                                        XTIndPageDataDPtr       next_page_data;
 
5784
 
 
5785
                                        next_page_data = (XTIndPageDataDPtr) (buffer + req_size);
 
5786
                                        if (next_page_data->ild_data_type == XT_DT_INDEX_PAGE) {
 
5787
                                                xtIndexNodeID next_node_id;
 
5788
 
 
5789
                                                next_node_id = XT_RET_NODE_ID(XT_GET_DISK_4(next_page_data->ild_page_id_4));
 
5790
                                                /* Write the whole page, if that means leaving no gaps! */
 
5791
                                                if (next_node_id == node_id+1)
 
5792
                                                        block_len = XT_INDEX_PAGE_SIZE;
 
5793
                                        }
 
5794
                                }
 
5795
#endif
 
5796
                                ASSERT_NS(block_len >= 2 && block_len <= XT_INDEX_PAGE_SIZE);
 
5797
                                if (!il_pwrite_file(ot, address, block_len, block_data))
 
5798
                                        return FAILED;
 
5799
 
 
5800
                                offset += req_size;
 
5801
                                req_size = 0;
 
5802
                                break;
 
5803
                        case XT_DT_SET_PAGE_HEAD: {
 
5804
                                XTIndSetPageHeadDataDPtr        sph_data;
 
5805
        
 
5806
                                req_size = sizeof(XTIndSetPageHeadDataDRec);
 
5807
                                if (il_buffer_len - pos < req_size) {
 
5808
                                        il_buffer_len = 0;
 
5809
                                        continue;
 
5810
                                }
 
5811
 
 
5812
                                sph_data = (XTIndSetPageHeadDataDPtr) buffer;
 
5813
                                node_id = XT_RET_NODE_ID(XT_GET_DISK_4(sph_data->ild_page_id_4));
 
5814
                                block_offset = 0;
 
5815
                                block_len = 0;
 
5816
                                block_header = sph_data->ild_page_head_2;
 
5817
                                block_data = NULL;
 
5818
                                goto do_mod_page;
 
5819
                        }
 
5820
                        case XT_DT_MOD_IND_PAGE_HEAD:
 
5821
                        case XT_DT_MOD_IND_PAGE_HEAD_EOB: {
 
5822
                                XTIndModPageHeadDataDPtr        mph_data;
 
5823
        
 
5824
                                req_size = offsetof(XTIndModPageHeadDataDRec, ild_data);
 
5825
                                if (il_buffer_len - pos < req_size) {
 
5826
                                        il_buffer_len = 0;
 
5827
                                        continue;
 
5828
                                }
 
5829
 
 
5830
                                mph_data = (XTIndModPageHeadDataDPtr) buffer;
 
5831
                                node_id = XT_RET_NODE_ID(XT_GET_DISK_4(mph_data->ild_page_id_4));
 
5832
                                block_offset = XT_GET_DISK_2(mph_data->ild_offset_2);
 
5833
                                block_len = XT_GET_DISK_2(mph_data->ild_size_2);
 
5834
                                block_header = mph_data->ild_page_head_2;
 
5835
                                block_data = mph_data->ild_data;
 
5836
                                goto do_mod_page;
 
5837
                        }
 
5838
                        case XT_DT_MOD_IND_PAGE:
 
5839
                        case XT_DT_MOD_IND_PAGE_EOB:
 
5840
                                XTIndModPageDataDPtr            mp_data;
 
5841
 
 
5842
                                req_size = offsetof(XTIndModPageDataDRec, ild_data);
 
5843
                                if (il_buffer_len - pos < req_size) {
 
5844
                                        il_buffer_len = 0;
 
5845
                                        continue;
 
5846
                                }
 
5847
 
 
5848
                                mp_data = (XTIndModPageDataDPtr) buffer;
 
5849
                                node_id = XT_RET_NODE_ID(XT_GET_DISK_4(mp_data->ild_page_id_4));
 
5850
                                block_offset = XT_GET_DISK_2(mp_data->ild_offset_2);
 
5851
                                block_len = XT_GET_DISK_2(mp_data->ild_size_2);
 
5852
                                block_header = NULL;
 
5853
                                block_data = mp_data->ild_data;
 
5854
 
 
5855
                                do_mod_page:
 
5856
                                if (block_offset + block_len > XT_INDEX_PAGE_DATA_SIZE) {
 
5857
                                        xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, tab->tab_name);
 
5858
                                        return FAILED;
 
5859
                                }
 
5860
 
 
5861
                                req_size += block_len;
 
5862
                                if (il_buffer_len - pos < req_size) {
 
5863
                                        il_buffer_len = 0;
 
5864
                                        continue;
 
5865
                                }
 
5866
 
 
5867
                                TRACK_BLOCK_FLUSH_N(node_id);
 
5868
                                address = xt_ind_node_to_offset(tab, node_id);
 
5869
                                /* {WRITE-IN-BLOCKS} Round up the block size. Space has been provided. */
 
5870
                                IDX_TRACE("%d- W%x\n", (int) XT_NODE_ID(node_id), (int) XT_GET_DISK_2(block_data));
 
5871
                                if (block_header) {
 
5872
                                        if (!il_pwrite_file(ot, address, 2, block_header))
 
5873
                                                return FAILED;
 
5874
                                }
 
5875
 
 
5876
                                if (block_data) {
 
5877
#ifdef IND_WRITE_IN_BLOCK_SIZES
 
5878
                                        if (*buffer == XT_DT_MOD_IND_PAGE_HEAD_EOB || *buffer == XT_DT_MOD_IND_PAGE_EOB)
 
5879
                                                block_len = ((block_len + IND_WRITE_BLOCK_SIZE - 1) / IND_WRITE_BLOCK_SIZE) * IND_WRITE_BLOCK_SIZE;
 
5880
#endif
 
5881
                                        if (!il_pwrite_file(ot, address + XT_INDEX_PAGE_HEAD_SIZE + block_offset, block_len, block_data))
 
5882
                                                return FAILED;
 
5883
                                }
 
5884
 
 
5885
                                offset += req_size;
 
5886
                                req_size = 0;
 
5887
                                break;
 
5888
                        case XT_DT_FREE_LIST:
 
5889
                                xtWord4 block, nblock;
 
5890
                                union {
 
5891
                                        xtWord1                         buffer[IND_WRITE_BLOCK_SIZE];
 
5892
                                        XTIndFreeBlockRec       free_block;
 
5893
                                } x;
 
5894
                                off_t   aoff;
 
5895
 
 
5896
                                memset(x.buffer, 0, sizeof(XTIndFreeBlockRec));
 
5897
 
 
5898
                                pos++;
 
5899
                                offset++;
 
5900
                                
 
5901
                                for (;;) {
 
5902
                                        req_size = 8;
 
5903
                                        if (il_buffer_len - pos < req_size) {
 
5904
                                                il_buffer_len = il_buffer_size;
 
5905
                                                if (il_log_eof - offset < (off_t) il_buffer_len)
 
5906
                                                        il_buffer_len = (size_t) (il_log_eof - offset);
 
5907
                                                /* Corrupt log?! */
 
5908
                                                if (il_buffer_len < req_size) {
 
5909
                                                        xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_INDEX_LOG_CORRUPT, xt_file_path(il_of));
 
5910
                                                        xt_log_and_clear_exception_ns();
 
5911
                                                        return OK;
 
5912
                                                }
 
5913
                                                if (!xt_pread_file(il_of, offset, il_buffer_len, il_buffer_len, il_buffer, NULL, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
 
5914
                                                        return FAILED;
 
5915
                                                pos = 0;
 
5916
                                        }
 
5917
                                        block = XT_GET_DISK_4(il_buffer + pos);
 
5918
                                        nblock = XT_GET_DISK_4(il_buffer + pos + 4);
 
5919
                                        if (nblock == 0xFFFFFFFF)
 
5920
                                                break;
 
5921
                                        aoff = xt_ind_node_to_offset(tab, XT_RET_NODE_ID(block));
 
5922
                                        XT_SET_DISK_8(x.free_block.if_next_block_8, nblock);
 
5923
                                        IDX_TRACE("%d- *%x\n", (int) block, (int) XT_GET_DISK_2(x.buffer));
 
5924
                                        if (!il_pwrite_file(ot, aoff, IND_WRITE_BLOCK_SIZE, x.buffer))
 
5925
                                                return FAILED;
 
5926
                                        pos += 4;
 
5927
                                        offset += 4;
 
5928
                                }
 
5929
 
 
5930
                                offset += 8;
 
5931
                                req_size = 0;
 
5932
                                break;
 
5933
                        case XT_DT_HEADER:
 
5934
                                XTIndHeadDataDPtr       head_data;
 
5935
                                size_t                          len;
 
5936
 
 
5937
                                req_size = offsetof(XTIndHeadDataDRec, ilh_data);
 
5938
                                if (il_buffer_len - pos < req_size) {
 
5939
                                        il_buffer_len = 0;
 
5940
                                        continue;
 
5941
                                }
 
5942
                                head_data = (XTIndHeadDataDPtr) buffer;
 
5943
                                len = XT_GET_DISK_2(head_data->ilh_head_size_2);
 
5944
 
 
5945
                                req_size = offsetof(XTIndHeadDataDRec, ilh_data) + len;
 
5946
                                if (il_buffer_len - pos < req_size) {
 
5947
                                        il_buffer_len = 0;
 
5948
                                        continue;
 
5949
                                }
 
5950
 
 
5951
                                if (!il_pwrite_file(ot, 0, len, head_data->ilh_data))
 
5952
                                        return FAILED;
 
5953
 
 
5954
                                offset += req_size;
 
5955
                                req_size = 0;
 
5956
                                break;
 
5957
                        case XT_DT_2_MOD_IND_PAGE:
 
5958
                        case XT_DT_2_MOD_IND_PAGE_EOB:
 
5959
                                XTIndDoubleModPageDataDPtr      dd_data;
 
5960
                                u_int                                           block_len2;
 
5961
 
 
5962
                                req_size = offsetof(XTIndDoubleModPageDataDRec, dld_data);
 
5963
                                if (il_buffer_len - pos < req_size) {
 
5964
                                        il_buffer_len = 0;
 
5965
                                        continue;
 
5966
                                }
 
5967
 
 
5968
                                dd_data = (XTIndDoubleModPageDataDPtr) buffer;
 
5969
                                node_id = XT_RET_NODE_ID(XT_GET_DISK_4(dd_data->dld_page_id_4));
 
5970
                                block_len = XT_GET_DISK_2(dd_data->dld_size1_2);
 
5971
                                block_offset = XT_GET_DISK_2(dd_data->dld_offset2_2);
 
5972
                                block_len2 = XT_GET_DISK_2(dd_data->dld_size2_2);
 
5973
                                block_data = dd_data->dld_data;
 
5974
 
 
5975
                                req_size += block_len + block_len2;
 
5976
                                if (il_buffer_len - pos < req_size)
 
5977
                                {
 
5978
                                        il_buffer_len = 0;
 
5979
                                        continue;
 
5980
                                }
 
5981
 
 
5982
                                TRACK_BLOCK_FLUSH_N(node_id);
 
5983
                                address = xt_ind_node_to_offset(tab, node_id);
 
5984
                                IDX_TRACE("%d- W%x\n", (int) XT_NODE_ID(node_id), (int) XT_GET_DISK_2(block_data));
 
5985
                                if (!il_pwrite_file(ot, address, block_len, block_data))
 
5986
                                        return FAILED;
 
5987
 
 
5988
#ifdef IND_WRITE_IN_BLOCK_SIZES
 
5989
                                if (*buffer == XT_DT_2_MOD_IND_PAGE_EOB)
 
5990
                                        block_len2 = ((block_len2 + IND_WRITE_BLOCK_SIZE - 1) / IND_WRITE_BLOCK_SIZE) * IND_WRITE_BLOCK_SIZE;
 
5991
#endif
 
5992
                                if (!il_pwrite_file(ot, address + XT_INDEX_PAGE_HEAD_SIZE + block_offset, block_len2, block_data + block_len))
 
5993
                                        return FAILED;
 
5994
 
 
5995
                                offset += req_size;
 
5996
                                req_size = 0;
 
5997
                                break;
 
5998
                        default:
 
5999
                                xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_INDEX_LOG_CORRUPT, xt_file_path(il_of));
 
6000
                                xt_log_and_clear_exception_ns();
 
6001
                                return OK;
 
6002
                }
 
6003
#ifdef CHECK_IF_WRITE_WAS_OK
 
6004
                if (node_id) {
 
6005
                        if (!xt_ind_get(ot, node_id, &c_iref))
 
6006
                                ASSERT_NS(FALSE);
 
6007
                        if (c_iref.ir_block) {
 
6008
                                c_node = (XTIdxBranchDPtr) c_iref.ir_block->cb_data;
 
6009
                                c_block_len = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(c_node->tb_size_2));
 
6010
 
 
6011
                                if (!xt_pread_file(ot->ot_ind_file, address, XT_INDEX_PAGE_SIZE, 0, &ot->ot_ind_tmp_buf, NULL, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
 
6012
                                        ASSERT_NS(FALSE);
 
6013
                                if (c_iref.ir_block->cb_min_pos == 0xFFFF)
 
6014
                                        check_buff(&ot->ot_ind_tmp_buf, c_node, c_block_len);
 
6015
                                else {
 
6016
                                        if (!c_iref.ir_block->cb_header)
 
6017
                                                check_buff(&ot->ot_ind_tmp_buf, c_node, 2);
 
6018
                                        check_buff(ot->ot_ind_tmp_buf.tb_data, c_node->tb_data, c_iref.ir_block->cb_min_pos);
 
6019
                                        check_buff(ot->ot_ind_tmp_buf.tb_data + c_iref.ir_block->cb_max_pos,
 
6020
                                                c_node->tb_data + c_iref.ir_block->cb_max_pos,
 
6021
                                                c_block_len - XT_INDEX_PAGE_HEAD_SIZE - c_iref.ir_block->cb_max_pos);
 
6022
                                }
 
6023
                                xt_ind_release(ot, NULL, XT_UNLOCK_WRITE, &c_iref);
 
6024
                        }
 
6025
                }
 
6026
#endif
 
6027
                if (il_bytes_written >= IND_FLUSH_THRESHOLD) {
 
6028
                        if (!il_flush_file(ot))
 
6029
                                return FAILED;
 
6030
                }
 
6031
        }
 
6032
        return OK;
 
6033
}
 
6034
 
 
6035
xtBool XTIndexLog::il_apply_log_flush(struct XTOpenTable *ot)
 
6036
{
 
6037
        register XTTableHPtr    tab = ot->ot_table;
 
6038
        XTIndLogHeadDRec                log_head;
 
6039
 
 
6040
#ifdef PRINT_IND_FLUSH_STATS
 
6041
        xtWord8                                 b_flush_time = ot->ot_thread->st_statistics.st_ind.ts_flush_time;
 
6042
#endif
 
6043
        if (!il_flush_file(ot))
 
6044
                return FAILED;
 
6045
#ifdef PRINT_IND_FLUSH_STATS
 
6046
        char    buf1[30];
 
6047
        char    buf2[30];
 
6048
        char    buf3[30];
 
6049
 
 
6050
        double time;
 
6051
        double kb;
 
6052
 
 
6053
        ot->ot_table->tab_ind_flush_time += ot->ot_thread->st_statistics.st_ind.ts_flush_time - b_flush_time;
 
6054
        ot->ot_table->tab_ind_flush++;
 
6055
 
 
6056
        time = (double) ot->ot_table->tab_ind_flush_time / (double) 1000000 / (double) ot->ot_table->tab_ind_flush;
 
6057
        kb = (double) ot->ot_table->tab_ind_write / (double) ot->ot_table->tab_ind_flush / (double) 1024;
 
6058
        printf("TIME: %s      Kbytes: %s      Mps: %s      Flush Count: %d\n", 
 
6059
                idx_format(buf1, time),
 
6060
                idx_format(buf2, kb),
 
6061
                idx_format_mb(buf3, kb / time),
 
6062
                (int) ot->ot_table->tab_ind_flush
 
6063
                );
 
6064
#endif
 
6065
 
 
6066
        log_head.ilh_data_type = XT_DT_LOG_HEAD;
 
6067
        XT_SET_DISK_4(log_head.ilh_tab_id_4, il_tab_id);
 
6068
        XT_SET_DISK_4(log_head.ilh_log_eof_4, 0);
 
6069
 
 
6070
        if (!xt_pwrite_file(il_of, 0, sizeof(XTIndLogHeadDRec), (xtWord1 *) &log_head, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
 
6071
                return FAILED;
 
6072
 
 
6073
        if (!XT_IS_TEMP_TABLE(tab->tab_dic.dic_tab_flags)) {
 
6074
                if (!xt_flush_file(il_of, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
 
6075
                        return FAILED;
 
6076
        }
 
6077
        return OK;
 
6078
}
 
6079
 
 
6080
inline xtBool XTIndexLog::il_pwrite_file(struct XTOpenTable *ot, off_t offs, size_t siz, void *dat)
 
6081
{
 
6082
#ifdef IND_WRITE_IN_BLOCK_SIZES
 
6083
        ASSERT_NS(((offs) % IND_WRITE_BLOCK_SIZE) == 0);
 
6084
        ASSERT_NS(((siz) % IND_WRITE_BLOCK_SIZE) == 0);
 
6085
#endif
 
6086
        il_bytes_written += siz;
 
6087
#ifdef PRINT_IND_FLUSH_STATS
 
6088
        xtBool ok;
 
6089
 
 
6090
        u_int   b_write = ot->ot_thread->st_statistics.st_ind.ts_write;
 
6091
        ok = xt_pwrite_file(ot->ot_ind_file, offs, siz, dat, &ot->ot_thread->st_statistics.st_ind, ot->ot_thread);
 
6092
        ot->ot_table->tab_ind_write += ot->ot_thread->st_statistics.st_ind.ts_write - b_write;
 
6093
        return ok;
 
6094
#else
 
6095
        return xt_pwrite_file(ot->ot_ind_file, offs, siz, dat, &ot->ot_thread->st_statistics.st_ind, ot->ot_thread);
 
6096
#endif
 
6097
}
 
6098
 
 
6099
inline xtBool XTIndexLog::il_flush_file(struct XTOpenTable *ot)
 
6100
{
 
6101
        xtBool ok = TRUE;
 
6102
 
 
6103
        il_bytes_written = 0;
 
6104
        if (!XT_IS_TEMP_TABLE(ot->ot_table->tab_dic.dic_tab_flags)) {
 
6105
                ok = xt_flush_file(ot->ot_ind_file, &ot->ot_thread->st_statistics.st_ind, ot->ot_thread);
 
6106
        }
 
6107
        return ok;
 
6108
}
 
6109
 
 
6110
xtBool XTIndexLog::il_open_table(struct XTOpenTable **ot)
 
6111
{
 
6112
        return xt_db_open_pool_table_ns(ot, il_pool->ilp_db, il_tab_id);
 
6113
}
 
6114
 
 
6115
void XTIndexLog::il_close_table(struct XTOpenTable *ot)
 
6116
{
 
6117
        xt_db_return_table_to_pool_ns(ot);
 
6118
}
 
6119
 
 
6120