~drizzle-trunk/drizzle/development

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
/* Copyright (C) 2000-2002 MySQL AB
   Copyright (C) 2008 eBay, Inc

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

/* Implements various base dataspace-related functions - allocate, free, clear */

#include "heap_priv.h"

#include <cassert>


/*
  MySQL Heap tables keep data in arrays of fixed-size chunks.
  These chunks are organized into two groups of HP_BLOCK structures:
    - group1 contains indexes, with one HP_BLOCK per key
      (part of HP_KEYDEF)
    - group2 contains record data, with single HP_BLOCK
      for all records, referenced by HP_SHARE.recordspace.block

  While columns used in index are usually small, other columns
  in the table may need to accommodate larger data. Typically,
  larger data is placed into VARCHAR or BLOB columns. With actual
  sizes varying, Heap Engine has to support variable-sized records
  in memory. Heap Engine implements the concept of dataspace
  (HP_DATASPACE), which incorporates HP_BLOCK for the record data,
  and adds more information for managing variable-sized records.

  Variable-size records are stored in multiple "chunks",
  which means that a single record of data (database "row") can
  consist of multiple chunks organized into one "set". HP_BLOCK
  contains chunks. In variable-size format, one record
  is represented as one or many chunks, depending on the actual
  data, while in fixed-size mode, one record is always represented
  as one chunk. The index structures would always point to the first
  chunk in the chunkset.

  At the time of table creation, Heap Engine attempts to find out
  if variable-size records are desired. A user can request
  variable-size records by providing either row_type=dynamic or
  block_size=NNN table create option. Heap Engine will check
  whether block_size provides enough space in the first chunk
  to keep all null bits and columns that are used in indexes.
  If block_size is too small, table creation will be aborted
  with an error. Heap Engine will revert to fixed-size allocation
  mode if block_size provides no memory benefits (no VARCHAR
  fields extending past first chunk).

  In order to improve index search performance, Heap Engine needs
  to keep all null flags and all columns used as keys inside
  the first chunk of a chunkset. In particular, this means that
  all columns used as keys should be defined first in the table
  creation SQL. The length of data used by null bits and key columns
  is stored as fixed_data_length inside HP_SHARE. fixed_data_length
  will extend past last key column if more fixed-length fields can
  fit into the first chunk.

  Variable-size records are necessary only in the presence
  of variable-size columns. Heap Engine will be looking for VARCHAR
  columns, which declare length of 32 or more. If no such columns
  are found, table will be switched to fixed-size format. You should
  always try to put such columns at the end of the table definition.

  Whenever data is being inserted or updated in the table
  Heap Engine will calculate how many chunks are necessary.
  For insert operations, Heap Engine allocates new chunkset in
  the recordspace. For update operations it will modify length of
  the existing chunkset, unlinking unnecessary chunks at the end,
  or allocating and adding more if larger length is necessary.

  When writing data to chunks or copying data back to record,
  Heap Engine will first copy fixed_data_length of data using single
  memcpy call. The rest of the columns are processed one-by-one.
  Non-VARCHAR columns are copied in their full format. VARCHAR's
  are copied based on their actual length. Any NULL values after
  fixed_data_length are skipped.

  The allocation and contents of the actual chunks varies between
  fixed and variable-size modes. Total chunk length is always
  aligned to the next sizeof(unsigned char*). Here is the format of
  fixed-size chunk:
      unsigned char[] - sizeof=chunk_dataspace_length, but at least
               sizeof(unsigned char*) bytes. Keeps actual data or pointer
               to the next deleted chunk.
               chunk_dataspace_length equals the full record length
      unsigned char   - status field (1 means "in use", 0 means "deleted")
  Variable-size uses different format:
      unsigned char[] - sizeof=chunk_dataspace_length, but at least
               sizeof(unsigned char*) bytes. Keeps actual data or pointer
               to the next deleted chunk.
               chunk_dataspace_length is set according to table
               setup (block_size)
      unsigned char*  - pointer to the next chunk in this chunkset,
               or NULL for the last chunk
      unsigned char  -  status field (1 means "first", 0 means "deleted",
               2 means "linked")

  When allocating a new chunkset of N chunks, Heap Engine will try
  to allocate chunks one-by-one, linking them as they become
  allocated. Allocation of a single chunk will attempt to reuse
  a deleted (freed) chunk. If no free chunks are available,
  it will attempt to allocate a new area inside HP_BLOCK.
  Freeing chunks will place them at the front of free list
  referenced by del_link in HP_DATASPACE. The newly freed chunk
  will contain reference to the previously freed chunk in its first
  sizeof(unsigned char*) of the payload space.

  Here are the open issues:
    1. It is not very nice to require people to keep key columns
       at the beginning of the table creation SQL. There are three
       proposed resolutions:
       a. Leave it as is. It's a reasonable limitation
       b. Add new HA_KEEP_KEY_COLUMNS_TO_FRONT flag to handler.h and
          make table.cpp align columns when it creates the table
       c. Make HeapEngine reorder columns in the chunk data, so that
          key columns go first. Add parallel HA_KEYSEG structures
          to distinguish positions in record vs. positions in
          the first chunk. Copy all data field-by-field rather than
          using single memcpy unless DBA kept key columns to
          the beginning.
    2. heap_check_heap needs to verify linked chunks, looking for
       issues such as orphans, cycles, and bad links. However,
       Heap Engine today does not do similar things even for
       free list.
    3. With new HP_DATASPACE allocation mechanism, BLOB will become
       increasingly simple to implement, but I may not have time
       for that. In one approach, BLOB data can be placed at
       the end of the same record. In another approach (which I
       prefer) BLOB data would have its own HP_DATASPACE with
       variable-size entries.
    4. In a more sophisticated implementation, some space can
       be saved even with all fixed-size columns if many of them
       have NULL value, as long as these columns are not used
       in indexes
    5. In variable-size format status should be moved to lower
       bits of the "next" pointer. Pointer is always aligned
       to sizeof(unsigned char*), which is at least 4, leaving 2 lower
       bits free. This will save 8 bytes per chunk
       on 64-bit platform.
    6. As we do not want to modify FRM format, BLOCK_SIZE option
       of "CREATE TABLE" is saved as "RAID_CHUNKSIZE" for
       Heap Engine tables.
*/

/* Forward declaration: the definition appears further down in this file,
   but the chunkset allocators that precede it need to call it. */
static unsigned char *hp_allocate_one_chunk(HP_DATASPACE *info);


/**
  Reset a dataspace to its empty state

  Releases the block levels (when any exist) through hp_free_level()
  and then zeroes the chunk counters, the deleted-chunk list head and
  the accumulated data length.

  @param  info  the dataspace to reset
*/

void hp_clear_dataspace(HP_DATASPACE *info)
{
  /* Only walk the block tree when at least one level is present */
  if (info->block.levels != 0)
    hp_free_level(&info->block, info->block.levels, info->block.root,
                  (unsigned char*) 0);

  /* Zero every piece of bookkeeping kept by the dataspace */
  info->block.levels= 0;
  info->chunk_count= 0;
  info->del_chunk_count= 0;
  info->del_link= 0;
  info->total_data_length= 0;
}


/**
  Build a new variable-size chunkset or resize an existing one

  With a NULL existing_set a fresh chain of chunk_count chunks is
  allocated. With a non-NULL existing_set the chain is walked: surplus
  chunks past chunk_count are unlinked and handed to hp_free_chunks(),
  while a chain that is too short is extended with newly allocated chunks.
  The first chunk of an existing set is never replaced.

  If a chunk cannot be obtained, everything this call appended (or the
  whole partially built new set) is released again before returning.

  @param  info            the hosting dataspace
  @param  chunk_count     the number of chunks the resulting set must hold
  @param  existing_set    head of the set to resize, or NULL to create a new set

  @return  Pointer to the first chunk of the resulting set, or NULL on failure
*/

static unsigned char *hp_allocate_variable_chunkset(HP_DATASPACE *info,
                                           uint32_t chunk_count, unsigned char* existing_set)
{
  int remaining= chunk_count;            /* chunks still to be accounted for */
  unsigned char *head= existing_set;     /* first chunk of the resulting set */
  unsigned char *tail= NULL;             /* most recently visited/linked chunk */
  unsigned char *old_tail= NULL;         /* last chunk of the pre-existing set */
  unsigned char *chunk;
  unsigned char **link;

  assert(remaining);

  if (existing_set != NULL)
  {
    /* Consume one requested chunk for every link already in the chain */
    for (chunk= existing_set; chunk != NULL && remaining != 0; remaining--)
    {
      tail= chunk;
      chunk= *((unsigned char**)(chunk + info->offset_link));
    }

    if (remaining == 0)
    {
      /* The chain is long enough; cut off and release whatever follows */
      if (chunk != NULL)
      {
        link= (unsigned char**)(tail + info->offset_link);
        *link= NULL;
        hp_free_chunks(info, chunk);
      }

      return head;
    }

    /* The chain is too short: remember where the appended part begins */
    old_tail= tail;
  }

  /* From here on we are either creating a set or growing an existing one */
  for (; remaining > 0; remaining--)
  {
    chunk= hp_allocate_one_chunk(info);
    if (chunk == NULL)
    {
      /* Allocation failed: undo the work done by this call */
      if (old_tail != NULL)
      {
        /* Detach and release only the chunks appended after the old tail */
        link= (unsigned char**)(old_tail + info->offset_link);
        chunk= *link;
        *link= NULL;
        hp_free_chunks(info, chunk);
      }
      else if (head != NULL)
      {
        /* A brand new set was partially built; release all of it */
        hp_free_chunks(info, head);
      }

      return NULL;
    }

    /* A freshly obtained chunk always starts out as the end of the chain */
    link= (unsigned char**)(chunk + info->offset_link);
    *link= NULL;

    if (tail != NULL)
    {
      /* Hook it behind the previous chunk and flag it as a continuation */
      *((unsigned char**)(tail + info->offset_link))= chunk;
      chunk[info->offset_status]= CHUNK_STATUS_LINKED;
    }
    else
    {
      /* No predecessor: this chunk is the head of a new set */
      chunk[info->offset_status]= CHUNK_STATUS_ACTIVE;
    }

    if (head == NULL)
      head= chunk;

    tail= chunk;
  }

  return head;
}


/**
  Allocate a new chunkset in the dataspace

  In variable-size mode the request is forwarded to
  hp_allocate_variable_chunkset() with no existing set. In fixed-size
  mode chunk_count is not consulted: exactly one chunk is allocated and
  marked CHUNK_STATUS_ACTIVE.

  @param  info            the hosting dataspace
  @param  chunk_count     the number of chunks wanted (variable-size mode only)

  @return  Pointer to the first chunk in the new chunkset, or NULL if unsuccessful
*/

unsigned char *hp_allocate_chunkset(HP_DATASPACE *info, uint32_t chunk_count)
{
  unsigned char *chunk;

  if (info->is_variable_size)
    return hp_allocate_variable_chunkset(info, chunk_count, NULL);

  /* Fixed-size records always occupy a single chunk */
  chunk= hp_allocate_one_chunk(info);
  if (chunk != NULL)
    chunk[info->offset_status]= CHUNK_STATUS_ACTIVE;

  return chunk;
}


/**
  Resize an existing chunkset in the dataspace

  Only meaningful in variable-size mode, where the work is delegated to
  hp_allocate_variable_chunkset(); the first chunk of the set stays in place.
  In fixed-size mode the call is rejected.

  @param  info            the hosting dataspace
  @param  chunk_count     the number of chunks the set should have afterwards
  @param  pos             pointer to the first chunk of the existing chunkset

  @return  0 on success; HA_ERR_WRONG_COMMAND (also stored in errno) in
           fixed-size mode; otherwise the value of errno after a failed resize
*/

int hp_reallocate_chunkset(HP_DATASPACE *info, uint32_t chunk_count, unsigned char* pos)
{
  if (info->is_variable_size)
  {
    /* The head chunk is reused, so only success/failure matters here */
    if (hp_allocate_variable_chunkset(info, chunk_count, pos) != NULL)
      return 0;

    return errno;
  }

  /* An update must never alter the chunk count of a fixed-size record */
  errno= HA_ERR_WRONG_COMMAND;
  return errno;
}


/**
  Obtain a single chunk from the dataspace

  Prefers the head of the deleted-chunk list; when that list is empty a
  never-used slot is taken from the most recent block, asking
  hp_get_new_block() for another block whenever chunk_count is a multiple
  of records_in_block. The status byte and link field of the returned
  chunk are left for the caller to set.

  @param  info            the hosting dataspace

  @return  Pointer to the chunk, or NULL if a new block could not be obtained
*/

static unsigned char *hp_allocate_one_chunk(HP_DATASPACE *info)
{
  unsigned char *recycled= info->del_link;

  if (recycled != NULL)
  {
    /* Pop the free list: a deleted chunk stores the next free one at its start */
    info->del_link= *((unsigned char**) recycled);
    info->del_chunk_count--;

    return recycled;
  }

  /* Index of the next unused slot within the current block */
  size_t slot= (info->chunk_count % info->block.records_in_block);
  if (slot == 0)
  {
    size_t added_length;

    /* Slot 0 means the next chunk starts a new block; bail out if none can be had */
    if (hp_get_new_block(&info->block, &added_length))
      return NULL;

    info->total_data_length+= added_length;
  }

  info->chunk_count++;

  return ((unsigned char*) info->block.level_info[0].last_blocks +
    slot * info->block.recbuffer);
}


/**
  Return a chain of chunks to the deleted list

  Each visited chunk is pushed onto the front of the deleted-chunk list
  and marked CHUNK_STATUS_DELETED. In variable-size mode the chain is
  followed through the link field until it ends; in fixed-size mode only
  the chunk at pos is released.

  @param  info            the hosting dataspace
  @param  pos             first chunk to release (whole chunkset or its tail);
                          NULL makes the call a no-op
*/

void hp_free_chunks(HP_DATASPACE *info, unsigned char *pos)
{
  unsigned char *next;

  for (unsigned char *chunk= pos; chunk != NULL; chunk= next)
  {
    /* Push onto the free list, reusing the chunk's first bytes as the list link */
    *((unsigned char**) chunk)= info->del_link;
    info->del_link= chunk;
    info->del_chunk_count++;

    chunk[info->offset_status]= CHUNK_STATUS_DELETED;

    /* Follow the chain only in variable-size mode; otherwise stop after one chunk */
    if (info->is_variable_size)
      next= *((unsigned char**)(chunk + info->offset_link));
    else
      next= NULL;
  }
}