~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000-2006 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
14
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
15
16
/**
17
  @file handler.cc
18
19
  Handler-calling-functions
20
*/
21
22
#ifdef USE_PRAGMA_IMPLEMENTATION
23
#pragma implementation				// gcc: Class implementation
24
#endif
25
26
#include "mysql_priv.h"
27
#include "rpl_filter.h"
28
#include <myisampack.h>
29
#include <errno.h>
30
31
/*
32
  While we have legacy_db_type, we have this array to
33
  check for dups and to find handlerton from legacy_db_type.
34
  Remove when legacy_db_type is finally gone
35
*/
36
st_plugin_int *hton2plugin[MAX_HA];
37
38
/*
  Installed handlertons, indexed by legacy_db_type value (see
  ha_resolve_by_legacy_type() and ha_initialize_handlerton()).
  A NULL entry means no engine currently owns that type code.
*/
static handlerton *installed_htons[128];
39
40
/* 128 bits expressed in bytes. */
#define BITMAP_STACKBUF_SIZE (128/8)
41
42
/*
  Default key creation settings: algorithm HA_KEY_ALG_UNDEF, everything else
  zero / empty strings.
  NOTE(review): the meaning of the individual members comes from the
  KEY_CREATE_INFO declaration, which is not visible here -- confirm there.
*/
KEY_CREATE_INFO default_key_create_info= { HA_KEY_ALG_UNDEF, 0, {NullS,0}, {NullS,0} };
43
44
/* number of storage engine slots handed out so far (entries used in hton2plugin[]) */
45
ulong total_ha= 0;
46
/* number of installed storage engines that support 2pc (provide handlerton::prepare) */
47
ulong total_ha_2pc= 0;
48
/* size of savepoint storage area (see ha_init) */
49
ulong savepoint_alloc_size= 0;
50
51
/*
  Historical storage engine names and the names they stand for.
  Entries come in pairs -- { alias }, { real name } -- and the table is
  terminated by a {NullS, 0} entry; ha_resolve_by_name() walks it two
  entries at a time and retries the lookup with the second entry of a
  matching pair.
*/
static const LEX_STRING sys_table_aliases[]=
52
{
53
  { C_STRING_WITH_LEN("INNOBASE") },  { C_STRING_WITH_LEN("INNODB") },
54
  { C_STRING_WITH_LEN("HEAP") },      { C_STRING_WITH_LEN("MEMORY") },
55
  {NullS, 0}
56
};
57
58
/*
  Printable row format names.
  NOTE(review): presumably indexed by the row type enumeration, with the
  trailing "?" entries acting as placeholders -- confirm against the enum.
*/
const char *ha_row_type[] = {
59
  "", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT", "PAGE", "?","?","?"
60
};
61
62
/* Names of the transaction isolation levels; the list ends with NullS. */
const char *tx_isolation_names[] =
63
{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
64
  NullS};
65
/*
  TYPELIB wrapper around tx_isolation_names. The element count passed in
  leaves out the NullS terminator (array_elements() - 1).
*/
TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
66
			       tx_isolation_names, NULL};
67
68
/*
  Typelib named "known_exts"; it starts out empty (zero count, no names).
  NOTE(review): presumably filled in elsewhere in this file -- the code that
  populates it is not visible in this part of the source.
*/
static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
69
/* NOTE(review): purpose not visible here; presumably versions known_extensions -- confirm. */
uint known_extensions_id= 0;
70
71
72
73
/**
  Pick the storage engine plugin that is the default for a thread.

  @param thd         current thread

  @return
    the thread's own table_plugin when it is set, otherwise the result of
    my_plugin_lock() on the global table_plugin
*/
static plugin_ref ha_default_plugin(THD *thd)
{
  plugin_ref session_plugin= thd->variables.table_plugin;

  return session_plugin
         ? session_plugin
         : my_plugin_lock(thd, &global_system_variables.table_plugin);
}
79
80
81
/**
82
  Return the default storage engine handlerton for thread
83
84
  @param ha_default_handlerton(thd)
85
  @param thd         current thread
86
87
  @return
88
    pointer to handlerton
89
*/
90
handlerton *ha_default_handlerton(THD *thd)
91
{
92
  plugin_ref plugin= ha_default_plugin(thd);
93
  DBUG_ASSERT(plugin);
94
  handlerton *hton= plugin_data(plugin, handlerton*);
95
  DBUG_ASSERT(hton);
96
  return hton;
97
}
98
99
100
/**
101
  Return the storage engine handlerton for the supplied name
102
  
103
  @param thd         current thread
104
  @param name        name of storage engine
105
  
106
  @return
107
    pointer to storage engine plugin handle
108
*/
109
plugin_ref ha_resolve_by_name(THD *thd, const LEX_STRING *name)
110
{
111
  const LEX_STRING *table_alias;
112
  plugin_ref plugin;
113
114
redo:
115
  /* my_strnncoll is a macro and gcc doesn't do early expansion of macro */
116
  if (thd && !my_charset_latin1.coll->strnncoll(&my_charset_latin1,
117
                           (const uchar *)name->str, name->length,
118
                           (const uchar *)STRING_WITH_LEN("DEFAULT"), 0))
119
    return ha_default_plugin(thd);
120
121
  if ((plugin= my_plugin_lock_by_name(thd, name, MYSQL_STORAGE_ENGINE_PLUGIN)))
122
  {
123
    handlerton *hton= plugin_data(plugin, handlerton *);
124
    if (!(hton->flags & HTON_NOT_USER_SELECTABLE))
125
      return plugin;
126
      
127
    /*
128
      unlocking plugin immediately after locking is relatively low cost.
129
    */
130
    plugin_unlock(thd, plugin);
131
  }
132
133
  /*
134
    We check for the historical aliases.
135
  */
136
  for (table_alias= sys_table_aliases; table_alias->str; table_alias+= 2)
137
  {
138
    if (!my_strnncoll(&my_charset_latin1,
139
                      (const uchar *)name->str, name->length,
140
                      (const uchar *)table_alias->str, table_alias->length))
141
    {
142
      name= table_alias + 1;
143
      goto redo;
144
    }
145
  }
146
147
  return NULL;
148
}
149
150
151
/**
  Lock the plugin that owns a handlerton.

  @param thd         current thread
  @param hton        storage engine handlerton, may be NULL

  @return
    result of my_plugin_lock() on the engine's hton2plugin[] entry, or NULL
    when hton is NULL
*/
plugin_ref ha_lock_engine(THD *thd, handlerton *hton)
{
  if (!hton)
    return NULL;

  st_plugin_int **slot_entry= hton2plugin + hton->slot;

  /*
    NOTE(review): the two variants differ by one level of indirection;
    presumably plugin_ref is declared differently when DBUG_OFF is not
    defined -- confirm against the plugin_ref definition.
  */
#ifdef DBUG_OFF
  return my_plugin_lock(thd, slot_entry);
#else
  return my_plugin_lock(thd, &slot_entry);
#endif
}
165
166
167
/**
  Find the handlerton registered for a legacy_db_type value.

  @param thd         current thread
  @param db_type     legacy type code

  @return
    the thread's default handlerton for DB_TYPE_DEFAULT; the locked engine
    installed under db_type when db_type lies strictly between
    DB_TYPE_UNKNOWN and DB_TYPE_DEFAULT and the engine can be locked;
    NULL in every other case
*/
handlerton *ha_resolve_by_legacy_type(THD *thd, enum legacy_db_type db_type)
{
  if (db_type == DB_TYPE_DEFAULT)
    return ha_default_handlerton(thd);

  if (db_type > DB_TYPE_UNKNOWN && db_type < DB_TYPE_DEFAULT)
  {
    plugin_ref locked= ha_lock_engine(thd, installed_htons[db_type]);
    if (locked)
      return plugin_data(locked, handlerton*);
  }

  /* DB_TYPE_UNKNOWN, out of range, or nothing installed / lockable. */
  return NULL;
}
182
183
184
/**
185
  Use other database handler if databasehandler is not compiled in.
186
*/
187
handlerton *ha_checktype(THD *thd, enum legacy_db_type database_type,
188
                          bool no_substitute, bool report_error)
189
{
190
  handlerton *hton= ha_resolve_by_legacy_type(thd, database_type);
191
  if (ha_storage_engine_is_enabled(hton))
192
    return hton;
193
194
  if (no_substitute)
195
  {
196
    if (report_error)
197
    {
198
      const char *engine_name= ha_resolve_storage_engine_name(hton);
199
      my_error(ER_FEATURE_DISABLED,MYF(0),engine_name,engine_name);
200
    }
201
    return NULL;
202
  }
203
204
  switch (database_type) {
205
  case DB_TYPE_HASH:
206
    return ha_resolve_by_legacy_type(thd, DB_TYPE_HASH);
207
  default:
208
    break;
209
  }
210
211
  return ha_default_handlerton(thd);
212
} /* ha_checktype */
213
214
215
/**
  Create a handler object for a table.

  @param share       table share handed to the engine's create hook
  @param alloc       memory root handed to the engine's create hook
  @param db_type     engine to use; when it is NULL, not in state
                     SHOW_OPTION_YES or has no create hook, the current
                     thread's default engine is used instead

  @return
    the handler returned by the engine's create hook (init() has been
    called on it when it is non-NULL)
*/
handler *get_new_handler(TABLE_SHARE *share, MEM_ROOT *alloc,
                         handlerton *db_type)
{
  handler *new_file;
  DBUG_ENTER("get_new_handler");
  DBUG_PRINT("enter", ("alloc: 0x%lx", (long) alloc));

  if (!db_type || db_type->state != SHOW_OPTION_YES || !db_type->create)
  {
    /*
      Fall back to the default table type. Calling current_thd() here is
      acceptable: this function is called very often, but this branch is
      taken very seldom.
      NOTE(review): the recursion only ends once the default handlerton
      itself passes the check above.
    */
    DBUG_RETURN(get_new_handler(share, alloc, ha_default_handlerton(current_thd)));
  }

  new_file= db_type->create(db_type, share, alloc);
  if (new_file)
    new_file->init();
  DBUG_RETURN(new_file);
}
235
236
237
/**
238
  Register handler error messages for use with my_error().
239
240
  @retval
241
    0           OK
242
  @retval
243
    !=0         Error
244
*/
245
246
int ha_init_errors(void)
247
{
248
#define SETMSG(nr, msg) errmsgs[(nr) - HA_ERR_FIRST]= (msg)
249
  const char    **errmsgs;
250
251
  /* Allocate a pointer array for the error message strings. */
252
  /* Zerofill it to avoid uninitialized gaps. */
253
  if (! (errmsgs= (const char**) my_malloc(HA_ERR_ERRORS * sizeof(char*),
254
                                           MYF(MY_WME | MY_ZEROFILL))))
255
    return 1;
256
257
  /* Set the dedicated error messages. */
258
  SETMSG(HA_ERR_KEY_NOT_FOUND,          ER(ER_KEY_NOT_FOUND));
259
  SETMSG(HA_ERR_FOUND_DUPP_KEY,         ER(ER_DUP_KEY));
260
  SETMSG(HA_ERR_RECORD_CHANGED,         "Update wich is recoverable");
261
  SETMSG(HA_ERR_WRONG_INDEX,            "Wrong index given to function");
262
  SETMSG(HA_ERR_CRASHED,                ER(ER_NOT_KEYFILE));
263
  SETMSG(HA_ERR_WRONG_IN_RECORD,        ER(ER_CRASHED_ON_USAGE));
264
  SETMSG(HA_ERR_OUT_OF_MEM,             "Table handler out of memory");
265
  SETMSG(HA_ERR_NOT_A_TABLE,            "Incorrect file format '%.64s'");
266
  SETMSG(HA_ERR_WRONG_COMMAND,          "Command not supported");
267
  SETMSG(HA_ERR_OLD_FILE,               ER(ER_OLD_KEYFILE));
268
  SETMSG(HA_ERR_NO_ACTIVE_RECORD,       "No record read in update");
269
  SETMSG(HA_ERR_RECORD_DELETED,         "Intern record deleted");
270
  SETMSG(HA_ERR_RECORD_FILE_FULL,       ER(ER_RECORD_FILE_FULL));
271
  SETMSG(HA_ERR_INDEX_FILE_FULL,        "No more room in index file '%.64s'");
272
  SETMSG(HA_ERR_END_OF_FILE,            "End in next/prev/first/last");
273
  SETMSG(HA_ERR_UNSUPPORTED,            ER(ER_ILLEGAL_HA));
274
  SETMSG(HA_ERR_TO_BIG_ROW,             "Too big row");
275
  SETMSG(HA_WRONG_CREATE_OPTION,        "Wrong create option");
276
  SETMSG(HA_ERR_FOUND_DUPP_UNIQUE,      ER(ER_DUP_UNIQUE));
277
  SETMSG(HA_ERR_UNKNOWN_CHARSET,        "Can't open charset");
278
  SETMSG(HA_ERR_WRONG_MRG_TABLE_DEF,    ER(ER_WRONG_MRG_TABLE));
279
  SETMSG(HA_ERR_CRASHED_ON_REPAIR,      ER(ER_CRASHED_ON_REPAIR));
280
  SETMSG(HA_ERR_CRASHED_ON_USAGE,       ER(ER_CRASHED_ON_USAGE));
281
  SETMSG(HA_ERR_LOCK_WAIT_TIMEOUT,      ER(ER_LOCK_WAIT_TIMEOUT));
282
  SETMSG(HA_ERR_LOCK_TABLE_FULL,        ER(ER_LOCK_TABLE_FULL));
283
  SETMSG(HA_ERR_READ_ONLY_TRANSACTION,  ER(ER_READ_ONLY_TRANSACTION));
284
  SETMSG(HA_ERR_LOCK_DEADLOCK,          ER(ER_LOCK_DEADLOCK));
285
  SETMSG(HA_ERR_CANNOT_ADD_FOREIGN,     ER(ER_CANNOT_ADD_FOREIGN));
286
  SETMSG(HA_ERR_NO_REFERENCED_ROW,      ER(ER_NO_REFERENCED_ROW_2));
287
  SETMSG(HA_ERR_ROW_IS_REFERENCED,      ER(ER_ROW_IS_REFERENCED_2));
288
  SETMSG(HA_ERR_NO_SAVEPOINT,           "No savepoint with that name");
289
  SETMSG(HA_ERR_NON_UNIQUE_BLOCK_SIZE,  "Non unique key block size");
290
  SETMSG(HA_ERR_NO_SUCH_TABLE,          "No such table: '%.64s'");
291
  SETMSG(HA_ERR_TABLE_EXIST,            ER(ER_TABLE_EXISTS_ERROR));
292
  SETMSG(HA_ERR_NO_CONNECTION,          "Could not connect to storage engine");
293
  SETMSG(HA_ERR_TABLE_DEF_CHANGED,      ER(ER_TABLE_DEF_CHANGED));
294
  SETMSG(HA_ERR_FOREIGN_DUPLICATE_KEY,  "FK constraint would lead to duplicate key");
295
  SETMSG(HA_ERR_TABLE_NEEDS_UPGRADE,    ER(ER_TABLE_NEEDS_UPGRADE));
296
  SETMSG(HA_ERR_TABLE_READONLY,         ER(ER_OPEN_AS_READONLY));
297
  SETMSG(HA_ERR_AUTOINC_READ_FAILED,    ER(ER_AUTOINC_READ_FAILED));
298
  SETMSG(HA_ERR_AUTOINC_ERANGE,         ER(ER_WARN_DATA_OUT_OF_RANGE));
299
300
  /* Register the error messages for use with my_error(). */
301
  return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
302
}
303
304
305
/**
306
  Unregister handler error messages.
307
308
  @retval
309
    0           OK
310
  @retval
311
    !=0         Error
312
*/
313
static int ha_finish_errors(void)
314
{
315
  const char    **errmsgs;
316
317
  /* Allocate a pointer array for the error message strings. */
318
  if (! (errmsgs= my_error_unregister(HA_ERR_FIRST, HA_ERR_LAST)))
319
    return 1;
320
  my_free((uchar*) errmsgs, MYF(0));
321
  return 0;
322
}
323
324
325
/**
  Shut down a storage engine plugin and release its handlerton.

  The engine is removed from installed_htons[] (only if it is enabled and
  still owns its type code), its panic hook is called with HA_PANIC_CLOSE,
  the plugin's deinit hook is run and finally the handlerton stored in
  plugin->data is freed.

  @param plugin      storage engine plugin being finalized

  @return
    always 0; a failing deinit hook is only traced
*/
int ha_finalize_handlerton(st_plugin_int *plugin)
{
  handlerton *engine= (handlerton *)plugin->data;
  DBUG_ENTER("ha_finalize_handlerton");

  /* Engines in state SHOW_OPTION_NO / SHOW_OPTION_DISABLED own no entry. */
  if (engine->state == SHOW_OPTION_YES &&
      installed_htons[engine->db_type] == engine)
    installed_htons[engine->db_type]= NULL;

  if (engine->panic)
    engine->panic(engine, HA_PANIC_CLOSE);

  if (plugin->plugin->deinit)
  {
    /*
      Today we have no defined/special behavior for uninstalling
      engine plugins.
    */
    DBUG_PRINT("info", ("Deinitializing plugin: '%s'", plugin->name.str));
    if (plugin->plugin->deinit(NULL))
    {
      DBUG_PRINT("warning", ("Plugin '%s' deinit function returned error.",
                             plugin->name.str));
    }
  }

  my_free((uchar*)engine, MYF(0));

  DBUG_RETURN(0);
}
362
363
364
/**
  Allocate and set up the handlerton of a storage engine plugin.

  A zeroed handlerton is allocated, stored in plugin->data and passed to
  the plugin's init hook. An engine that ends up in state SHOW_OPTION_YES
  is given a free legacy type code if its own is missing, out of range or
  already taken, is entered into installed_htons[] and hton2plugin[], and
  has its savepoint_offset replaced by its offset within the shared
  savepoint area. Any state other than SHOW_OPTION_NO / SHOW_OPTION_YES is
  turned into SHOW_OPTION_DISABLED.

  @param plugin      storage engine plugin being initialized

  @retval
    0   OK
  @retval
    1   out of memory, the init hook failed, or no free type code is left

  @note
    On the failure paths that follow the allocation the handlerton is not
    released here and plugin->data keeps pointing at it.
*/
int ha_initialize_handlerton(st_plugin_int *plugin)
{
  handlerton *hton;
  DBUG_ENTER("ha_initialize_handlerton");
  DBUG_PRINT("plugin", ("initialize plugin: '%s'", plugin->name.str));

  hton= (handlerton *)my_malloc(sizeof(handlerton),
                                MYF(MY_WME | MY_ZEROFILL));
  /* Out of memory: nothing has been stored in plugin->data yet. */
  if (!hton)
    goto err;
  /*
    Clear the structure explicitly as well; members such as
    get_backup_engine were observed to be non-NULL although never set.
    The size must be that of the structure, sizeof(*hton): sizeof(hton)
    is only the size of the pointer and would clear just the first few
    bytes.
  */
  bzero(hton, sizeof(*hton));
  /* Historical Requirement */
  plugin->data= hton; // shortcut for the future
  if (plugin->plugin->init)
  {
    if (plugin->plugin->init(hton))
    {
      sql_print_error("Plugin '%s' init function returned error.",
                      plugin->name.str);
      goto err;
    }
  }

  /*
    the switch below and hton->state should be removed when
    command-line options for plugins will be implemented
  */
  switch (hton->state) {
  case SHOW_OPTION_NO:
    break;
  case SHOW_OPTION_YES:
    {
      uint tmp;
      /* now check the db_type for conflict */
      if (hton->db_type <= DB_TYPE_UNKNOWN ||
          hton->db_type >= DB_TYPE_DEFAULT ||
          installed_htons[hton->db_type])
      {
        /* Pick the first free dynamic type code instead. */
        int idx= (int) DB_TYPE_FIRST_DYNAMIC;

        while (idx < (int) DB_TYPE_DEFAULT && installed_htons[idx])
          idx++;

        if (idx == (int) DB_TYPE_DEFAULT)
        {
          sql_print_warning("Too many storage engines!");
          DBUG_RETURN(1);
        }
        if (hton->db_type != DB_TYPE_UNKNOWN)
          sql_print_warning("Storage engine '%s' has conflicting typecode. "
                            "Assigning value %d.", plugin->plugin->name, idx);
        hton->db_type= (enum legacy_db_type) idx;
      }
      installed_htons[hton->db_type]= hton;
      /*
        The engine reported the size it needs in savepoint_offset; turn it
        into the engine's offset within the shared savepoint area and grow
        the area by that size.
      */
      tmp= hton->savepoint_offset;
      hton->savepoint_offset= savepoint_alloc_size;
      savepoint_alloc_size+= tmp;
      /*
        NOTE(review): hton2plugin has MAX_HA entries and total_ha is not
        checked against MAX_HA before the store below; ha_init() only
        asserts the limit afterwards.
      */
      hton->slot= total_ha++;
      hton2plugin[hton->slot]=plugin;
      if (hton->prepare)
        total_ha_2pc++;
      break;
    }
  default:
    hton->state= SHOW_OPTION_DISABLED;
    break;
  }
  
  /* 
    This is entirely for legacy. We will create a new "disk based" hton and a 
    "memory" hton which will be configurable longterm. We should be able to 
    remove partition and myisammrg.
  */
  switch (hton->db_type) {
  case DB_TYPE_HEAP:
    heap_hton= hton;
    break;
  case DB_TYPE_MYISAM:
    myisam_hton= hton;
    break;
  default:
    break;
  };

  DBUG_RETURN(0);
err:
  DBUG_RETURN(1);
}
458
459
int ha_init()
460
{
461
  int error= 0;
462
  DBUG_ENTER("ha_init");
463
464
  DBUG_ASSERT(total_ha < MAX_HA);
465
  /*
466
    Check if there is a transaction-capable storage engine besides the
467
    binary log (which is considered a transaction-capable storage engine in
468
    counting total_ha)
469
  */
470
  opt_using_transactions= total_ha>(ulong)opt_bin_log;
471
  savepoint_alloc_size+= sizeof(SAVEPOINT);
472
  DBUG_RETURN(error);
473
}
474
475
int ha_end()
476
{
477
  int error= 0;
478
  DBUG_ENTER("ha_end");
479
480
481
  /* 
482
    This should be eventualy based  on the graceful shutdown flag.
483
    So if flag is equal to HA_PANIC_CLOSE, the deallocate
484
    the errors.
485
  */
486
  if (ha_finish_errors())
487
    error= 1;
488
489
  DBUG_RETURN(error);
490
}
491
492
/**
  plugin_foreach() callback: tell one storage engine to drop a database.

  @param unused1     not used
  @param plugin      storage engine plugin
  @param path        database path, passed on to the engine as char*

  @return
    always FALSE
*/
static my_bool dropdb_handlerton(THD *unused1, plugin_ref plugin,
                                 void *path)
{
  handlerton *engine= plugin_data(plugin, handlerton *);

  /* Only enabled engines that provide the hook are called. */
  if (engine->state != SHOW_OPTION_YES || !engine->drop_database)
    return FALSE;

  engine->drop_database(engine, (char *)path);
  return FALSE;
}
500
501
502
void ha_drop_database(char* path)
503
{
504
  plugin_foreach(NULL, dropdb_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, path);
505
}
506
507
508
/**
  plugin_foreach() callback: close a thread's connection in one engine.

  The engine's close_connection hook is called only if the engine is
  enabled, provides the hook and thd_get_ha_data() returns non-NULL for
  this thread and engine.

  @param thd         thread whose connection is being closed
  @param plugin      storage engine plugin
  @param unused      not used

  @return
    always FALSE
*/
static my_bool closecon_handlerton(THD *thd, plugin_ref plugin,
                                   void *unused)
{
  handlerton *engine= plugin_data(plugin, handlerton *);

  if (engine->state != SHOW_OPTION_YES || !engine->close_connection)
    return FALSE;

  /*
    there's no need to rollback here as all transactions must
    be rolled back already
  */
  if (thd_get_ha_data(thd, engine))
    engine->close_connection(engine, thd);
  return FALSE;
}
521
522
523
/**
524
  @note
525
    don't bother to rollback here, it's done already
526
*/
527
void ha_close_connection(THD* thd)
528
{
529
  plugin_foreach(thd, closecon_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, 0);
530
}
531
532
/* ========================================================================
533
 ======================= TRANSACTIONS ===================================*/
534
535
/**
536
  Transaction handling in the server
537
  ==================================
538
539
  In each client connection, MySQL maintains two transactional
540
  states:
541
  - a statement transaction,
542
  - a standard, also called normal transaction.
543
544
  Historical note
545
  ---------------
546
  "Statement transaction" is a non-standard term that comes
547
  from the times when MySQL supported BerkeleyDB storage engine.
548
549
  First of all, it should be said that in BerkeleyDB auto-commit
550
  mode auto-commits operations that are atomic to the storage
551
  engine itself, such as a write of a record, and are too
552
  high-granular to be atomic from the application perspective
553
  (MySQL). One SQL statement could involve many BerkeleyDB
554
  auto-committed operations and thus BerkeleyDB auto-commit was of
555
  little use to MySQL.
556
557
  Secondly, instead of SQL standard savepoints, BerkeleyDB
558
  provided the concept of "nested transactions". In a nutshell,
559
  transactions could be arbitrarily nested, but when the parent
560
  transaction was committed or aborted, all its child (nested)
561
  transactions were committed or aborted as well.
562
  Commit of a nested transaction, in turn, made its changes
563
  visible, but not durable: it destroyed the nested transaction,
564
  all its changes would become available to the parent and
565
  currently active nested transactions of this parent.
566
567
  So the mechanism of nested transactions was employed to
568
  provide "all or nothing" guarantee of SQL statements
569
  required by the standard.
570
  A nested transaction would be created at start of each SQL
571
  statement, and destroyed (committed or aborted) at statement
572
  end. Such nested transaction was internally referred to as
573
  a "statement transaction" and gave birth to the term.
574
575
  <Historical note ends>
576
577
  Since then a statement transaction is started for each statement
578
  that accesses transactional tables or uses the binary log.  If
579
  the statement succeeds, the statement transaction is committed.
580
  If the statement fails, the transaction is rolled back. Commits
581
  of statement transactions are not durable -- each such
582
  transaction is nested in the normal transaction, and if the
583
  normal transaction is rolled back, the effects of all enclosed
584
  statement transactions are undone as well.  Technically,
585
  a statement transaction can be viewed as a savepoint which is
586
  maintained automatically in order to make effects of one
587
  statement atomic.
588
589
  The normal transaction is started by the user and is ended
590
  usually upon a user request as well. The normal transaction
591
  encloses transactions of all statements issued between
592
  its beginning and its end.
593
  In autocommit mode, the normal transaction is equivalent
594
  to the statement transaction.
595
596
  Since MySQL supports PSEA (pluggable storage engine
597
  architecture), more than one transactional engine can be
598
  active at a time. Hence transactions, from the server
599
  point of view, are always distributed. In particular,
600
  transactional state is maintained independently for each
601
  engine. In order to commit a transaction the two phase
602
  commit protocol is employed.
603
604
  Not all statements are executed in context of a transaction.
605
  Administrative and status information statements do not modify
606
  engine data, and thus do not start a statement transaction and
607
  also have no effect on the normal transaction. Examples of such
608
  statements are SHOW STATUS and RESET SLAVE.
609
610
  Similarly DDL statements are not transactional,
611
  and therefore a transaction is [almost] never started for a DDL
612
  statement. The difference between a DDL statement and a purely
613
  administrative statement though is that a DDL statement always
614
  commits the current transaction before proceeding, if there is
615
  any.
616
617
  At last, SQL statements that work with non-transactional
618
  engines also have no effect on the transaction state of the
619
  connection. Even though they are written to the binary log,
620
  and the binary log is, overall, transactional, the writes
621
  are done in "write-through" mode, directly to the binlog
622
  file, followed by an OS cache sync, in other words,
623
  bypassing the binlog undo log (translog).
624
  They do not commit the current normal transaction.
625
  A failure of a statement that uses non-transactional tables
626
  would cause a rollback of the statement transaction, but
627
  in case no transactional tables are used,
628
  no statement transaction is started.
629
630
  Data layout
631
  -----------
632
633
  The server stores its transaction-related data in
634
  thd->transaction. This structure has two members of type
635
  THD_TRANS. These members correspond to the statement and
636
  normal transactions respectively:
637
638
  - thd->transaction.stmt contains a list of engines
639
  that are participating in the given statement
640
  - thd->transaction.all contains a list of engines that
641
  have participated in any of the statement transactions started
642
  within the context of the normal transaction.
643
  Each element of the list contains a pointer to the storage
644
  engine, engine-specific transactional data, and engine-specific
645
  transaction flags.
646
647
  In autocommit mode thd->transaction.all is empty.
648
  Instead, data of thd->transaction.stmt is
649
  used to commit/rollback the normal transaction.
650
651
  The list of registered engines has a few important properties:
652
  - no engine is registered in the list twice
653
  - engines are present in the list in reverse temporal order --
654
  new participants are always added to the beginning of the list.
655
656
  Transaction life cycle
657
  ----------------------
658
659
  When a new connection is established, thd->transaction
660
  members are initialized to an empty state.
661
  If a statement uses any tables, all affected engines
662
  are registered in the statement engine list. In
663
  non-autocommit mode, the same engines are registered in
664
  the normal transaction list.
665
  At the end of the statement, the server issues a commit
666
  or a roll back for all engines in the statement list.
667
  At this point transaction flags of an engine, if any, are
668
  propagated from the statement list to the list of the normal
669
  transaction.
670
  When commit/rollback is finished, the statement list is
671
  cleared. It will be filled in again by the next statement,
672
  and emptied again at the next statement's end.
673
674
  The normal transaction is committed in a similar way
675
  (by going over all engines in thd->transaction.all list)
676
  but at different times:
677
  - when a COMMIT SQL statement is issued by the user
678
  - implicitly, by the server, at the beginning of a DDL statement
679
  or SET AUTOCOMMIT={0|1} statement.
680
681
  The normal transaction can be rolled back as well:
682
  - if the user has requested so, by issuing ROLLBACK SQL
683
  statement
684
  - if one of the storage engines requested a rollback
685
  by setting thd->transaction_rollback_request. This may
686
  happen, e.g., when the transaction in the engine was
687
  chosen as a victim of the internal deadlock resolution algorithm
688
  and rolled back internally. When such a situation happens, there
689
  is little the server can do and the only option is to rollback
690
  transactions in all other participating engines.  In this case
691
  the rollback is accompanied by an error sent to the user.
692
693
  As follows from the use cases above, the normal transaction
694
  is never committed when there is an outstanding statement
695
  transaction. In most cases there is no conflict, since
696
  commits of the normal transaction are issued by a stand-alone
697
  administrative or DDL statement, thus no outstanding statement
698
  transaction of the previous statement exists. Besides,
699
  all statements that manipulate the normal transaction
700
  are prohibited in stored functions and triggers, therefore
701
  no conflicting situation can occur in a sub-statement either.
702
  The remaining rare cases when the server explicitly has
703
  to commit the statement transaction prior to committing the normal
704
  one cover error-handling scenarios (see for example
705
  SQLCOM_LOCK_TABLES).
706
707
  When committing a statement or a normal transaction, the server
708
  either uses the two-phase commit protocol, or issues a commit
709
  in each engine independently. The two-phase commit protocol
710
  is used only if:
711
  - all participating engines support two-phase commit (provide
712
    handlerton::prepare PSEA API call) and
713
  - transactions in at least two engines modify data (i.e. are
714
  not read-only).
715
716
  Note that the two phase commit is used for
717
  statement transactions, even though they are not durable anyway.
718
  This is done to ensure logical consistency of data in a multiple-
719
  engine transaction.
720
  For example, imagine that some day MySQL supports unique
721
  constraint checks deferred till the end of statement. In such
722
  case a commit in one of the engines may yield ER_DUP_KEY,
723
  and MySQL should be able to gracefully abort statement
724
  transactions of other participants.
725
726
  After the normal transaction has been committed,
727
  thd->transaction.all list is cleared.
728
729
  When a connection is closed, the current normal transaction, if
730
  any, is rolled back.
731
732
  Roles and responsibilities
733
  --------------------------
734
735
  The server has no way to know that an engine participates in
736
  the statement and a transaction has been started
737
  in it unless the engine says so. Thus, in order to be
738
  a part of a transaction, the engine must "register" itself.
739
  This is done by invoking trans_register_ha() server call.
740
  Normally the engine registers itself whenever handler::external_lock()
741
  is called. trans_register_ha() can be invoked many times: if
742
  an engine is already registered, the call does nothing.
743
  In case autocommit is not set, the engine must register itself
744
  twice -- both in the statement list and in the normal transaction
745
  list.
746
  In which list to register is a parameter of trans_register_ha().
747
748
  Note, that although the registration interface in itself is
749
  fairly clear, the current usage practice often leads to undesired
750
  effects. E.g. since a call to trans_register_ha() in most engines
751
  is embedded into implementation of handler::external_lock(), some
752
  DDL statements start a transaction (at least from the server
753
  point of view) even though they are not expected to. E.g.
754
  CREATE TABLE does not start a transaction, since
755
  handler::external_lock() is never called during CREATE TABLE. But
756
  CREATE TABLE ... SELECT does, since handler::external_lock() is
757
  called for the table that is being selected from. This has no
758
  practical effects currently, but must be kept in mind
759
  nevertheless.
760
761
  Once an engine is registered, the server will do the rest
762
  of the work.
763
764
  During statement execution, whenever any of data-modifying
765
  PSEA API methods is used, e.g. handler::write_row() or
766
  handler::update_row(), the read-write flag is raised in the
767
  statement transaction for the involved engine.
768
  Currently all PSEA calls are "traced", and the data can not be
769
  changed in a way other than issuing a PSEA call. Important:
770
  unless this invariant is preserved the server will not know that
771
  a transaction in a given engine is read-write and will not
772
  involve the two-phase commit protocol!
773
774
  At the end of a statement, server call
775
  ha_autocommit_or_rollback() is invoked. This call in turn
776
  invokes handlerton::prepare() for every involved engine.
777
  Prepare is followed by a call to handlerton::commit_one_phase().
778
  If a one-phase commit will suffice, handlerton::prepare() is not
779
  invoked and the server only calls handlerton::commit_one_phase().
780
  At statement commit, the statement-related read-write engine
781
  flag is propagated to the corresponding flag in the normal
782
  transaction.  When the commit is complete, the list of registered
783
  engines is cleared.
784
785
  Rollback is handled in a similar fashion.
786
787
  Additional notes on DDL and the normal transaction.
788
  ---------------------------------------------------
789
790
  DDLs and operations with non-transactional engines
791
  do not "register" in thd->transaction lists, and thus do not
792
  modify the transaction state. Besides, each DDL in
793
  MySQL is prefixed with an implicit normal transaction commit
794
  (a call to end_active_trans()), and thus leaves nothing
795
  to modify.
796
  However, as it has been pointed out with CREATE TABLE .. SELECT,
797
  some DDL statements can start a *new* transaction.
798
799
  Behaviour of the server in this case is currently badly
800
  defined.
801
  DDL statements use a form of "semantic" logging
802
  to maintain atomicity: if CREATE TABLE .. SELECT failed,
803
  the newly created table is deleted.
804
  In addition, some DDL statements issue interim transaction
805
  commits: e.g. ALTER TABLE issues a commit after data is copied
806
  from the original table to the internal temporary table. Other
807
  statements, e.g. CREATE TABLE ... SELECT do not always commit
808
  after themselves.
809
  And finally there is a group of DDL statements such as
810
  RENAME/DROP TABLE that doesn't start a new transaction
811
  and doesn't commit.
812
813
  This diversity makes it hard to say what will happen if
814
  by chance a stored function is invoked during a DDL --
815
  whether any modifications it makes will be committed or not
816
  is not clear. Fortunately, SQL grammar of few DDLs allows
817
  invocation of a stored function.
818
819
  A consistent behaviour is perhaps to always commit the normal
820
  transaction after all DDLs, just like the statement transaction
821
  is always committed at the end of all statements.
822
*/
823
824
/**
825
  Register a storage engine for a transaction.
826
827
  Every storage engine MUST call this function when it starts
828
  a transaction or a statement (that is it must be called both for the
829
  "beginning of transaction" and "beginning of statement").
830
  Only storage engines registered for the transaction/statement
831
  will know when to commit/rollback it.
832
833
  @note
834
    trans_register_ha is idempotent - storage engine may register many
835
    times per transaction.
836
837
*/
838
void trans_register_ha(THD *thd, bool all, handlerton *ht_arg)
839
{
840
  THD_TRANS *trans;
841
  Ha_trx_info *ha_info;
842
  DBUG_ENTER("trans_register_ha");
843
  DBUG_PRINT("enter",("%s", all ? "all" : "stmt"));
844
845
  if (all)
846
  {
847
    trans= &thd->transaction.all;
848
    thd->server_status|= SERVER_STATUS_IN_TRANS;
849
  }
850
  else
851
    trans= &thd->transaction.stmt;
852
853
  ha_info= thd->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
854
855
  if (ha_info->is_started())
856
    DBUG_VOID_RETURN; /* already registered, return */
857
858
  ha_info->register_ha(trans, ht_arg);
859
860
  trans->no_2pc|=(ht_arg->prepare==0);
861
  if (thd->transaction.xid_state.xid.is_null())
862
    thd->transaction.xid_state.xid.set(thd->query_id);
863
864
  DBUG_VOID_RETURN;
865
}
866
867
/**
868
  @retval
869
    0   ok
870
  @retval
871
    1   error, transaction was rolled back
872
*/
873
int ha_prepare(THD *thd)
874
{
875
  int error=0, all=1;
876
  THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
877
  Ha_trx_info *ha_info= trans->ha_list;
878
  DBUG_ENTER("ha_prepare");
879
  if (ha_info)
880
  {
881
    for (; ha_info; ha_info= ha_info->next())
882
    {
883
      int err;
884
      handlerton *ht= ha_info->ht();
885
      status_var_increment(thd->status_var.ha_prepare_count);
886
      if (ht->prepare)
887
      {
888
        if ((err= ht->prepare(ht, thd, all)))
889
        {
890
          my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
891
          ha_rollback_trans(thd, all);
892
          error=1;
893
          break;
894
        }
895
      }
896
      else
897
      {
898
        push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
899
                            ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
900
                            ha_resolve_storage_engine_name(ht));
901
      }
902
    }
903
  }
904
  DBUG_RETURN(error);
905
}
906
907
/**
908
  Check if we can skip the two-phase commit.
909
910
  A helper function to evaluate if two-phase commit is mandatory.
911
  As a side effect, propagates the read-only/read-write flags
912
  of the statement transaction to its enclosing normal transaction.
913
914
  @retval TRUE   we must run a two-phase commit. Returned
915
                 if we have at least two engines with read-write changes.
916
  @retval FALSE  Don't need two-phase commit. Even if we have two
917
                 transactional engines, we can run two independent
918
                 commits if changes in one of the engines are read-only.
919
*/
920
921
static
922
bool
923
ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
924
                                    bool all)
925
{
926
  /* The number of storage engines that have actual changes. */
927
  unsigned rw_ha_count= 0;
928
  Ha_trx_info *ha_info;
929
930
  for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
931
  {
932
    if (ha_info->is_trx_read_write())
933
      ++rw_ha_count;
934
935
    if (! all)
936
    {
937
      Ha_trx_info *ha_info_all= &thd->ha_data[ha_info->ht()->slot].ha_info[1];
938
      DBUG_ASSERT(ha_info != ha_info_all);
939
      /*
940
        Merge read-only/read-write information about statement
941
        transaction to its enclosing normal transaction. Do this
942
        only if in a real transaction -- that is, if we know
943
        that ha_info_all is registered in thd->transaction.all.
944
        Since otherwise we only clutter the normal transaction flags.
945
      */
946
      if (ha_info_all->is_started()) /* FALSE if autocommit. */
947
        ha_info_all->coalesce_trx_with(ha_info);
948
    }
949
    else if (rw_ha_count > 1)
950
    {
951
      /*
952
        It is a normal transaction, so we don't need to merge read/write
953
        information up, and the need for two-phase commit has been
954
        already established. Break the loop prematurely.
955
      */
956
      break;
957
    }
958
  }
959
  return rw_ha_count > 1;
960
}
961
962
963
/**
964
  @retval
965
    0   ok
966
  @retval
967
    1   transaction was rolled back
968
  @retval
969
    2   error during commit, data may be inconsistent
970
971
  @todo
972
    Since we don't support nested statement transactions in 5.0,
973
    we can't commit or rollback stmt transactions while we are inside
974
    stored functions or triggers. So we simply do nothing now.
975
    TODO: This should be fixed in later ( >= 5.1) releases.
976
*/
977
int ha_commit_trans(THD *thd, bool all)
978
{
979
  int error= 0, cookie= 0;
980
  /*
981
    'all' means that this is either an explicit commit issued by
982
    user, or an implicit commit issued by a DDL.
983
  */
984
  THD_TRANS *trans= all ? &thd->transaction.all : &thd->transaction.stmt;
985
  bool is_real_trans= all || thd->transaction.all.ha_list == 0;
986
  Ha_trx_info *ha_info= trans->ha_list;
987
  my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
988
  DBUG_ENTER("ha_commit_trans");
989
990
  /*
991
    We must not commit the normal transaction if a statement
992
    transaction is pending. Otherwise statement transaction
993
    flags will not get propagated to its normal transaction's
994
    counterpart.
995
  */
996
  DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL ||
997
              trans == &thd->transaction.stmt);
998
999
  if (thd->in_sub_stmt)
1000
  {
1001
    /*
1002
      Since we don't support nested statement transactions in 5.0,
1003
      we can't commit or rollback stmt transactions while we are inside
1004
      stored functions or triggers. So we simply do nothing now.
1005
      TODO: This should be fixed in later ( >= 5.1) releases.
1006
    */
1007
    if (!all)
1008
      DBUG_RETURN(0);
1009
    /*
1010
      We assume that all statements which commit or rollback main transaction
1011
      are prohibited inside of stored functions or triggers. So they should
1012
      bail out with error even before ha_commit_trans() call. To be 100% safe
1013
      let us throw error in non-debug builds.
1014
    */
1015
    DBUG_ASSERT(0);
1016
    my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
1017
    DBUG_RETURN(2);
1018
  }
1019
  if (ha_info)
1020
  {
1021
    bool must_2pc;
1022
1023
    if (is_real_trans && wait_if_global_read_lock(thd, 0, 0))
1024
    {
1025
      ha_rollback_trans(thd, all);
1026
      DBUG_RETURN(1);
1027
    }
1028
1029
    if (   is_real_trans
1030
        && opt_readonly
1031
        && ! thd->slave_thread
1032
       )
1033
    {
1034
      my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
1035
      ha_rollback_trans(thd, all);
1036
      error= 1;
1037
      goto end;
1038
    }
1039
1040
    must_2pc= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
1041
1042
    if (!trans->no_2pc && must_2pc)
1043
    {
1044
      for (; ha_info && !error; ha_info= ha_info->next())
1045
      {
1046
        int err;
1047
        handlerton *ht= ha_info->ht();
1048
        /*
1049
          Do not call two-phase commit if this particular
1050
          transaction is read-only. This allows for simpler
1051
          implementation in engines that are always read-only.
1052
        */
1053
        if (! ha_info->is_trx_read_write())
1054
          continue;
1055
        /*
1056
          Sic: we know that prepare() is not NULL since otherwise
1057
          trans->no_2pc would have been set.
1058
        */
1059
        if ((err= ht->prepare(ht, thd, all)))
1060
        {
1061
          my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1062
          error= 1;
1063
        }
1064
        status_var_increment(thd->status_var.ha_prepare_count);
1065
      }
1066
      DBUG_EXECUTE_IF("crash_commit_after_prepare", abort(););
1067
      if (error || (is_real_trans && xid &&
1068
                    (error= !(cookie= tc_log->log_xid(thd, xid)))))
1069
      {
1070
        ha_rollback_trans(thd, all);
1071
        error= 1;
1072
        goto end;
1073
      }
1074
      DBUG_EXECUTE_IF("crash_commit_after_log", abort(););
1075
    }
1076
    error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
1077
    DBUG_EXECUTE_IF("crash_commit_before_unlog", abort(););
1078
    if (cookie)
1079
      tc_log->unlog(cookie, xid);
1080
    DBUG_EXECUTE_IF("crash_commit_after", abort(););
1081
end:
1082
    if (is_real_trans)
1083
      start_waiting_global_read_lock(thd);
1084
  }
1085
  DBUG_RETURN(error);
1086
}
1087
1088
/**
1089
  @note
1090
  This function does not care about global read lock. A caller should.
1091
*/
1092
int ha_commit_one_phase(THD *thd, bool all)
1093
{
1094
  int error=0;
1095
  THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
1096
  bool is_real_trans=all || thd->transaction.all.ha_list == 0;
1097
  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1098
  DBUG_ENTER("ha_commit_one_phase");
1099
  if (ha_info)
1100
  {
1101
    for (; ha_info; ha_info= ha_info_next)
1102
    {
1103
      int err;
1104
      handlerton *ht= ha_info->ht();
1105
      if ((err= ht->commit(ht, thd, all)))
1106
      {
1107
        my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
1108
        error=1;
1109
      }
1110
      status_var_increment(thd->status_var.ha_commit_count);
1111
      ha_info_next= ha_info->next();
1112
      ha_info->reset(); /* keep it conveniently zero-filled */
1113
    }
1114
    trans->ha_list= 0;
1115
    trans->no_2pc=0;
1116
    if (is_real_trans)
1117
      thd->transaction.xid_state.xid.null();
1118
    if (all)
1119
    {
1120
      thd->variables.tx_isolation=thd->session_tx_isolation;
1121
      thd->transaction.cleanup();
1122
    }
1123
  }
1124
  DBUG_RETURN(error);
1125
}
1126
1127
1128
/**
  Roll back the statement or the normal transaction in every
  registered engine and clear the list of registered engines.

  @param thd  connection whose transaction is rolled back
  @param all  true for the normal transaction, false for the statement
              transaction

  @return 0 on success; 1 if an engine reported a rollback error or if
          a rollback of the normal transaction was requested from
          inside a sub-statement.
*/
int ha_rollback_trans(THD *thd, bool all)
{
  int error= 0;
  THD_TRANS *trans= all ? &thd->transaction.all : &thd->transaction.stmt;
  Ha_trx_info *engine= trans->ha_list;
  /* Must be computed before the engine lists are cleared below. */
  const bool is_real_trans= all || thd->transaction.all.ha_list == 0;
  DBUG_ENTER("ha_rollback_trans");

  /*
    The normal transaction must not be rolled back while a statement
    transaction is pending.
  */
  DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL ||
              trans == &thd->transaction.stmt);

  if (thd->in_sub_stmt)
  {
    /*
      Inside a stored function or trigger the current statement
      transaction is neither committed nor rolled back. See the
      comment in ha_commit_trans() for more information.
    */
    if (!all)
      DBUG_RETURN(0);
    DBUG_ASSERT(0);
    my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
    DBUG_RETURN(1);
  }

  if (engine)
  {
    do
    {
      handlerton *ht= engine->ht();
      int err= ht->rollback(ht, thd, all);
      if (err)
      { // cannot happen
        my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
        error= 1;
      }
      status_var_increment(thd->status_var.ha_rollback_count);
      Ha_trx_info *following= engine->next();
      engine->reset(); /* keep it conveniently zero-filled */
      engine= following;
    } while (engine);

    trans->ha_list= 0;
    trans->no_2pc= 0;
    if (is_real_trans)
      thd->transaction.xid_state.xid.null();
    if (all)
    {
      thd->variables.tx_isolation= thd->session_tx_isolation;
      thd->transaction.cleanup();
    }
  }

  if (all)
    thd->transaction_rollback_request= FALSE;

  /*
    If a non-transactional table was updated, warn. Do not warn in a
    slave thread: a ROLLBACK executed there was read from the binary
    log, so ER_WARNING_NOT_COMPLETE_ROLLBACK is expected and normal.
    Sent to the slave SQL thread, the warning would not stop the thread
    but only show up in the error log, and we don't want users to
    wonder why they have this message there.
  */
  if (is_real_trans && thd->transaction.all.modified_non_trans_table &&
      !thd->slave_thread && thd->killed != THD::KILL_CONNECTION)
  {
    push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
                 ER_WARNING_NOT_COMPLETE_ROLLBACK,
                 ER(ER_WARNING_NOT_COMPLETE_ROLLBACK));
  }
  DBUG_RETURN(error);
}
1200
1201
/**
1202
  This is used to commit or rollback a single statement depending on
1203
  the value of error.
1204
1205
  @note
1206
    Note that if the autocommit is on, then the following call inside
1207
    InnoDB will commit or rollback the whole transaction (= the statement). The
1208
    autocommit mechanism built into InnoDB is based on counting locks, but if
1209
    the user has used LOCK TABLES then that mechanism does not know to do the
1210
    commit.
1211
*/
1212
int ha_autocommit_or_rollback(THD *thd, int error)
1213
{
1214
  DBUG_ENTER("ha_autocommit_or_rollback");
1215
  if (thd->transaction.stmt.ha_list)
1216
  {
1217
    if (!error)
1218
    {
1219
      if (ha_commit_trans(thd, 0))
1220
	error=1;
1221
    }
1222
    else 
1223
    {
1224
      (void) ha_rollback_trans(thd, 0);
1225
      if (thd->transaction_rollback_request && !thd->in_sub_stmt)
1226
        (void) ha_rollback(thd);
1227
    }
1228
1229
    thd->variables.tx_isolation=thd->session_tx_isolation;
1230
  }
1231
  DBUG_RETURN(error);
1232
}
1233
1234
1235
struct xahton_st {
1236
  XID *xid;
1237
  int result;
1238
};
1239
1240
/**
  plugin_foreach() callback: commit an XID in one storage engine.

  Engines that are not enabled or have no recover() hook are skipped.

  @param unused1  not used
  @param plugin   storage engine plugin being visited
  @param arg      pointer to a struct xahton_st; its result is set to 0
                  when the engine was asked to commit

  @return always FALSE, so the iteration visits every engine
*/
static my_bool xacommit_handlerton(THD *unused1, plugin_ref plugin,
                                   void *arg)
{
  handlerton *hton= plugin_data(plugin, handlerton *);
  struct xahton_st *op= (struct xahton_st *) arg;

  if (hton->state != SHOW_OPTION_YES || !hton->recover)
    return FALSE;

  hton->commit_by_xid(hton, op->xid);
  op->result= 0;
  return FALSE;
}
1251
1252
/**
  plugin_foreach() callback: roll back an XID in one storage engine.

  Engines that are not enabled or have no recover() hook are skipped.

  @param unused1  not used
  @param plugin   storage engine plugin being visited
  @param arg      pointer to a struct xahton_st; its result is set to 0
                  when the engine was asked to roll back

  @return always FALSE, so the iteration visits every engine
*/
static my_bool xarollback_handlerton(THD *unused1, plugin_ref plugin,
                                     void *arg)
{
  handlerton *hton= plugin_data(plugin, handlerton *);
  struct xahton_st *op= (struct xahton_st *) arg;

  if (hton->state != SHOW_OPTION_YES || !hton->recover)
    return FALSE;

  hton->rollback_by_xid(hton, op->xid);
  op->result= 0;
  return FALSE;
}
1263
1264
1265
/**
  Commit or roll back the given XID in every enabled storage engine
  that has a recover() hook.

  @param xid     the XID to finish
  @param commit  true to commit, false to roll back

  @return 0 if at least one engine was asked to act on the xid,
          1 otherwise
*/
int ha_commit_or_rollback_by_xid(XID *xid, bool commit)
{
  struct xahton_st xaop;
  xaop.xid= xid;
  xaop.result= 1;                       /* stays 1 if no engine acts */

  if (commit)
    plugin_foreach(NULL, xacommit_handlerton,
                   MYSQL_STORAGE_ENGINE_PLUGIN, &xaop);
  else
    plugin_foreach(NULL, xarollback_handlerton,
                   MYSQL_STORAGE_ENGINE_PLUGIN, &xaop);

  return xaop.result;
}
1276
1277
1278
#ifndef DBUG_OFF
1279
/**
1280
  @note
1281
    This does not need to be multi-byte safe or anything
1282
*/
1283
static char* xid_to_str(char *buf, XID *xid)
1284
{
1285
  int i;
1286
  char *s=buf;
1287
  *s++='\'';
1288
  for (i=0; i < xid->gtrid_length+xid->bqual_length; i++)
1289
  {
1290
    uchar c=(uchar)xid->data[i];
1291
    /* is_next_dig is set if next character is a number */
1292
    bool is_next_dig= FALSE;
1293
    if (i < XIDDATASIZE)
1294
    {
1295
      char ch= xid->data[i+1];
1296
      is_next_dig= (ch >= '0' && ch <='9');
1297
    }
1298
    if (i == xid->gtrid_length)
1299
    {
1300
      *s++='\'';
1301
      if (xid->bqual_length)
1302
      {
1303
        *s++='.';
1304
        *s++='\'';
1305
      }
1306
    }
1307
    if (c < 32 || c > 126)
1308
    {
1309
      *s++='\\';
1310
      /*
1311
        If next character is a number, write current character with
1312
        3 octal numbers to ensure that the next number is not seen
1313
        as part of the octal number
1314
      */
1315
      if (c > 077 || is_next_dig)
1316
        *s++=_dig_vec_lower[c >> 6];
1317
      if (c > 007 || is_next_dig)
1318
        *s++=_dig_vec_lower[(c >> 3) & 7];
1319
      *s++=_dig_vec_lower[c & 7];
1320
    }
1321
    else
1322
    {
1323
      if (c == '\'' || c == '\\')
1324
        *s++='\\';
1325
      *s++=c;
1326
    }
1327
  }
1328
  *s++='\'';
1329
  *s=0;
1330
  return buf;
1331
}
1332
#endif
1333
1334
/**
1335
  recover() step of xa.
1336
1337
  @note
1338
    there are three modes of operation:
1339
    - automatic recover after a crash
1340
    in this case commit_list != 0, tc_heuristic_recover==0
1341
    all xids from commit_list are committed, others are rolled back
1342
    - manual (heuristic) recover
1343
    in this case commit_list==0, tc_heuristic_recover != 0
1344
    DBA has explicitly specified that all prepared transactions should
1345
    be committed (or rolled back).
1346
    - no recovery (MySQL did not detect a crash)
1347
    in this case commit_list==0, tc_heuristic_recover == 0
1348
    there should be no prepared transactions in this case.
1349
*/
1350
struct xarecover_st
1351
{
1352
  int len, found_foreign_xids, found_my_xids;
1353
  XID *list;
1354
  HASH *commit_list;
1355
  bool dry_run;
1356
};
1357
1358
/**
  plugin_foreach() callback: process the prepared transactions that
  one storage engine reports through recover().

  XIDs for which get_my_xid() returns 0 (generated by an external
  transaction manager) are inserted into the xid cache as XA_PREPARED.
  Own XIDs are only counted in a dry run; otherwise each one is
  committed when it is found in commit_list (or, without a commit_list,
  when tc_heuristic_recover is TC_HEURISTIC_RECOVER_COMMIT) and rolled
  back in every other case.

  @param unused  not used
  @param plugin  storage engine plugin being visited
  @param arg     pointer to the shared struct xarecover_st

  @return always FALSE, so the iteration visits every engine
*/
static my_bool xarecover_handlerton(THD *unused, plugin_ref plugin,
                                    void *arg)
{
  handlerton *hton= plugin_data(plugin, handlerton *);
  struct xarecover_st *info= (struct xarecover_st *) arg;

  if (hton->state != SHOW_OPTION_YES || !hton->recover)
    return FALSE;

  for (;;)
  {
    int got= hton->recover(hton, info->list, info->len);
    if (got <= 0)
      break;

    sql_print_information("Found %d prepared transaction(s) in %s",
                          got, ha_resolve_storage_engine_name(hton));

    for (int i= 0; i < got; i++)
    {
      XID *cur= info->list + i;
      my_xid x= cur->get_my_xid();

      if (!x) // not "mine" - that is generated by external TM
      {
#ifndef DBUG_OFF
        char buf[XIDDATASIZE*4+6]; // see xid_to_str
        sql_print_information("ignore xid %s", xid_to_str(buf, cur));
#endif
        xid_cache_insert(cur, XA_PREPARED);
        info->found_foreign_xids++;
        continue;
      }

      if (info->dry_run)
      {
        info->found_my_xids++;
        continue;
      }

      // recovery mode
      bool do_commit;
      if (info->commit_list)
        do_commit= hash_search(info->commit_list, (uchar *)&x, sizeof(x)) != 0;
      else
        do_commit= tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT;

      if (do_commit)
      {
#ifndef DBUG_OFF
        char buf[XIDDATASIZE*4+6]; // see xid_to_str
        sql_print_information("commit xid %s", xid_to_str(buf, cur));
#endif
        hton->commit_by_xid(hton, cur);
      }
      else
      {
#ifndef DBUG_OFF
        char buf[XIDDATASIZE*4+6]; // see xid_to_str
        sql_print_information("rollback xid %s",
                              xid_to_str(buf, cur));
#endif
        hton->rollback_by_xid(hton, cur);
      }
    }

    /* A short batch means the engine has nothing more to report. */
    if (got < info->len)
      break;
  }
  return FALSE;
}
1416
1417
/**
  Run the XA recover step over all storage engines.

  Allocates a buffer for the XIDs, lets every engine with a recover()
  hook report its prepared transactions (see xarecover_handlerton())
  and prints a summary of what was found.

  @param commit_list  hash of xids that must be committed, or 0 when
                      there is no such list

  @retval 0  ok, or nothing to do because no engine needs recovery
  @retval 1  out of memory, or own prepared transactions were found
             in a dry run
*/
int ha_recover(HASH *commit_list)
{
  struct xarecover_st info;
  size_t last_request= 0;   /* size in bytes of the last allocation attempt */
  DBUG_ENTER("ha_recover");
  info.found_foreign_xids= info.found_my_xids= 0;
  info.commit_list= commit_list;
  info.dry_run= (info.commit_list==0 && tc_heuristic_recover==0);
  info.list= NULL;

  /* commit_list and tc_heuristic_recover cannot be set both */
  DBUG_ASSERT(info.commit_list==0 || tc_heuristic_recover==0);
  /* if either is set, total_ha_2pc must be set too */
  DBUG_ASSERT(info.dry_run || total_ha_2pc>(ulong)opt_bin_log);

  if (total_ha_2pc <= (ulong)opt_bin_log)
    DBUG_RETURN(0);

  if (info.commit_list)
    sql_print_information("Starting crash recovery...");


#ifndef WILL_BE_DELETED_LATER

  /*
    for now, only InnoDB supports 2pc. It means we can always safely
    rollback all pending transactions, without risking inconsistent data
  */

  DBUG_ASSERT(total_ha_2pc == (ulong) opt_bin_log+1); // only InnoDB and binlog
  tc_heuristic_recover= TC_HEURISTIC_RECOVER_ROLLBACK; // forcing ROLLBACK
  info.dry_run=FALSE;
#endif


  /*
    Try progressively smaller buffers until one can be allocated.
    Leave the loop right after a successful allocation so that info.len
    still matches the size of the buffer; halving it once more would
    make the engines use only half of the allocated list.
  */
  for (info.len= MAX_XID_LIST_SIZE;
       info.len > MIN_XID_LIST_SIZE;
       info.len/= 2)
  {
    last_request= info.len * sizeof(XID);
    info.list= (XID *) my_malloc(last_request, MYF(0));
    if (info.list)
      break;
  }
  if (!info.list)
  {
    /*
      Report the size that was actually requested last.
      NOTE(review): the cast assumes the ER_OUTOFMEMORY text takes a
      %d argument - confirm against the error message file.
    */
    sql_print_error(ER(ER_OUTOFMEMORY), (int) last_request);
    DBUG_RETURN(1);
  }

  plugin_foreach(NULL, xarecover_handlerton,
                 MYSQL_STORAGE_ENGINE_PLUGIN, &info);

  my_free((uchar*)info.list, MYF(0));
  if (info.found_foreign_xids)
    sql_print_warning("Found %d prepared XA transactions",
                      info.found_foreign_xids);
  if (info.dry_run && info.found_my_xids)
  {
    sql_print_error("Found %d prepared transactions! It means that mysqld was "
                    "not shut down properly last time and critical recovery "
                    "information (last binlog or %s file) was manually deleted "
                    "after a crash. You have to start mysqld with "
                    "--tc-heuristic-recover switch to commit or rollback "
                    "pending transactions.",
                    info.found_my_xids, opt_tc_log_file);
    DBUG_RETURN(1);
  }
  if (info.commit_list)
    sql_print_information("Crash recovery finished.");
  DBUG_RETURN(0);
}
1484
1485
/**
1486
  return the list of XID's to a client, the same way SHOW commands do.
1487
1488
  @note
1489
    I didn't find in XA specs that an RM cannot return the same XID twice,
1490
    so mysql_xa_recover does not filter XID's to ensure uniqueness.
1491
    It can be easily fixed later, if necessary.
1492
*/
1493
bool mysql_xa_recover(THD *thd)
1494
{
1495
  List<Item> field_list;
1496
  Protocol *protocol= thd->protocol;
1497
  int i=0;
1498
  XID_STATE *xs;
1499
  DBUG_ENTER("mysql_xa_recover");
1500
1501
  field_list.push_back(new Item_int("formatID", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1502
  field_list.push_back(new Item_int("gtrid_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1503
  field_list.push_back(new Item_int("bqual_length", 0, MY_INT32_NUM_DECIMAL_DIGITS));
1504
  field_list.push_back(new Item_empty_string("data",XIDDATASIZE));
1505
1506
  if (protocol->send_fields(&field_list,
1507
                            Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
1508
    DBUG_RETURN(1);
1509
1510
  pthread_mutex_lock(&LOCK_xid_cache);
1511
  while ((xs= (XID_STATE*)hash_element(&xid_cache, i++)))
1512
  {
1513
    if (xs->xa_state==XA_PREPARED)
1514
    {
1515
      protocol->prepare_for_resend();
1516
      protocol->store_longlong((longlong)xs->xid.formatID, FALSE);
1517
      protocol->store_longlong((longlong)xs->xid.gtrid_length, FALSE);
1518
      protocol->store_longlong((longlong)xs->xid.bqual_length, FALSE);
1519
      protocol->store(xs->xid.data, xs->xid.gtrid_length+xs->xid.bqual_length,
1520
                      &my_charset_bin);
1521
      if (protocol->write())
1522
      {
1523
        pthread_mutex_unlock(&LOCK_xid_cache);
1524
        DBUG_RETURN(1);
1525
      }
1526
    }
1527
  }
1528
1529
  pthread_mutex_unlock(&LOCK_xid_cache);
1530
  my_eof(thd);
1531
  DBUG_RETURN(0);
1532
}
1533
1534
/**
1535
  @details
1536
  This function should be called when MySQL sends rows of a SELECT result set
1537
  or the EOF mark to the client. It releases a possible adaptive hash index
1538
  S-latch held by thd in InnoDB and also releases a possible InnoDB query
1539
  FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a thd to
1540
  keep them over several calls of the InnoDB handler interface when a join
1541
  is executed. But when we let the control to pass to the client they have
1542
  to be released because if the application program uses mysql_use_result(),
1543
  it may deadlock on the S-latch if the application on another connection
1544
  performs another SQL query. In MySQL-4.1 this is even more important because
1545
  there a connection can have several SELECT queries open at the same time.
1546
1547
  @param thd           the thread handle of the current connection
1548
1549
  @return
1550
    always 0
1551
*/
1552
static my_bool release_temporary_latches(THD *thd, plugin_ref plugin,
1553
                                 void *unused)
1554
{
1555
  handlerton *hton= plugin_data(plugin, handlerton *);
1556
1557
  if (hton->state == SHOW_OPTION_YES && hton->release_temporary_latches)
1558
    hton->release_temporary_latches(hton, thd);
1559
1560
  return FALSE;
1561
}
1562
1563
1564
/**
  Ask every storage engine to release the temporary latches it holds
  for the given connection (see release_temporary_latches()).

  @param thd  the thread handle of the current connection

  @return always 0
*/
int ha_release_temporary_latches(THD *thd)
{
  plugin_foreach(thd, release_temporary_latches,
                 MYSQL_STORAGE_ENGINE_PLUGIN, NULL);
  return 0;
}
1571
1572
/**
  Roll the current transaction back to a savepoint.

  Engines that were registered when the savepoint was set are rolled
  back to it; engines registered afterwards are rolled back completely
  and removed from the transaction.

  @param thd  connection whose transaction is rolled back; inside a
              sub-statement the statement transaction is used,
              otherwise the normal transaction
  @param sv   the savepoint to return to

  @return 0 on success, 1 if any engine reported an error
*/
int ha_rollback_to_savepoint(THD *thd, SAVEPOINT *sv)
{
  int error= 0;
  THD_TRANS *trans= thd->in_sub_stmt ? &thd->transaction.stmt
                                     : &thd->transaction.all;
  Ha_trx_info *engine;

  DBUG_ENTER("ha_rollback_to_savepoint");

  /* Recomputed below from the engines that stay in the transaction. */
  trans->no_2pc= 0;

  /*
    Roll back to the savepoint in all storage engines that were part
    of the transaction when the savepoint was set.
  */
  engine= sv->ha_list;
  while (engine)
  {
    handlerton *ht= engine->ht();
    DBUG_ASSERT(ht);
    DBUG_ASSERT(ht->savepoint_set != 0);
    int err= ht->savepoint_rollback(ht, thd,
                                    (uchar *)(sv+1)+ht->savepoint_offset);
    if (err)
    { // cannot happen
      my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
      error= 1;
    }
    status_var_increment(thd->status_var.ha_savepoint_rollback_count);
    trans->no_2pc|= ht->prepare == 0;
    engine= engine->next();
  }

  /*
    Roll back the whole transaction in all storage engines that were
    not part of the transaction when the savepoint was set.
  */
  engine= trans->ha_list;
  while (engine != sv->ha_list)
  {
    handlerton *ht= engine->ht();
    int err= ht->rollback(ht, thd, !thd->in_sub_stmt);
    if (err)
    { // cannot happen
      my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
      error= 1;
    }
    status_var_increment(thd->status_var.ha_rollback_count);
    Ha_trx_info *following= engine->next();
    engine->reset(); /* keep it conveniently zero-filled */
    engine= following;
  }

  trans->ha_list= sv->ha_list;
  DBUG_RETURN(error);
}
1622
1623
/**
1624
  @note
1625
  according to the sql standard (ISO/IEC 9075-2:2003)
1626
  section "4.33.4 SQL-statements and transaction states",
1627
  SAVEPOINT is *not* transaction-initiating SQL-statement
1628
*/
1629
int ha_savepoint(THD *thd, SAVEPOINT *sv)
1630
{
1631
  int error=0;
1632
  THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
1633
                                        &thd->transaction.all);
1634
  Ha_trx_info *ha_info= trans->ha_list;
1635
  DBUG_ENTER("ha_savepoint");
1636
  for (; ha_info; ha_info= ha_info->next())
1637
  {
1638
    int err;
1639
    handlerton *ht= ha_info->ht();
1640
    DBUG_ASSERT(ht);
1641
    if (! ht->savepoint_set)
1642
    {
1643
      my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
1644
      error=1;
1645
      break;
1646
    }
1647
    if ((err= ht->savepoint_set(ht, thd, (uchar *)(sv+1)+ht->savepoint_offset)))
1648
    { // cannot happen
1649
      my_error(ER_GET_ERRNO, MYF(0), err);
1650
      error=1;
1651
    }
1652
    status_var_increment(thd->status_var.ha_savepoint_count);
1653
  }
1654
  /*
1655
    Remember the list of registered storage engines. All new
1656
    engines are prepended to the beginning of the list.
1657
  */
1658
  sv->ha_list= trans->ha_list;
1659
  DBUG_RETURN(error);
1660
}
1661
1662
/**
  Release a savepoint in every engine that was registered when the
  savepoint was set.

  Engines without a savepoint_release() hook are skipped.

  @param thd  connection owning the transaction
  @param sv   the savepoint to release

  @return 0 on success, 1 if any engine reported an error
*/
int ha_release_savepoint(THD *thd, SAVEPOINT *sv)
{
  int error= 0;
  DBUG_ENTER("ha_release_savepoint");

  for (Ha_trx_info *engine= sv->ha_list; engine; engine= engine->next())
  {
    handlerton *ht= engine->ht();
    /* Savepoint life time is enclosed into transaction life time. */
    DBUG_ASSERT(ht);

    if (ht->savepoint_release)
    {
      int err= ht->savepoint_release(ht, thd,
                                     (uchar *)(sv+1) + ht->savepoint_offset);
      if (err)
      { // cannot happen
        my_error(ER_GET_ERRNO, MYF(0), err);
        error= 1;
      }
    }
  }
  DBUG_RETURN(error);
}
1685
1686
1687
/**
  plugin_foreach() callback: start a consistent snapshot in one
  storage engine, if the engine is enabled and supports it.

  @param thd     connection for which the snapshot is started
  @param plugin  storage engine plugin being visited
  @param arg     pointer to a bool that is set to false once some
                 engine has started a snapshot

  @return always FALSE, so the iteration visits every engine
*/
static my_bool snapshot_handlerton(THD *thd, plugin_ref plugin,
                                   void *arg)
{
  handlerton *hton= plugin_data(plugin, handlerton *);

  if (hton->state != SHOW_OPTION_YES || !hton->start_consistent_snapshot)
    return FALSE;

  hton->start_consistent_snapshot(hton, thd);
  *((bool *)arg)= false;
  return FALSE;
}
1699
1700
/**
  Start a consistent snapshot in every storage engine that supports it.

  Pushes a warning if no engine was able to start one.

  @param thd  connection for which the snapshot is started

  @return always 0
*/
int ha_start_consistent_snapshot(THD *thd)
{
  /* Cleared by snapshot_handlerton() when an engine starts a snapshot. */
  bool warn= true;

  plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn);

  if (!warn)
    return 0;

  /*
    Same idea as when one wants to CREATE TABLE in one engine which does not
    exist:
  */
  push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
               "This MySQL server does not support any "
               "consistent-read capable storage engine");
  return 0;
}
1716
1717
1718
/**
  plugin_foreach() callback: flush the logs of one storage engine.

  @param thd     not used
  @param plugin  storage engine plugin being visited
  @param arg     not used

  @retval TRUE   the engine is enabled, has a flush_logs() hook and
                 the hook returned non-zero
  @retval FALSE  otherwise
*/
static my_bool flush_handlerton(THD *thd, plugin_ref plugin,
                                void *arg)
{
  handlerton *hton= plugin_data(plugin, handlerton *);

  if (hton->state != SHOW_OPTION_YES)
    return FALSE;
  if (!hton->flush_logs)
    return FALSE;
  return hton->flush_logs(hton) ? TRUE : FALSE;
}
1727
1728
1729
/**
  Flush the logs of one storage engine, or of all of them.

  @param db_type  the engine to flush, or NULL to flush every engine

  @retval TRUE   the given engine is not enabled, or a flush_logs()
                 hook returned non-zero
  @retval FALSE  success
*/
bool ha_flush_logs(handlerton *db_type)
{
  if (db_type == NULL)
  {
    /* All engines: TRUE if the iteration reports a failure. */
    if (plugin_foreach(NULL, flush_handlerton,
                       MYSQL_STORAGE_ENGINE_PLUGIN, 0))
      return TRUE;
    return FALSE;
  }

  if (db_type->state != SHOW_OPTION_YES)
    return TRUE;
  if (db_type->flush_logs && db_type->flush_logs(db_type))
    return TRUE;
  return FALSE;
}
1745
1746
static const char *check_lowercase_names(handler *file, const char *path,
1747
                                         char *tmp_path)
1748
{
1749
  if (lower_case_table_names != 2 || (file->ha_table_flags() & HA_FILE_BASED))
1750
    return path;
1751
1752
  /* Ensure that table handler get path in lower case */
1753
  if (tmp_path != path)
1754
    strmov(tmp_path, path);
1755
1756
  /*
1757
    we only should turn into lowercase database/table part
1758
    so start the process after homedirectory
1759
  */
1760
  my_casedn_str(files_charset_info, tmp_path + mysql_data_home_len);
1761
  return tmp_path;
1762
}
1763
1764
1765
/**
1766
  An interceptor to hijack the text of the error message without
1767
  setting an error in the thread. We need the text to present it
1768
  in the form of a warning to the user.
1769
*/
1770
1771
struct Ha_delete_table_error_handler: public Internal_error_handler
1772
{
1773
public:
1774
  virtual bool handle_error(uint sql_errno,
1775
                            const char *message,
1776
                            MYSQL_ERROR::enum_warning_level level,
1777
                            THD *thd);
1778
  char buff[MYSQL_ERRMSG_SIZE];
1779
};
1780
1781
1782
bool
1783
Ha_delete_table_error_handler::
1784
handle_error(uint sql_errno,
1785
             const char *message,
1786
             MYSQL_ERROR::enum_warning_level level,
1787
             THD *thd)
1788
{
1789
  /* Grab the error message */
1790
  strmake(buff, message, sizeof(buff)-1);
1791
  return TRUE;
1792
}
1793
1794
1795
/**
1796
  This should return ENOENT if the file doesn't exists.
1797
  The .frm file will be deleted only if we return 0 or ENOENT
1798
*/
1799
int ha_delete_table(THD *thd, handlerton *table_type, const char *path,
1800
                    const char *db, const char *alias, bool generate_warning)
1801
{
1802
  handler *file;
1803
  char tmp_path[FN_REFLEN];
1804
  int error;
1805
  TABLE dummy_table;
1806
  TABLE_SHARE dummy_share;
1807
  DBUG_ENTER("ha_delete_table");
1808
1809
  bzero((char*) &dummy_table, sizeof(dummy_table));
1810
  bzero((char*) &dummy_share, sizeof(dummy_share));
1811
  dummy_table.s= &dummy_share;
1812
1813
  /* DB_TYPE_UNKNOWN is used in ALTER TABLE when renaming only .frm files */
1814
  if (table_type == NULL ||
1815
      ! (file=get_new_handler((TABLE_SHARE*)0, thd->mem_root, table_type)))
1816
    DBUG_RETURN(ENOENT);
1817
1818
  path= check_lowercase_names(file, path, tmp_path);
1819
  if ((error= file->ha_delete_table(path)) && generate_warning)
1820
  {
1821
    /*
1822
      Because file->print_error() use my_error() to generate the error message
1823
      we use an internal error handler to intercept it and store the text
1824
      in a temporary buffer. Later the message will be presented to user
1825
      as a warning.
1826
    */
1827
    Ha_delete_table_error_handler ha_delete_table_error_handler;
1828
1829
    /* Fill up strucutures that print_error may need */
1830
    dummy_share.path.str= (char*) path;
1831
    dummy_share.path.length= strlen(path);
1832
    dummy_share.db.str= (char*) db;
1833
    dummy_share.db.length= strlen(db);
1834
    dummy_share.table_name.str= (char*) alias;
1835
    dummy_share.table_name.length= strlen(alias);
1836
    dummy_table.alias= alias;
1837
1838
    file->change_table_ptr(&dummy_table, &dummy_share);
1839
1840
    thd->push_internal_handler(&ha_delete_table_error_handler);
1841
    file->print_error(error, 0);
1842
1843
    thd->pop_internal_handler();
1844
1845
    /*
1846
      XXX: should we convert *all* errors to warnings here?
1847
      What if the error is fatal?
1848
    */
1849
    push_warning(thd, MYSQL_ERROR::WARN_LEVEL_ERROR, error,
1850
                ha_delete_table_error_handler.buff);
1851
  }
1852
  delete file;
1853
  DBUG_RETURN(error);
1854
}
1855
1856
/****************************************************************************
1857
** General handler functions
1858
****************************************************************************/
1859
/**
  Create a second, opened handler for the same table.

  @param mem_root  memory root used for the new handler and for its ref
                   buffer

  @return the opened clone, or NULL if the handler could not be created,
          the ref buffer could not be allocated, or ha_open() failed
*/
handler *handler::clone(MEM_ROOT *mem_root)
{
  handler *new_handler= get_new_handler(table->s, mem_root, table->s->db_type());

  /* Check the result before it is dereferenced below. */
  if (!new_handler)
    return NULL;

  /*
    Allocate handler->ref here because otherwise ha_open will allocate it
    on this->table->mem_root and we will not be able to reclaim that memory
    when the clone handler object is destroyed.
  */
  if (!(new_handler->ref= (uchar*) alloc_root(mem_root, ALIGN_SIZE(ref_length)*2)))
    return NULL;

  if (new_handler->ha_open(table,
                           table->s->normalized_path.str,
                           table->db_stat,
                           HA_OPEN_IGNORE_IF_LOCKED))
    return NULL;

  return new_handler;
}
1876
1877
1878
1879
/**
  Increment one status counter of the thread that uses this handler's table.

  @param offset  pointer-to-member selecting the counter inside SSV
*/
void handler::ha_statistic_increment(ulong SSV::*offset) const
{
  THD *owner= table->in_use;

  status_var_increment(owner->status_var.*offset);
}
1883
1884
/**
  Return what thd_ha_data() yields for the given thread and this handler's
  handlerton.
*/
void **handler::ha_data(THD *thd) const
{
  void **slot= thd_ha_data(thd, ht);

  return slot;
}
1888
1889
/**
  Return the thread associated with this handler: table->in_use when a
  table with a non-NULL in_use is attached, current_thd otherwise.
*/
THD *handler::ha_thd(void) const
{
  DBUG_ASSERT(!table || !table->in_use || table->in_use == current_thd);

  if (table && table->in_use)
    return table->in_use;
  return current_thd;
}
1894
1895
1896
/**
1897
   Get tablespace name from handler 
1898
   Returns the tablespace name associated
1899
   with the table or NULL if not defined
1900
*/
1901
const 
1902
char* handler::get_tablespace_name()
1903
{
1904
  return table->s->tablespace;
1905
}
1906
1907
/**
1908
  Open database-handler.
1909
1910
  Try O_RDONLY if cannot open as O_RDWR
1911
  Don't wait for locks if not HA_OPEN_WAIT_IF_LOCKED is set
1912
*/
1913
int handler::ha_open(TABLE *table_arg, const char *name, int mode,
1914
                     int test_if_locked)
1915
{
1916
  int error;
1917
  DBUG_ENTER("handler::ha_open");
1918
  DBUG_PRINT("enter",
1919
             ("name: %s  db_type: %d  db_stat: %d  mode: %d  lock_test: %d",
1920
              name, ht->db_type, table_arg->db_stat, mode,
1921
              test_if_locked));
1922
1923
  table= table_arg;
1924
  DBUG_ASSERT(table->s == table_share);
1925
  DBUG_ASSERT(alloc_root_inited(&table->mem_root));
1926
1927
  if ((error=open(name,mode,test_if_locked)))
1928
  {
1929
    if ((error == EACCES || error == EROFS) && mode == O_RDWR &&
1930
	(table->db_stat & HA_TRY_READ_ONLY))
1931
    {
1932
      table->db_stat|=HA_READ_ONLY;
1933
      error=open(name,O_RDONLY,test_if_locked);
1934
    }
1935
  }
1936
  if (error)
1937
  {
1938
    my_errno= error;                            /* Safeguard */
1939
    DBUG_PRINT("error",("error: %d  errno: %d",error,errno));
1940
  }
1941
  else
1942
  {
1943
    if (table->s->db_options_in_use & HA_OPTION_READ_ONLY_DATA)
1944
      table->db_stat|=HA_READ_ONLY;
1945
    (void) extra(HA_EXTRA_NO_READCHECK);	// Not needed in SQL
1946
1947
    /* ref is already allocated for us if we're called from handler::clone() */
1948
    if (!ref && !(ref= (uchar*) alloc_root(&table->mem_root, 
1949
                                          ALIGN_SIZE(ref_length)*2)))
1950
    {
1951
      close();
1952
      error=HA_ERR_OUT_OF_MEM;
1953
    }
1954
    else
1955
      dup_ref=ref+ALIGN_SIZE(ref_length);
1956
    cached_table_flags= table_flags();
1957
  }
1958
  DBUG_RETURN(error);
1959
}
1960
1961
/**
1962
  one has to use this method when to find
1963
  random position by record as the plain
1964
  position() call doesn't work for some
1965
  handlers for random position
1966
*/
1967
1968
int handler::rnd_pos_by_record(uchar *record)
1969
{
1970
  register int error;
1971
  DBUG_ENTER("handler::rnd_pos_by_record");
1972
1973
  position(record);
1974
  if (inited && (error= ha_index_end()))
1975
    DBUG_RETURN(error);
1976
  if ((error= ha_rnd_init(FALSE)))
1977
    DBUG_RETURN(error);
1978
1979
  DBUG_RETURN(rnd_pos(record, ref));
1980
}
1981
1982
/**
1983
  Read first row (only) from a table.
1984
1985
  This is never called for InnoDB tables, as these table types
1986
  has the HA_STATS_RECORDS_IS_EXACT set.
1987
*/
1988
int handler::read_first_row(uchar * buf, uint primary_key)
1989
{
1990
  register int error;
1991
  DBUG_ENTER("handler::read_first_row");
1992
1993
  ha_statistic_increment(&SSV::ha_read_first_count);
1994
1995
  /*
1996
    If there is very few deleted rows in the table, find the first row by
1997
    scanning the table.
1998
    TODO remove the test for HA_READ_ORDER
1999
  */
2000
  if (stats.deleted < 10 || primary_key >= MAX_KEY ||
2001
      !(index_flags(primary_key, 0, 0) & HA_READ_ORDER))
2002
  {
2003
    (void) ha_rnd_init(1);
2004
    while ((error= rnd_next(buf)) == HA_ERR_RECORD_DELETED) ;
2005
    (void) ha_rnd_end();
2006
  }
2007
  else
2008
  {
2009
    /* Find the first row through the primary key */
2010
    (void) ha_index_init(primary_key, 0);
2011
    error=index_first(buf);
2012
    (void) ha_index_end();
2013
  }
2014
  DBUG_RETURN(error);
2015
}
2016
2017
/**
2018
  Generate the next auto-increment number based on increment and offset.
2019
  computes the lowest number
2020
  - strictly greater than "nr"
2021
  - of the form: auto_increment_offset + N * auto_increment_increment
2022
2023
  In most cases increment= offset= 1, in which case we get:
2024
  @verbatim 1,2,3,4,5,... @endverbatim
2025
    If increment=10 and offset=5 and previous number is 1, we get:
2026
  @verbatim 1,5,15,25,35,... @endverbatim
2027
*/
2028
inline uint64_t
2029
compute_next_insert_id(uint64_t nr,struct system_variables *variables)
2030
{
2031
  if (variables->auto_increment_increment == 1)
2032
    return (nr+1); // optimization of the formula below
2033
  nr= (((nr+ variables->auto_increment_increment -
2034
         variables->auto_increment_offset)) /
2035
       (uint64_t) variables->auto_increment_increment);
2036
  return (nr* (uint64_t) variables->auto_increment_increment +
2037
          variables->auto_increment_offset);
2038
}
2039
2040
2041
/**
  Move next_insert_id past an explicitly specified value.

  If next_insert_id was set previously and the explicitly specified value
  "nr" is at least as large, next_insert_id is advanced to the next
  sequence value greater than "nr".

  @param nr  explicitly specified auto-increment value
*/
void handler::adjust_next_insert_id_after_explicit_value(uint64_t nr)
{
  const bool already_generated= (next_insert_id > 0);

  if (already_generated && nr >= next_insert_id)
  {
    struct system_variables *vars= &table->in_use->variables;

    set_next_insert_id(compute_next_insert_id(nr, vars));
  }
}
2051
2052
2053
/**
2054
  Compute a previous insert id
2055
2056
  Computes the largest number X:
2057
  - smaller than or equal to "nr"
2058
  - of the form: auto_increment_offset + N * auto_increment_increment
2059
    where N>=0.
2060
2061
  @param nr            Number to "round down"
2062
  @param variables     variables struct containing auto_increment_increment and
2063
                       auto_increment_offset
2064
2065
  @return
2066
    The number X if it exists, "nr" otherwise.
2067
*/
2068
inline uint64_t
2069
prev_insert_id(uint64_t nr, struct system_variables *variables)
2070
{
2071
  if (unlikely(nr < variables->auto_increment_offset))
2072
  {
2073
    /*
2074
      There's nothing good we can do here. That is a pathological case, where
2075
      the offset is larger than the column's max possible value, i.e. not even
2076
      the first sequence value may be inserted. User will receive warning.
2077
    */
2078
    DBUG_PRINT("info",("auto_increment: nr: %lu cannot honour "
2079
                       "auto_increment_offset: %lu",
2080
                       (ulong) nr, variables->auto_increment_offset));
2081
    return nr;
2082
  }
2083
  if (variables->auto_increment_increment == 1)
2084
    return nr; // optimization of the formula below
2085
  nr= (((nr - variables->auto_increment_offset)) /
2086
       (uint64_t) variables->auto_increment_increment);
2087
  return (nr * (uint64_t) variables->auto_increment_increment +
2088
          variables->auto_increment_offset);
2089
}
2090
2091
2092
/**
2093
  Update the auto_increment field if necessary.
2094
2095
  Updates columns with type NEXT_NUMBER if:
2096
2097
  - If column value is set to NULL (in which case
2098
    auto_increment_field_not_null is 0)
2099
  - If column is set to 0 and (sql_mode & MODE_NO_AUTO_VALUE_ON_ZERO) is not
2100
    set. In the future we will only set NEXT_NUMBER fields if one sets them
2101
    to NULL (or they are not included in the insert list).
2102
2103
    In those cases, we check if the currently reserved interval still has
2104
    values we have not used. If yes, we pick the smallest one and use it.
2105
    Otherwise:
2106
2107
  - If a list of intervals has been provided to the statement via SET
2108
    INSERT_ID or via an Intvar_log_event (in a replication slave), we pick the
2109
    first unused interval from this list, consider it as reserved.
2110
2111
  - Otherwise we set the column for the first row to the value
2112
    next_insert_id(get_auto_increment(column))) which is usually
2113
    max-used-column-value+1.
2114
    We call get_auto_increment() for the first row in a multi-row
2115
    statement. get_auto_increment() will tell us the interval of values it
2116
    reserved for us.
2117
2118
  - In both cases, for the following rows we use those reserved values without
2119
    calling the handler again (we just progress in the interval, computing
2120
    each new value from the previous one). Until we have exhausted them, then
2121
    we either take the next provided interval or call get_auto_increment()
2122
    again to reserve a new interval.
2123
2124
  - In both cases, the reserved intervals are remembered in
2125
    thd->auto_inc_intervals_in_cur_stmt_for_binlog if statement-based
2126
    binlogging; the last reserved interval is remembered in
2127
    auto_inc_interval_for_cur_row.
2128
2129
    The idea is that generated auto_increment values are predictable and
2130
    independent of the column values in the table.  This is needed to be
2131
    able to replicate into a table that already has rows with a higher
2132
    auto-increment value than the one that is inserted.
2133
2134
    After we have already generated an auto-increment number and the user
2135
    inserts a column with a higher value than the last used one, we will
2136
    start counting from the inserted value.
2137
2138
    This function's "outputs" are: the table's auto_increment field is filled
2139
    with a value, thd->next_insert_id is filled with the value to use for the
2140
    next row, if a value was autogenerated for the current row it is stored in
2141
    thd->insert_id_for_cur_row, if get_auto_increment() was called
2142
    thd->auto_inc_interval_for_cur_row is modified, if that interval is not
2143
    present in thd->auto_inc_intervals_in_cur_stmt_for_binlog it is added to
2144
    this list.
2145
2146
  @todo
2147
    Replace all references to "next number" or NEXT_NUMBER to
2148
    "auto_increment", everywhere (see below: there is
2149
    table->auto_increment_field_not_null, and there also exists
2150
    table->next_number_field, it's not consistent).
2151
2152
  @retval
2153
    0	ok
2154
  @retval
2155
    HA_ERR_AUTOINC_READ_FAILED  get_auto_increment() was called and
2156
    returned ~(uint64_t) 0
2157
  @retval
2158
    HA_ERR_AUTOINC_ERANGE storing value in field caused strict mode
2159
    failure.
2160
*/
2161
2162
#define AUTO_INC_DEFAULT_NB_ROWS 1 // Some prefer 1024 here
2163
#define AUTO_INC_DEFAULT_NB_MAX_BITS 16
2164
#define AUTO_INC_DEFAULT_NB_MAX ((1 << AUTO_INC_DEFAULT_NB_MAX_BITS) - 1)
2165
2166
int handler::update_auto_increment()
2167
{
2168
  uint64_t nr, nb_reserved_values;
2169
  bool append= FALSE;
2170
  THD *thd= table->in_use;
2171
  struct system_variables *variables= &thd->variables;
2172
  DBUG_ENTER("handler::update_auto_increment");
2173
2174
  /*
2175
    next_insert_id is a "cursor" into the reserved interval, it may go greater
2176
    than the interval, but not smaller.
2177
  */
2178
  DBUG_ASSERT(next_insert_id >= auto_inc_interval_for_cur_row.minimum());
2179
2180
  if (((nr= table->next_number_field->val_int()) != 0) || 
2181
      (table->auto_increment_field_not_null && (thd->variables.sql_mode & MODE_NO_AUTO_VALUE_ON_ZERO)))
2182
  {
2183
    /*
2184
      Update next_insert_id if we had already generated a value in this
2185
      statement (case of INSERT VALUES(null),(3763),(null):
2186
      the last NULL needs to insert 3764, not the value of the first NULL plus
2187
      1).
2188
    */
2189
    adjust_next_insert_id_after_explicit_value(nr);
2190
    insert_id_for_cur_row= 0; // didn't generate anything
2191
    DBUG_RETURN(0);
2192
  }
2193
2194
  if ((nr= next_insert_id) >= auto_inc_interval_for_cur_row.maximum())
2195
  {
2196
    /* next_insert_id is beyond what is reserved, so we reserve more. */
2197
    const Discrete_interval *forced=
2198
      thd->auto_inc_intervals_forced.get_next();
2199
    if (forced != NULL)
2200
    {
2201
      nr= forced->minimum();
2202
      nb_reserved_values= forced->values();
2203
    }
2204
    else
2205
    {
2206
      /*
2207
        handler::estimation_rows_to_insert was set by
2208
        handler::ha_start_bulk_insert(); if 0 it means "unknown".
2209
      */
2210
      uint nb_already_reserved_intervals=
2211
        thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements();
2212
      uint64_t nb_desired_values;
2213
      /*
2214
        If an estimation was given to the engine:
2215
        - use it.
2216
        - if we already reserved numbers, it means the estimation was
2217
        not accurate, then we'll reserve 2*AUTO_INC_DEFAULT_NB_ROWS the 2nd
2218
        time, twice that the 3rd time etc.
2219
        If no estimation was given, use those increasing defaults from the
2220
        start, starting from AUTO_INC_DEFAULT_NB_ROWS.
2221
        Don't go beyond a max to not reserve "way too much" (because
2222
        reservation means potentially losing unused values).
2223
      */
2224
      if (nb_already_reserved_intervals == 0 &&
2225
          (estimation_rows_to_insert > 0))
2226
        nb_desired_values= estimation_rows_to_insert;
2227
      else /* go with the increasing defaults */
2228
      {
2229
        /* avoid overflow in formula, with this if() */
2230
        if (nb_already_reserved_intervals <= AUTO_INC_DEFAULT_NB_MAX_BITS)
2231
        {
2232
          nb_desired_values= AUTO_INC_DEFAULT_NB_ROWS * 
2233
            (1 << nb_already_reserved_intervals);
2234
          set_if_smaller(nb_desired_values, AUTO_INC_DEFAULT_NB_MAX);
2235
        }
2236
        else
2237
          nb_desired_values= AUTO_INC_DEFAULT_NB_MAX;
2238
      }
2239
      /* This call ignores all its parameters but nr, currently */
2240
      get_auto_increment(variables->auto_increment_offset,
2241
                         variables->auto_increment_increment,
2242
                         nb_desired_values, &nr,
2243
                         &nb_reserved_values);
2244
      if (nr == ~(uint64_t) 0)
2245
        DBUG_RETURN(HA_ERR_AUTOINC_READ_FAILED);  // Mark failure
2246
      
2247
      /*
2248
        That rounding below should not be needed when all engines actually
2249
        respect offset and increment in get_auto_increment(). But they don't
2250
        so we still do it. Wonder if for the not-first-in-index we should do
2251
        it. Hope that this rounding didn't push us out of the interval; even
2252
        if it did we cannot do anything about it (calling the engine again
2253
        will not help as we inserted no row).
2254
      */
2255
      nr= compute_next_insert_id(nr-1, variables);
2256
    }
2257
    
2258
    if (table->s->next_number_keypart == 0)
2259
    {
2260
      /* We must defer the appending until "nr" has been possibly truncated */
2261
      append= TRUE;
2262
    }
2263
    else
2264
    {
2265
      /*
2266
        For such auto_increment there is no notion of interval, just a
2267
        singleton. The interval is not even stored in
2268
        thd->auto_inc_interval_for_cur_row, so we are sure to call the engine
2269
        for next row.
2270
      */
2271
      DBUG_PRINT("info",("auto_increment: special not-first-in-index"));
2272
    }
2273
  }
2274
2275
  DBUG_PRINT("info",("auto_increment: %lu", (ulong) nr));
2276
2277
  if (unlikely(table->next_number_field->store((longlong) nr, TRUE)))
2278
  {
2279
    /*
2280
      first test if the query was aborted due to strict mode constraints
2281
    */
2282
    if (thd->killed == THD::KILL_BAD_DATA)
2283
      DBUG_RETURN(HA_ERR_AUTOINC_ERANGE);
2284
2285
    /*
2286
      field refused this value (overflow) and truncated it, use the result of
2287
      the truncation (which is going to be inserted); however we try to
2288
      decrease it to honour auto_increment_* variables.
2289
      That will shift the left bound of the reserved interval, we don't
2290
      bother shifting the right bound (anyway any other value from this
2291
      interval will cause a duplicate key).
2292
    */
2293
    nr= prev_insert_id(table->next_number_field->val_int(), variables);
2294
    if (unlikely(table->next_number_field->store((longlong) nr, TRUE)))
2295
      nr= table->next_number_field->val_int();
2296
  }
2297
  if (append)
2298
  {
2299
    auto_inc_interval_for_cur_row.replace(nr, nb_reserved_values,
2300
                                          variables->auto_increment_increment);
2301
    /* Row-based replication does not need to store intervals in binlog */
2302
    if (!thd->current_stmt_binlog_row_based)
2303
        thd->auto_inc_intervals_in_cur_stmt_for_binlog.append(auto_inc_interval_for_cur_row.minimum(),
2304
                                                              auto_inc_interval_for_cur_row.values(),
2305
                                                              variables->auto_increment_increment);
2306
  }
2307
2308
  /*
2309
    Record this autogenerated value. If the caller then
2310
    succeeds to insert this value, it will call
2311
    record_first_successful_insert_id_in_cur_stmt()
2312
    which will set first_successful_insert_id_in_cur_stmt if it's not
2313
    already set.
2314
  */
2315
  insert_id_for_cur_row= nr;
2316
  /*
2317
    Set next insert id to point to next auto-increment value to be able to
2318
    handle multi-row statements.
2319
  */
2320
  set_next_insert_id(compute_next_insert_id(nr, variables));
2321
2322
  DBUG_RETURN(0);
2323
}
2324
2325
2326
/**
2327
  MySQL signal that it changed the column bitmap
2328
2329
  This is for handlers that needs to setup their own column bitmaps.
2330
  Normally the handler should set up their own column bitmaps in
2331
  index_init() or rnd_init() and in any column_bitmaps_signal() call after
2332
  this.
2333
2334
  The handler is allowed to do changes to the bitmap after a index_init or
2335
  rnd_init() call is made as after this, MySQL will not use the bitmap
2336
  for any program logic checking.
2337
*/
2338
void handler::column_bitmaps_signal()
2339
{
2340
  DBUG_ENTER("column_bitmaps_signal");
2341
  DBUG_PRINT("info", ("read_set: 0x%lx  write_set: 0x%lx", (long) table->read_set,
2342
                      (long) table->write_set));
2343
  DBUG_VOID_RETURN;
2344
}
2345
2346
2347
/**
2348
  Reserves an interval of auto_increment values from the handler.
2349
2350
  offset and increment means that we want values to be of the form
2351
  offset + N * increment, where N>=0 is integer.
2352
  If the function sets *first_value to ~(uint64_t)0 it means an error.
2353
  If the function sets *nb_reserved_values to ULONGLONG_MAX it means it has
2354
  reserved to "positive infinite".
2355
2356
  @param offset
2357
  @param increment
2358
  @param nb_desired_values   how many values we want
2359
  @param first_value         (OUT) the first value reserved by the handler
2360
  @param nb_reserved_values  (OUT) how many values the handler reserved
2361
*/
2362
void handler::get_auto_increment(uint64_t offset, uint64_t increment,
2363
                                 uint64_t nb_desired_values,
2364
                                 uint64_t *first_value,
2365
                                 uint64_t *nb_reserved_values)
2366
{
2367
  uint64_t nr;
2368
  int error;
2369
2370
  (void) extra(HA_EXTRA_KEYREAD);
2371
  table->mark_columns_used_by_index_no_reset(table->s->next_number_index,
2372
                                        table->read_set);
2373
  column_bitmaps_signal();
2374
  index_init(table->s->next_number_index, 1);
2375
  if (table->s->next_number_keypart == 0)
2376
  {						// Autoincrement at key-start
2377
    error=index_last(table->record[1]);
2378
    /*
2379
      MySQL implicitely assumes such method does locking (as MySQL decides to
2380
      use nr+increment without checking again with the handler, in
2381
      handler::update_auto_increment()), so reserves to infinite.
2382
    */
2383
    *nb_reserved_values= ULONGLONG_MAX;
2384
  }
2385
  else
2386
  {
2387
    uchar key[MAX_KEY_LENGTH];
2388
    key_copy(key, table->record[0],
2389
             table->key_info + table->s->next_number_index,
2390
             table->s->next_number_key_offset);
2391
    error= index_read_map(table->record[1], key,
2392
                          make_prev_keypart_map(table->s->next_number_keypart),
2393
                          HA_READ_PREFIX_LAST);
2394
    /*
2395
      MySQL needs to call us for next row: assume we are inserting ("a",null)
2396
      here, we return 3, and next this statement will want to insert
2397
      ("b",null): there is no reason why ("b",3+1) would be the good row to
2398
      insert: maybe it already exists, maybe 3+1 is too large...
2399
    */
2400
    *nb_reserved_values= 1;
2401
  }
2402
2403
  if (error)
2404
    nr=1;
2405
  else
2406
    nr= ((uint64_t) table->next_number_field->
2407
         val_int_offset(table->s->rec_buff_length)+1);
2408
  index_end();
2409
  (void) extra(HA_EXTRA_NO_KEYREAD);
2410
  *first_value= nr;
2411
}
2412
2413
2414
void handler::ha_release_auto_increment()
2415
{
2416
  release_auto_increment();
2417
  insert_id_for_cur_row= 0;
2418
  auto_inc_interval_for_cur_row.replace(0, 0, 0);
2419
  if (next_insert_id > 0)
2420
  {
2421
    next_insert_id= 0;
2422
    /*
2423
      this statement used forced auto_increment values if there were some,
2424
      wipe them away for other statements.
2425
    */
2426
    table->in_use->auto_inc_intervals_forced.empty();
2427
  }
2428
}
2429
2430
2431
/**
  Report a duplicate-key error, including the duplicated key value.

  @param key_nr  number of the duplicated key; MAX_KEY means the key is
                 unknown
  @param msg     format string for the message, also used to limit the
                 length of the printed key value
*/
void handler::print_keydup_error(uint key_nr, const char *msg)
{
  /* Write the duplicated key in the error message */
  char key_buffer[MAX_KEY_LENGTH];
  String key_text(key_buffer,sizeof(key_buffer),system_charset_info);

  if (key_nr == MAX_KEY)
  {
    /* Key is unknown */
    key_text.copy("", 0, system_charset_info);
    my_printf_error(ER_DUP_ENTRY, msg, MYF(0), key_text.c_ptr(), "*UNKNOWN*");
    return;
  }

  /* Table is opened and defined at this point */
  key_unpack(&key_text,table,(uint) key_nr);

  /* Shorten the key value so that the message fits the error buffer. */
  uint room=MYSQL_ERRMSG_SIZE-(uint) strlen(msg);
  if (key_text.length() >= room)
  {
    key_text.length(room-4);
    key_text.append(STRING_WITH_LEN("..."));
  }
  my_printf_error(ER_DUP_ENTRY, msg,
                  MYF(0), key_text.c_ptr(), table->key_info[key_nr].name);
}
2457
2458
2459
/**
2460
  Print error that we got from handler function.
2461
2462
  @note
2463
    In case of delete table it's only safe to use the following parts of
2464
    the 'table' structure:
2465
    - table->s->path
2466
    - table->alias
2467
*/
2468
void handler::print_error(int error, myf errflag)
2469
{
2470
  DBUG_ENTER("handler::print_error");
2471
  DBUG_PRINT("enter",("error: %d",error));
2472
2473
  int textno=ER_GET_ERRNO;
2474
  switch (error) {
2475
  case EACCES:
2476
    textno=ER_OPEN_AS_READONLY;
2477
    break;
2478
  case EAGAIN:
2479
    textno=ER_FILE_USED;
2480
    break;
2481
  case ENOENT:
2482
    textno=ER_FILE_NOT_FOUND;
2483
    break;
2484
  case HA_ERR_KEY_NOT_FOUND:
2485
  case HA_ERR_NO_ACTIVE_RECORD:
2486
  case HA_ERR_END_OF_FILE:
2487
    textno=ER_KEY_NOT_FOUND;
2488
    break;
2489
  case HA_ERR_WRONG_MRG_TABLE_DEF:
2490
    textno=ER_WRONG_MRG_TABLE;
2491
    break;
2492
  case HA_ERR_FOUND_DUPP_KEY:
2493
  {
2494
    uint key_nr=get_dup_key(error);
2495
    if ((int) key_nr >= 0)
2496
    {
2497
      print_keydup_error(key_nr, ER(ER_DUP_ENTRY_WITH_KEY_NAME));
2498
      DBUG_VOID_RETURN;
2499
    }
2500
    textno=ER_DUP_KEY;
2501
    break;
2502
  }
2503
  case HA_ERR_FOREIGN_DUPLICATE_KEY:
2504
  {
2505
    uint key_nr= get_dup_key(error);
2506
    if ((int) key_nr >= 0)
2507
    {
2508
      uint max_length;
2509
      /* Write the key in the error message */
2510
      char key[MAX_KEY_LENGTH];
2511
      String str(key,sizeof(key),system_charset_info);
2512
      /* Table is opened and defined at this point */
2513
      key_unpack(&str,table,(uint) key_nr);
2514
      max_length= (MYSQL_ERRMSG_SIZE-
2515
                   (uint) strlen(ER(ER_FOREIGN_DUPLICATE_KEY)));
2516
      if (str.length() >= max_length)
2517
      {
2518
        str.length(max_length-4);
2519
        str.append(STRING_WITH_LEN("..."));
2520
      }
2521
      my_error(ER_FOREIGN_DUPLICATE_KEY, MYF(0), table_share->table_name.str,
2522
        str.c_ptr(), key_nr+1);
2523
      DBUG_VOID_RETURN;
2524
    }
2525
    textno= ER_DUP_KEY;
2526
    break;
2527
  }
2528
  case HA_ERR_FOUND_DUPP_UNIQUE:
2529
    textno=ER_DUP_UNIQUE;
2530
    break;
2531
  case HA_ERR_RECORD_CHANGED:
2532
    textno=ER_CHECKREAD;
2533
    break;
2534
  case HA_ERR_CRASHED:
2535
    textno=ER_NOT_KEYFILE;
2536
    break;
2537
  case HA_ERR_WRONG_IN_RECORD:
2538
    textno= ER_CRASHED_ON_USAGE;
2539
    break;
2540
  case HA_ERR_CRASHED_ON_USAGE:
2541
    textno=ER_CRASHED_ON_USAGE;
2542
    break;
2543
  case HA_ERR_NOT_A_TABLE:
2544
    textno= error;
2545
    break;
2546
  case HA_ERR_CRASHED_ON_REPAIR:
2547
    textno=ER_CRASHED_ON_REPAIR;
2548
    break;
2549
  case HA_ERR_OUT_OF_MEM:
2550
    textno=ER_OUT_OF_RESOURCES;
2551
    break;
2552
  case HA_ERR_WRONG_COMMAND:
2553
    textno=ER_ILLEGAL_HA;
2554
    break;
2555
  case HA_ERR_OLD_FILE:
2556
    textno=ER_OLD_KEYFILE;
2557
    break;
2558
  case HA_ERR_UNSUPPORTED:
2559
    textno=ER_UNSUPPORTED_EXTENSION;
2560
    break;
2561
  case HA_ERR_RECORD_FILE_FULL:
2562
  case HA_ERR_INDEX_FILE_FULL:
2563
    textno=ER_RECORD_FILE_FULL;
2564
    break;
2565
  case HA_ERR_LOCK_WAIT_TIMEOUT:
2566
    textno=ER_LOCK_WAIT_TIMEOUT;
2567
    break;
2568
  case HA_ERR_LOCK_TABLE_FULL:
2569
    textno=ER_LOCK_TABLE_FULL;
2570
    break;
2571
  case HA_ERR_LOCK_DEADLOCK:
2572
    textno=ER_LOCK_DEADLOCK;
2573
    break;
2574
  case HA_ERR_READ_ONLY_TRANSACTION:
2575
    textno=ER_READ_ONLY_TRANSACTION;
2576
    break;
2577
  case HA_ERR_CANNOT_ADD_FOREIGN:
2578
    textno=ER_CANNOT_ADD_FOREIGN;
2579
    break;
2580
  case HA_ERR_ROW_IS_REFERENCED:
2581
  {
2582
    String str;
2583
    get_error_message(error, &str);
2584
    my_error(ER_ROW_IS_REFERENCED_2, MYF(0), str.c_ptr_safe());
2585
    DBUG_VOID_RETURN;
2586
  }
2587
  case HA_ERR_NO_REFERENCED_ROW:
2588
  {
2589
    String str;
2590
    get_error_message(error, &str);
2591
    my_error(ER_NO_REFERENCED_ROW_2, MYF(0), str.c_ptr_safe());
2592
    DBUG_VOID_RETURN;
2593
  }
2594
  case HA_ERR_TABLE_DEF_CHANGED:
2595
    textno=ER_TABLE_DEF_CHANGED;
2596
    break;
2597
  case HA_ERR_NO_SUCH_TABLE:
2598
    my_error(ER_NO_SUCH_TABLE, MYF(0), table_share->db.str,
2599
             table_share->table_name.str);
2600
    DBUG_VOID_RETURN;
2601
  case HA_ERR_RBR_LOGGING_FAILED:
2602
    textno= ER_BINLOG_ROW_LOGGING_FAILED;
2603
    break;
2604
  case HA_ERR_DROP_INDEX_FK:
2605
  {
2606
    const char *ptr= "???";
2607
    uint key_nr= get_dup_key(error);
2608
    if ((int) key_nr >= 0)
2609
      ptr= table->key_info[key_nr].name;
2610
    my_error(ER_DROP_INDEX_FK, MYF(0), ptr);
2611
    DBUG_VOID_RETURN;
2612
  }
2613
  case HA_ERR_TABLE_NEEDS_UPGRADE:
2614
    textno=ER_TABLE_NEEDS_UPGRADE;
2615
    break;
2616
  case HA_ERR_TABLE_READONLY:
2617
    textno= ER_OPEN_AS_READONLY;
2618
    break;
2619
  case HA_ERR_AUTOINC_READ_FAILED:
2620
    textno= ER_AUTOINC_READ_FAILED;
2621
    break;
2622
  case HA_ERR_AUTOINC_ERANGE:
2623
    textno= ER_WARN_DATA_OUT_OF_RANGE;
2624
    break;
2625
  case HA_ERR_LOCK_OR_ACTIVE_TRANSACTION:
2626
    my_message(ER_LOCK_OR_ACTIVE_TRANSACTION,
2627
               ER(ER_LOCK_OR_ACTIVE_TRANSACTION), MYF(0));
2628
    DBUG_VOID_RETURN;
2629
    break;
2630
  default:
2631
    {
2632
      /* The error was "unknown" to this function.
2633
	 Ask handler if it has got a message for this error */
2634
      bool temporary= FALSE;
2635
      String str;
2636
      temporary= get_error_message(error, &str);
2637
      if (!str.is_empty())
2638
      {
2639
	const char* engine= table_type();
2640
	if (temporary)
2641
	  my_error(ER_GET_TEMPORARY_ERRMSG, MYF(0), error, str.ptr(), engine);
2642
	else
2643
	  my_error(ER_GET_ERRMSG, MYF(0), error, str.ptr(), engine);
2644
      }
2645
      else
2646
	my_error(ER_GET_ERRNO,errflag,error);
2647
      DBUG_VOID_RETURN;
2648
    }
2649
  }
2650
  my_error(textno, errflag, table_share->table_name.str, error);
2651
  DBUG_VOID_RETURN;
2652
}
2653
2654
2655
/**
2656
  Return an error message specific to this handler.
2657
2658
  @param error  error code previously returned by handler
2659
  @param buf    pointer to String where to add error message
2660
2661
  @return
2662
    Returns true if this is a temporary error
2663
*/
2664
bool handler::get_error_message(int error, String* buf)
2665
{
2666
  return FALSE;
2667
}
2668
2669
2670
/*
  Decide whether the table needs a CHECK before it can be used.

  For tables whose share carries no version stamp (mysql_version == 0) every
  key part is inspected; a key part built on a BLOB field makes the function
  return HA_ADMIN_NEEDS_CHECK (raising check_opt->flags to T_MEDIUM when the
  call was made FOR UPGRADE). Otherwise the decision is left to the engine's
  check_for_upgrade().
*/
int handler::ha_check_for_upgrade(HA_CHECK_OPT *check_opt)
{
  /* Tables carrying a version stamp skip the blob-in-key scan. */
  if (table->s->mysql_version)
    return check_for_upgrade(check_opt);

  KEY *const last_key= table->key_info + table->s->keys;
  for (KEY *cur_key= table->key_info; cur_key < last_key; cur_key++)
  {
    KEY_PART_INFO *const last_part= cur_key->key_part + cur_key->key_parts;
    for (KEY_PART_INFO *part= cur_key->key_part; part < last_part; part++)
    {
      /* fieldnr == 0: no field to look up for this key part */
      if (part->fieldnr &&
          table->field[part->fieldnr - 1]->type() == MYSQL_TYPE_BLOB)
      {
        if (check_opt->sql_flags & TT_FOR_UPGRADE)
          check_opt->flags= T_MEDIUM;
        return HA_ADMIN_NEEDS_CHECK;
      }
    }
  }
  return check_for_upgrade(check_opt);
}
2700
2701
2702
/* Code left, but Drizzle has no legacy yet (while MySQL did) */
2703
int handler::check_old_types()
2704
{
2705
  return 0;
2706
}
2707
2708
2709
/*
  Stamp the table's frm file with the running server's version.

  Writes MYSQL_VERSION_ID (4 bytes) at offset 51 of the frm file and, once
  the write succeeded, updates mysql_version on every open_cache entry that
  shares the table's cache key.

  @param table  table whose definition file should be updated

  @retval false  success, or nothing had to be done
  @retval true   the frm file could not be opened or written
*/
static bool update_frm_version(TABLE *table)
{
  char path[FN_REFLEN];
  File file;
  bool result= true;                            // Failure until proven otherwise
  DBUG_ENTER("update_frm_version");

  /*
    No need to update frm version in case table was created or checked
    by server with the same version. This also ensures that we do not
    update frm version for temporary tables as this code doesn't support
    temporary tables.
  */
  if (table->s->mysql_version == MYSQL_VERSION_ID)
    DBUG_RETURN(false);

  strxmov(path, table->s->normalized_path.str, reg_ext, NullS);

  if ((file= my_open(path, O_RDWR|O_BINARY, MYF(MY_WME))) >= 0)
  {
    uchar version[4];
    char *key= table->s->table_cache_key.str;
    uint key_length= table->s->table_cache_key.length;
    TABLE *entry;
    HASH_SEARCH_STATE state;

    int4store(version, MYSQL_VERSION_ID);

    /*
      pwrite() returns the number of bytes written, or -1 on error.
      Anything other than the full 4 bytes means the stamp was not stored.
    */
    if (pwrite(file, (uchar*)version, 4, 51L) != 4)
      goto err;

    for (entry=(TABLE*) hash_first(&open_cache,(uchar*) key,key_length, &state);
         entry;
         entry= (TABLE*) hash_next(&open_cache,(uchar*) key,key_length, &state))
      entry->s->mysql_version= MYSQL_VERSION_ID;

    result= false;                              // File and cache are up to date
  }
err:
  if (file >= 0)
    VOID(my_close(file,MYF(MY_WME)));
  DBUG_RETURN(result);
}
2753
2754
2755
2756
/**
2757
  @return
2758
    key if error because of duplicated keys
2759
*/
2760
uint handler::get_dup_key(int error)
2761
{
2762
  DBUG_ENTER("handler::get_dup_key");
2763
  table->file->errkey  = (uint) -1;
2764
  if (error == HA_ERR_FOUND_DUPP_KEY || error == HA_ERR_FOREIGN_DUPLICATE_KEY ||
2765
      error == HA_ERR_FOUND_DUPP_UNIQUE ||
2766
      error == HA_ERR_DROP_INDEX_FK)
2767
    info(HA_STATUS_ERRKEY | HA_STATUS_NO_LOCK);
2768
  DBUG_RETURN(table->file->errkey);
2769
}
2770
2771
2772
/**
2773
  Delete all files with extension from bas_ext().
2774
2775
  @param name		Base name of table
2776
2777
  @note
2778
    We assume that the handler may return more extensions than
2779
    was actually used for the file.
2780
2781
  @retval
2782
    0   If we successfully deleted at least one file from base_ext and
2783
    didn't get any other errors than ENOENT
2784
  @retval
2785
    !0  Error
2786
*/
2787
int handler::delete_table(const char *name)
2788
{
2789
  int error= 0;
2790
  int enoent_or_zero= ENOENT;                   // Error if no file was deleted
2791
  char buff[FN_REFLEN];
2792
2793
  for (const char **ext=bas_ext(); *ext ; ext++)
2794
  {
2795
    fn_format(buff, name, "", *ext, MY_UNPACK_FILENAME|MY_APPEND_EXT);
2796
    if (my_delete_with_symlink(buff, MYF(0)))
2797
    {
2798
      if ((error= my_errno) != ENOENT)
2799
	break;
2800
    }
2801
    else
2802
      enoent_or_zero= 0;                        // No error for ENOENT
2803
    error= enoent_or_zero;
2804
  }
2805
  return error;
2806
}
2807
2808
2809
int handler::rename_table(const char * from, const char * to)
2810
{
2811
  int error= 0;
2812
  for (const char **ext= bas_ext(); *ext ; ext++)
2813
  {
2814
    if (rename_file_ext(from, to, *ext))
2815
    {
2816
      if ((error=my_errno) != ENOENT)
2817
	break;
2818
      error= 0;
2819
    }
2820
  }
2821
  return error;
2822
}
2823
2824
2825
void handler::drop_table(const char *name)
2826
{
2827
  close();
2828
  delete_table(name);
2829
}
2830
2831
2832
/**
2833
  Performs checks upon the table.
2834
2835
  @param thd                thread doing CHECK TABLE operation
2836
  @param check_opt          options from the parser
2837
2838
  @retval
2839
    HA_ADMIN_OK               Successful upgrade
2840
  @retval
2841
    HA_ADMIN_NEEDS_UPGRADE    Table has structures requiring upgrade
2842
  @retval
2843
    HA_ADMIN_NEEDS_ALTER      Table has structures requiring ALTER TABLE
2844
  @retval
2845
    HA_ADMIN_NOT_IMPLEMENTED
2846
*/
2847
int handler::ha_check(THD *thd, HA_CHECK_OPT *check_opt)
2848
{
2849
  int error;
2850
2851
  if ((table->s->mysql_version >= MYSQL_VERSION_ID) &&
2852
      (check_opt->sql_flags & TT_FOR_UPGRADE))
2853
    return 0;
2854
2855
  if (table->s->mysql_version < MYSQL_VERSION_ID)
2856
  {
2857
    if ((error= check_old_types()))
2858
      return error;
2859
    error= ha_check_for_upgrade(check_opt);
2860
    if (error && (error != HA_ADMIN_NEEDS_CHECK))
2861
      return error;
2862
    if (!error && (check_opt->sql_flags & TT_FOR_UPGRADE))
2863
      return 0;
2864
  }
2865
  if ((error= check(thd, check_opt)))
2866
    return error;
2867
  return update_frm_version(table);
2868
}
2869
2870
/**
2871
  A helper function to mark a transaction read-write,
2872
  if it is started.
2873
*/
2874
2875
inline
2876
void
2877
handler::mark_trx_read_write()
2878
{
2879
  Ha_trx_info *ha_info= &ha_thd()->ha_data[ht->slot].ha_info[0];
2880
  /*
2881
    When a storage engine method is called, the transaction must
2882
    have been started, unless it's a DDL call, for which the
2883
    storage engine starts the transaction internally, and commits
2884
    it internally, without registering in the ha_list.
2885
    Unfortunately here we can't know know for sure if the engine
2886
    has registered the transaction or not, so we must check.
2887
  */
2888
  if (ha_info->is_started())
2889
  {
2890
    DBUG_ASSERT(has_transactions());
2891
    /*
2892
      table_share can be NULL in ha_delete_table(). See implementation
2893
      of standalone function ha_delete_table() in sql_base.cc.
2894
    */
2895
    if (table_share == NULL || table_share->tmp_table == NO_TMP_TABLE)
2896
      ha_info->set_trx_read_write();
2897
  }
2898
}
2899
2900
2901
/**
2902
  Repair table: public interface.
2903
2904
  @sa handler::repair()
2905
*/
2906
2907
int handler::ha_repair(THD* thd, HA_CHECK_OPT* check_opt)
2908
{
2909
  int result;
2910
2911
  mark_trx_read_write();
2912
2913
  if ((result= repair(thd, check_opt)))
2914
    return result;
2915
  return update_frm_version(table);
2916
}
2917
2918
2919
/**
2920
  Bulk update row: public interface.
2921
2922
  @sa handler::bulk_update_row()
2923
*/
2924
2925
int
2926
handler::ha_bulk_update_row(const uchar *old_data, uchar *new_data,
2927
                            uint *dup_key_found)
2928
{
2929
  mark_trx_read_write();
2930
2931
  return bulk_update_row(old_data, new_data, dup_key_found);
2932
}
2933
2934
2935
/**
2936
  Delete all rows: public interface.
2937
2938
  @sa handler::delete_all_rows()
2939
*/
2940
2941
int
2942
handler::ha_delete_all_rows()
2943
{
2944
  mark_trx_read_write();
2945
2946
  return delete_all_rows();
2947
}
2948
2949
2950
/**
2951
  Reset auto increment: public interface.
2952
2953
  @sa handler::reset_auto_increment()
2954
*/
2955
2956
int
2957
handler::ha_reset_auto_increment(uint64_t value)
2958
{
2959
  mark_trx_read_write();
2960
2961
  return reset_auto_increment(value);
2962
}
2963
2964
2965
/**
2966
  Optimize table: public interface.
2967
2968
  @sa handler::optimize()
2969
*/
2970
2971
int
2972
handler::ha_optimize(THD* thd, HA_CHECK_OPT* check_opt)
2973
{
2974
  mark_trx_read_write();
2975
2976
  return optimize(thd, check_opt);
2977
}
2978
2979
2980
/**
2981
  Analyze table: public interface.
2982
2983
  @sa handler::analyze()
2984
*/
2985
2986
int
2987
handler::ha_analyze(THD* thd, HA_CHECK_OPT* check_opt)
2988
{
2989
  mark_trx_read_write();
2990
2991
  return analyze(thd, check_opt);
2992
}
2993
2994
2995
/**
2996
  Check and repair table: public interface.
2997
2998
  @sa handler::check_and_repair()
2999
*/
3000
3001
bool
3002
handler::ha_check_and_repair(THD *thd)
3003
{
3004
  mark_trx_read_write();
3005
3006
  return check_and_repair(thd);
3007
}
3008
3009
3010
/**
3011
  Disable indexes: public interface.
3012
3013
  @sa handler::disable_indexes()
3014
*/
3015
3016
int
3017
handler::ha_disable_indexes(uint mode)
3018
{
3019
  mark_trx_read_write();
3020
3021
  return disable_indexes(mode);
3022
}
3023
3024
3025
/**
3026
  Enable indexes: public interface.
3027
3028
  @sa handler::enable_indexes()
3029
*/
3030
3031
int
3032
handler::ha_enable_indexes(uint mode)
3033
{
3034
  mark_trx_read_write();
3035
3036
  return enable_indexes(mode);
3037
}
3038
3039
3040
/**
3041
  Discard or import tablespace: public interface.
3042
3043
  @sa handler::discard_or_import_tablespace()
3044
*/
3045
3046
int
3047
handler::ha_discard_or_import_tablespace(my_bool discard)
3048
{
3049
  mark_trx_read_write();
3050
3051
  return discard_or_import_tablespace(discard);
3052
}
3053
3054
3055
/**
3056
  Prepare for alter: public interface.
3057
3058
  Called to prepare an *online* ALTER.
3059
3060
  @sa handler::prepare_for_alter()
3061
*/
3062
3063
void
3064
handler::ha_prepare_for_alter()
3065
{
3066
  mark_trx_read_write();
3067
3068
  prepare_for_alter();
3069
}
3070
3071
3072
/**
3073
  Rename table: public interface.
3074
3075
  @sa handler::rename_table()
3076
*/
3077
3078
int
3079
handler::ha_rename_table(const char *from, const char *to)
3080
{
3081
  mark_trx_read_write();
3082
3083
  return rename_table(from, to);
3084
}
3085
3086
3087
/**
3088
  Delete table: public interface.
3089
3090
  @sa handler::delete_table()
3091
*/
3092
3093
int
3094
handler::ha_delete_table(const char *name)
3095
{
3096
  mark_trx_read_write();
3097
3098
  return delete_table(name);
3099
}
3100
3101
3102
/**
3103
  Drop table in the engine: public interface.
3104
3105
  @sa handler::drop_table()
3106
*/
3107
3108
void
3109
handler::ha_drop_table(const char *name)
3110
{
3111
  mark_trx_read_write();
3112
3113
  return drop_table(name);
3114
}
3115
3116
3117
/**
3118
  Create a table in the engine: public interface.
3119
3120
  @sa handler::create()
3121
*/
3122
3123
int
3124
handler::ha_create(const char *name, TABLE *form, HA_CREATE_INFO *info)
3125
{
3126
  mark_trx_read_write();
3127
3128
  return create(name, form, info);
3129
}
3130
3131
3132
/**
3133
  Create handler files for CREATE TABLE: public interface.
3134
3135
  @sa handler::create_handler_files()
3136
*/
3137
3138
int
3139
handler::ha_create_handler_files(const char *name, const char *old_name,
3140
                        int action_flag, HA_CREATE_INFO *info)
3141
{
3142
  mark_trx_read_write();
3143
3144
  return create_handler_files(name, old_name, action_flag, info);
3145
}
3146
3147
3148
/**
3149
  Tell the storage engine that it is allowed to "disable transaction" in the
3150
  handler. It is a hint that ACID is not required - it is used in NDB for
3151
  ALTER TABLE, for example, when data are copied to temporary table.
3152
  A storage engine may treat this hint any way it likes. NDB for example
3153
  starts to commit every now and then automatically.
3154
  This hint can be safely ignored.
3155
*/
3156
int ha_enable_transaction(THD *thd, bool on)
3157
{
3158
  int error=0;
3159
  DBUG_ENTER("ha_enable_transaction");
3160
  DBUG_PRINT("enter", ("on: %d", (int) on));
3161
3162
  if ((thd->transaction.on= on))
3163
  {
3164
    /*
3165
      Now all storage engines should have transaction handling enabled.
3166
      But some may have it enabled all the time - "disabling" transactions
3167
      is an optimization hint that storage engine is free to ignore.
3168
      So, let's commit an open transaction (if any) now.
3169
    */
3170
    if (!(error= ha_commit_trans(thd, 0)))
3171
      error= end_trans(thd, COMMIT);
3172
  }
3173
  DBUG_RETURN(error);
3174
}
3175
3176
/*
  Read the next index entry into 'buf' and verify that it still matches
  'key' (compared over 'keylen' bytes by key_cmp_if_same()).

  Returns the index_next() error if reading fails, HA_ERR_END_OF_FILE
  (with table->status set to STATUS_NOT_FOUND) when the key no longer
  matches, and 0 otherwise.
*/
int handler::index_next_same(uchar *buf, const uchar *key, uint keylen)
{
  int rc;
  DBUG_ENTER("index_next_same");

  rc= index_next(buf);
  if (rc)
    DBUG_RETURN(rc);

  const my_ptrdiff_t offset= buf - table->record[0];
  uchar *orig_record= NULL;
  KEY_PART_INFO *first_part= NULL;
  KEY_PART_INFO *last_part= NULL;

  /*
    key_cmp_if_same() compares table->record[0] against 'key'. Partly it
    reads table->record[0] directly, partly it goes through field objects
    that hold their own pointers into table->record[0]. When 'buf' is a
    different buffer, both table->record[0] and the field pointers of the
    fields used in this key have to be redirected temporarily.
  */
  if (offset)
  {
    orig_record= table->record[0];
    table->record[0]= buf;
    KEY *index= table->key_info + active_index;
    first_part= index->key_part;
    last_part= first_part + index->key_parts;
    for (KEY_PART_INFO *part= first_part; part < last_part; part++)
    {
      DBUG_ASSERT(part->field);
      part->field->move_field_offset(offset);
    }
  }

  if (key_cmp_if_same(table, key, active_index, keylen))
  {
    table->status= STATUS_NOT_FOUND;
    rc= HA_ERR_END_OF_FILE;
  }

  /* Undo the redirection, if one was made. */
  if (offset)
  {
    table->record[0]= orig_record;
    for (KEY_PART_INFO *part= first_part; part < last_part; part++)
      part->field->move_field_offset(-offset);
  }
  DBUG_RETURN(rc);
}
3226
3227
3228
/****************************************************************************
3229
** Some general functions that aren't in the handler class
3230
****************************************************************************/
3231
3232
/**
3233
  Initiates table-file and calls appropriate database-creator.
3234
3235
  @retval
3236
   0  ok
3237
  @retval
3238
   1  error
3239
*/
3240
int ha_create_table(THD *thd, const char *path,
3241
                    const char *db, const char *table_name,
3242
                    HA_CREATE_INFO *create_info,
3243
		    bool update_create_info)
3244
{
3245
  int error= 1;
3246
  TABLE table;
3247
  char name_buff[FN_REFLEN];
3248
  const char *name;
3249
  TABLE_SHARE share;
3250
  DBUG_ENTER("ha_create_table");
3251
  
3252
  init_tmp_table_share(thd, &share, db, 0, table_name, path);
3253
  if (open_table_def(thd, &share, 0) ||
3254
      open_table_from_share(thd, &share, "", 0, (uint) READ_ALL, 0, &table,
3255
                            OTM_CREATE))
3256
    goto err;
3257
3258
  if (update_create_info)
3259
    update_create_info_from_table(create_info, &table);
3260
3261
  name= check_lowercase_names(table.file, share.path.str, name_buff);
3262
3263
  error= table.file->ha_create(name, &table, create_info);
3264
  VOID(closefrm(&table, 0));
3265
  if (error)
3266
  {
3267
    strxmov(name_buff, db, ".", table_name, NullS);
3268
    my_error(ER_CANT_CREATE_TABLE, MYF(ME_BELL+ME_WAITTANG), name_buff, error);
3269
  }
3270
err:
3271
  free_table_share(&share);
3272
  DBUG_RETURN(error != 0);
3273
}
3274
3275
/**
3276
  Try to discover table from engine.
3277
3278
  @note
3279
    If found, write the frm file to disk.
3280
3281
  @retval
3282
  -1    Table did not exists
3283
  @retval
3284
   0    Table created ok
3285
  @retval
3286
   > 0  Error, table existed but could not be created
3287
*/
3288
int ha_create_table_from_engine(THD* thd, const char *db, const char *name)
3289
{
3290
  int error;
3291
  uchar *frmblob;
3292
  size_t frmlen;
3293
  char path[FN_REFLEN];
3294
  HA_CREATE_INFO create_info;
3295
  TABLE table;
3296
  TABLE_SHARE share;
3297
  DBUG_ENTER("ha_create_table_from_engine");
3298
  DBUG_PRINT("enter", ("name '%s'.'%s'", db, name));
3299
3300
  bzero((uchar*) &create_info,sizeof(create_info));
3301
  if ((error= ha_discover(thd, db, name, &frmblob, &frmlen)))
3302
  {
3303
    /* Table could not be discovered and thus not created */
3304
    DBUG_RETURN(error);
3305
  }
3306
3307
  /*
3308
    Table exists in handler and could be discovered
3309
    frmblob and frmlen are set, write the frm to disk
3310
  */
3311
3312
  build_table_filename(path, FN_REFLEN-1, db, name, "", 0);
3313
  // Save the frm file
3314
  error= writefrm(path, frmblob, frmlen);
3315
  my_free(frmblob, MYF(0));
3316
  if (error)
3317
    DBUG_RETURN(2);
3318
3319
  init_tmp_table_share(thd, &share, db, 0, name, path);
3320
  if (open_table_def(thd, &share, 0))
3321
  {
3322
    DBUG_RETURN(3);
3323
  }
3324
  if (open_table_from_share(thd, &share, "" ,0, 0, 0, &table, OTM_OPEN))
3325
  {
3326
    free_table_share(&share);
3327
    DBUG_RETURN(3);
3328
  }
3329
3330
  update_create_info_from_table(&create_info, &table);
3331
  create_info.table_options|= HA_OPTION_CREATE_FROM_ENGINE;
3332
3333
  check_lowercase_names(table.file, path, path);
3334
  error=table.file->ha_create(path, &table, &create_info);
3335
  VOID(closefrm(&table, 1));
3336
3337
  DBUG_RETURN(error != 0);
3338
}
3339
3340
void st_ha_check_opt::init()
3341
{
3342
  flags= sql_flags= 0;
3343
  sort_buffer_size = current_thd->variables.myisam_sort_buff_size;
3344
}
3345
3346
3347
/*****************************************************************************
3348
  Key cache handling.
3349
3350
  This code is only relevant for ISAM/MyISAM tables
3351
3352
  key_cache->cache may be 0 only in the case where a key cache is not
3353
  initialized or when we were not able to init the key cache in a previous
3354
  call to ha_init_key_cache() (probably out of memory)
3355
*****************************************************************************/
3356
3357
/**
3358
  Init a key cache if it has not been initied before.
3359
*/
3360
int ha_init_key_cache(const char *name, KEY_CACHE *key_cache)
3361
{
3362
  DBUG_ENTER("ha_init_key_cache");
3363
3364
  if (!key_cache->key_cache_inited)
3365
  {
3366
    pthread_mutex_lock(&LOCK_global_system_variables);
3367
    ulong tmp_buff_size= (ulong) key_cache->param_buff_size;
3368
    uint tmp_block_size= (uint) key_cache->param_block_size;
3369
    uint division_limit= key_cache->param_division_limit;
3370
    uint age_threshold=  key_cache->param_age_threshold;
3371
    pthread_mutex_unlock(&LOCK_global_system_variables);
3372
    DBUG_RETURN(!init_key_cache(key_cache,
3373
				tmp_block_size,
3374
				tmp_buff_size,
3375
				division_limit, age_threshold));
3376
  }
3377
  DBUG_RETURN(0);
3378
}
3379
3380
3381
/**
3382
  Resize key cache.
3383
*/
3384
int ha_resize_key_cache(KEY_CACHE *key_cache)
3385
{
3386
  DBUG_ENTER("ha_resize_key_cache");
3387
3388
  if (key_cache->key_cache_inited)
3389
  {
3390
    pthread_mutex_lock(&LOCK_global_system_variables);
3391
    long tmp_buff_size= (long) key_cache->param_buff_size;
3392
    long tmp_block_size= (long) key_cache->param_block_size;
3393
    uint division_limit= key_cache->param_division_limit;
3394
    uint age_threshold=  key_cache->param_age_threshold;
3395
    pthread_mutex_unlock(&LOCK_global_system_variables);
3396
    DBUG_RETURN(!resize_key_cache(key_cache, tmp_block_size,
3397
				  tmp_buff_size,
3398
				  division_limit, age_threshold));
3399
  }
3400
  DBUG_RETURN(0);
3401
}
3402
3403
3404
/**
3405
  Change parameters for key cache (like size)
3406
*/
3407
int ha_change_key_cache_param(KEY_CACHE *key_cache)
3408
{
3409
  if (key_cache->key_cache_inited)
3410
  {
3411
    pthread_mutex_lock(&LOCK_global_system_variables);
3412
    uint division_limit= key_cache->param_division_limit;
3413
    uint age_threshold=  key_cache->param_age_threshold;
3414
    pthread_mutex_unlock(&LOCK_global_system_variables);
3415
    change_key_cache_param(key_cache, division_limit, age_threshold);
3416
  }
3417
  return 0;
3418
}
3419
3420
/**
3421
  Free memory allocated by a key cache.
3422
*/
3423
int ha_end_key_cache(KEY_CACHE *key_cache)
3424
{
3425
  end_key_cache(key_cache, 1);		// Can never fail
3426
  return 0;
3427
}
3428
3429
/**
3430
  Move all tables from one key cache to another one.
3431
*/
3432
int ha_change_key_cache(KEY_CACHE *old_key_cache,
3433
			KEY_CACHE *new_key_cache)
3434
{
3435
  mi_change_key_cache(old_key_cache, new_key_cache);
3436
  return 0;
3437
}
3438
3439
3440
/**
3441
  Try to discover one table from handler(s).
3442
3443
  @retval
3444
    -1   Table did not exist
3445
  @retval
3446
    0   OK. In this case *frmblob and *frmlen are set
3447
  @retval
3448
    >0   error.  frmblob and frmlen may not be set
3449
*/
3450
struct st_discover_args
3451
{
3452
  const char *db;
3453
  const char *name;
3454
  uchar **frmblob; 
3455
  size_t *frmlen;
3456
};
3457
3458
/*
  plugin_foreach() callback: try to discover the table in one engine.

  'arg' points to a st_discover_args. Returns TRUE when the engine is
  enabled, provides a discover() hook and that hook returned 0; FALSE
  in every other case.
*/
static my_bool discover_handlerton(THD *thd, plugin_ref plugin,
                                   void *arg)
{
  st_discover_args *vargs= (st_discover_args *)arg;
  handlerton *hton= plugin_data(plugin, handlerton *);

  /* Engines that are disabled or cannot discover are skipped. */
  if (hton->state != SHOW_OPTION_YES || !hton->discover)
    return FALSE;

  if (hton->discover(hton, thd, vargs->db, vargs->name,
                     vargs->frmblob,
                     vargs->frmlen))
    return FALSE;

  return TRUE;
}
3471
3472
int ha_discover(THD *thd, const char *db, const char *name,
3473
		uchar **frmblob, size_t *frmlen)
3474
{
3475
  int error= -1; // Table does not exist in any handler
3476
  DBUG_ENTER("ha_discover");
3477
  DBUG_PRINT("enter", ("db: %s, name: %s", db, name));
3478
  st_discover_args args= {db, name, frmblob, frmlen};
3479
3480
  if (is_prefix(name,tmp_file_prefix)) /* skip temporary tables */
3481
    DBUG_RETURN(error);
3482
3483
  if (plugin_foreach(thd, discover_handlerton,
3484
                 MYSQL_STORAGE_ENGINE_PLUGIN, &args))
3485
    error= 0;
3486
3487
  if (!error)
3488
    status_var_increment(thd->status_var.ha_discover_count);
3489
  DBUG_RETURN(error);
3490
}
3491
3492
3493
/**
3494
  Call this function in order to give the handler the possibility
3495
  to ask engine if there are any new tables that should be written to disk
3496
  or any dropped tables that need to be removed from disk
3497
*/
3498
struct st_find_files_args
3499
{
3500
  const char *db;
3501
  const char *path;
3502
  const char *wild;
3503
  bool dir;
3504
  List<LEX_STRING> *files;
3505
};
3506
3507
/**
3508
  Ask handler if the table exists in engine.
3509
  @retval
3510
    HA_ERR_NO_SUCH_TABLE     Table does not exist
3511
  @retval
3512
    HA_ERR_TABLE_EXIST       Table exists
3513
  @retval
3514
    \#                  Error code
3515
*/
3516
/*
  Argument bundle handed to table_exists_in_engine_handlerton() through
  plugin_foreach().
*/
struct st_table_exists_in_engine_args
{
  const char *db;       /* database name passed to the engine */
  const char *name;     /* table name passed to the engine */
  int err;              /* out: result from the engine asked last */
};
3522
3523
/*
  plugin_foreach() callback: ask one engine whether the table exists.

  'arg' points to a st_table_exists_in_engine_args. Its err field is set
  to the engine's answer, or to HA_ERR_NO_SUCH_TABLE when the engine is
  disabled or has no table_exists_in_engine() hook. Returns TRUE only
  when err is HA_ERR_TABLE_EXIST.
*/
static my_bool table_exists_in_engine_handlerton(THD *thd, plugin_ref plugin,
                                   void *arg)
{
  st_table_exists_in_engine_args *vargs= (st_table_exists_in_engine_args *)arg;
  handlerton *hton= plugin_data(plugin, handlerton *);

  if (hton->state == SHOW_OPTION_YES && hton->table_exists_in_engine)
    vargs->err= hton->table_exists_in_engine(hton, thd, vargs->db, vargs->name);
  else
    vargs->err= HA_ERR_NO_SUCH_TABLE;

  return (vargs->err == HA_ERR_TABLE_EXIST) ? TRUE : FALSE;
}
3540
3541
/*
  Ask the storage engines whether a table exists.

  Runs table_exists_in_engine_handlerton() over the storage engine plugins
  and returns the err value that callback left behind
  (HA_ERR_NO_SUCH_TABLE if it was never changed).
*/
int ha_table_exists_in_engine(THD* thd, const char* db, const char* name)
{
  DBUG_ENTER("ha_table_exists_in_engine");
  DBUG_PRINT("enter", ("db: %s, name: %s", db, name));

  st_table_exists_in_engine_args args;
  args.db= db;
  args.name= name;
  args.err= HA_ERR_NO_SUCH_TABLE;

  plugin_foreach(thd, table_exists_in_engine_handlerton,
                 MYSQL_STORAGE_ENGINE_PLUGIN, &args);

  DBUG_PRINT("exit", ("error: %d", args.err));
  DBUG_RETURN(args.err);
}
3551
3552
/**
3553
  Calculate cost of 'index only' scan for given index and number of records
3554
3555
  @param keynr    Index number
3556
  @param records  Estimated number of records to be retrieved
3557
3558
  @note
3559
    It is assumed that we will read through the whole key range and that all
3560
    key blocks are half full (normally things are much better). It is also
3561
    assumed that each time we read the next key from the index, the handler
3562
    performs a random seek, thus the cost is proportional to the number of
3563
    blocks read.
3564
3565
  @todo
3566
    Consider joining this function and handler::read_time() into one
3567
    handler::read_time(keynr, records, ranges, bool index_only) function.
3568
3569
  @return
3570
    Estimated cost of 'index only' scan
3571
*/
3572
3573
double handler::index_only_read_time(uint keynr, double records)
{
  /* One index entry: the key itself plus the row reference. */
  const uint entry_length= table->key_info[keynr].key_length + ref_length;
  /* Half of a block is divided among the entries; never less than one. */
  const uint keys_per_block= stats.block_size / 2 / entry_length + 1;

  /* Number of blocks needed for 'records' keys, rounded up. */
  return (double) (records + keys_per_block - 1) / (double) keys_per_block;
}
3582
3583
3584
/****************************************************************************
3585
 * Default MRR implementation (MRR to non-MRR converter)
3586
 ***************************************************************************/
3587
3588
/**
3589
  Get cost and other information about MRR scan over a known list of ranges
3590
3591
  Calculate estimated cost and other information about an MRR scan for given
3592
  sequence of ranges.
3593
3594
  @param keyno           Index number
3595
  @param seq             Range sequence to be traversed
3596
  @param seq_init_param  First parameter for seq->init()
3597
  @param n_ranges_arg    Number of ranges in the sequence, or 0 if the caller
3598
                         can't efficiently determine it
3599
  @param bufsz    INOUT  IN:  Size of the buffer available for use
3600
                         OUT: Size of the buffer that is expected to be actually
3601
                              used, or 0 if buffer is not needed.
3602
  @param flags    INOUT  A combination of HA_MRR_* flags
3603
  @param cost     OUT    Estimated cost of MRR access
3604
3605
  @note
3606
    This method (or an overriding one in a derived class) must check for
3607
    thd->killed and return HA_POS_ERROR if it is not zero. This is required
3608
    for a user to be able to interrupt the calculation by killing the
3609
    connection/query.
3610
3611
  @retval
3612
    HA_POS_ERROR  Error or the engine is unable to perform the requested
3613
                  scan. Values of OUT parameters are undefined.
3614
  @retval
3615
    other         OK, *cost contains cost of the scan, *bufsz and *flags
3616
                  contain scan parameters.
3617
*/
3618
3619
ha_rows 
handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                     void *seq_init_param, uint n_ranges_arg,
                                     uint *bufsz, uint *flags, COST_VECT *cost)
{
  THD *thd= current_thd;
  KEY_MULTI_RANGE cur_range;
  ha_rows row_total= 0;
  uint range_count= 0;

  /* Default MRR implementation doesn't need buffer */
  *bufsz= 0;

  /*
    NOTE(review): the sequence is initialized with the local counter (still
    0 here), not with n_ranges_arg, which is otherwise unused - confirm
    that this is intended.
  */
  range_seq_t seq_it= seq->init(seq_init_param, range_count, *flags);
  for (;;)
  {
    if (seq->next(seq_it, &cur_range))
      break;                                    /* sequence exhausted */

    if (unlikely(thd->killed != 0))
      return HA_POS_ERROR;

    range_count++;

    ha_rows range_rows;
    if ((cur_range.range_flag & UNIQUE_RANGE) &&
        !(cur_range.range_flag & NULL_RANGE))
      range_rows= 1; /* there can be at most one row */
    else
    {
      /* An endpoint with zero length is passed on as NULL */
      key_range *min_endp= cur_range.start_key.length ? &cur_range.start_key
                                                      : NULL;
      key_range *max_endp= cur_range.end_key.length ? &cur_range.end_key
                                                    : NULL;
      range_rows= this->records_in_range(keyno, min_endp, max_endp);
      if (range_rows == HA_POS_ERROR)
      {
        /* Can't scan one range => can't do MRR scan at all */
        row_total= HA_POS_ERROR;
        break;
      }
    }
    row_total += range_rows;
  }

  if (row_total == HA_POS_ERROR)
    return row_total;

  /* The following calculation is the same as in multi_range_read_info(): */
  *flags |= HA_MRR_USE_DEFAULT_IMPL;
  cost->zero();
  cost->avg_io_cost= 1; /* assume random seeks */
  if ((*flags & HA_MRR_INDEX_ONLY) && row_total > 2)
  {
    /* NOTE(review): the (uint) cast truncates row counts that don't fit. */
    cost->io_count= index_only_read_time(keyno, (uint)row_total);
  }
  else
    cost->io_count= read_time(keyno, range_count, row_total);
  cost->cpu_cost= (double) row_total / TIME_FOR_COMPARE + 0.01;

  return row_total;
}
3674
3675
3676
/**
3677
  Get cost and other information about MRR scan over some sequence of ranges
3678
3679
  Calculate estimated cost and other information about an MRR scan for some
3680
  sequence of ranges.
3681
3682
  The ranges themselves will be known only at execution phase. When this
3683
  function is called we only know number of ranges and a (rough) E(#records)
3684
  within those ranges.
3685
3686
  Currently this function is only called for "n-keypart singlepoint" ranges,
3687
  i.e. each range is "keypart1=someconst1 AND ... AND keypartN=someconstN"
3688
3689
  The flags parameter is a combination of those flags: HA_MRR_SORTED,
3690
  HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION, HA_MRR_LIMITS.
3691
3692
  @param keyno           Index number
3693
  @param n_ranges        Estimated number of ranges (i.e. intervals) in the
3694
                         range sequence.
3695
  @param n_rows          Estimated total number of records contained within all
3696
                         of the ranges
3697
  @param bufsz    INOUT  IN:  Size of the buffer available for use
3698
                         OUT: Size of the buffer that will be actually used, or
3699
                              0 if buffer is not needed.
3700
  @param flags    INOUT  A combination of HA_MRR_* flags
3701
  @param cost     OUT    Estimated cost of MRR access
3702
3703
  @retval
3704
    0     OK, *cost contains cost of the scan, *bufsz and *flags contain scan
3705
          parameters.
3706
  @retval
3707
    other Error or can't perform the requested scan
3708
*/
3709
3710
int handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
3711
                                   uint *bufsz, uint *flags, COST_VECT *cost)
3712
{
3713
  *bufsz= 0; /* Default implementation doesn't need a buffer */
3714
3715
  *flags |= HA_MRR_USE_DEFAULT_IMPL;
3716
3717
  cost->zero();
3718
  cost->avg_io_cost= 1; /* assume random seeks */
3719
3720
  /* Produce the same cost as non-MRR code does */
3721
  if (*flags & HA_MRR_INDEX_ONLY)
3722
    cost->io_count= index_only_read_time(keyno, n_rows);
3723
  else
3724
    cost->io_count= read_time(keyno, n_ranges, n_rows);
3725
  return 0;
3726
}
3727
3728
3729
/**
3730
  Initialize the MRR scan
3731
3732
  Initialize the MRR scan. This function may do heavyweight scan 
3733
  initialization like row prefetching/sorting/etc (NOTE: but better not do
3734
  it here as we may not need it, e.g. if we never satisfy WHERE clause on
3735
  previous tables. For many implementations it would be natural to do such
3736
  initializations in the first multi_read_range_next() call)
3737
3738
  mode is a combination of the following flags: HA_MRR_SORTED,
3739
  HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION 
3740
3741
  @param seq             Range sequence to be traversed
3742
  @param seq_init_param  First parameter for seq->init()
3743
  @param n_ranges        Number of ranges in the sequence
3744
  @param mode            Flags, see the description section for the details
3745
  @param buf             INOUT: memory buffer to be used
3746
3747
  @note
3748
    One must have called index_init() before calling this function. Several
3749
    multi_range_read_init() calls may be made in course of one query.
3750
3751
    Until WL#2623 is done (see its text, section 3.2), the following will 
3752
    also hold:
3753
    The caller will guarantee that if "seq->init == mrr_ranges_array_init"
3754
    then seq_init_param is an array of n_ranges KEY_MULTI_RANGE structures.
3755
    This property will only be used by NDB handler until WL#2623 is done.
3756
     
3757
    Buffer memory management is done according to the following scenario:
3758
    The caller allocates the buffer and provides it to the callee by filling
3759
    the members of HANDLER_BUFFER structure.
3760
    The callee consumes all or some fraction of the provided buffer space, and
3761
    sets the HANDLER_BUFFER members accordingly.
3762
    The callee may use the buffer memory until the next multi_range_read_init()
3763
    call is made, all records have been read, or until index_end() call is
3764
    made, whichever comes first.
3765
3766
  @retval 0  OK
3767
  @retval 1  Error
3768
*/
3769
3770
int
3771
handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3772
                               uint n_ranges, uint mode, HANDLER_BUFFER *buf)
3773
{
3774
  DBUG_ENTER("handler::multi_range_read_init");
3775
  mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
3776
  mrr_funcs= *seq_funcs;
3777
  mrr_is_output_sorted= test(mode & HA_MRR_SORTED);
3778
  mrr_have_range= FALSE;
3779
  DBUG_RETURN(0);
3780
}
3781
3782
3783
/**
3784
  Get next record in MRR scan
3785
3786
  Default MRR implementation: read the next record
3787
3788
  @param range_info  OUT  Undefined if HA_MRR_NO_ASSOCIATION flag is in effect
3789
                          Otherwise, the opaque value associated with the range
3790
                          that contains the returned record.
3791
3792
  @retval 0      OK
3793
  @retval other  Error code
3794
*/
3795
3796
int handler::multi_range_read_next(char **range_info)
3797
{
3798
  int result= 0;
3799
  int range_res;
3800
  DBUG_ENTER("handler::multi_range_read_next");
3801
3802
  if (!mrr_have_range)
3803
  {
3804
    mrr_have_range= TRUE;
3805
    goto start;
3806
  }
3807
3808
  do
3809
  {
3810
    /* Save a call if there can be only one row in range. */
3811
    if (mrr_cur_range.range_flag != (UNIQUE_RANGE | EQ_RANGE))
3812
    {
3813
      result= read_range_next();
3814
      /* On success or non-EOF errors jump to the end. */
3815
      if (result != HA_ERR_END_OF_FILE)
3816
        break;
3817
    }
3818
    else
3819
    {
3820
      if (was_semi_consistent_read())
3821
        goto scan_it_again;
3822
      /*
3823
        We need to set this for the last range only, but checking this
3824
        condition is more expensive than just setting the result code.
3825
      */
3826
      result= HA_ERR_END_OF_FILE;
3827
    }
3828
3829
start:
3830
    /* Try the next range(s) until one matches a record. */
3831
    while (!(range_res= mrr_funcs.next(mrr_iter, &mrr_cur_range)))
3832
    {
3833
scan_it_again:
3834
      result= read_range_first(mrr_cur_range.start_key.keypart_map ?
3835
                                 &mrr_cur_range.start_key : 0,
3836
                               mrr_cur_range.end_key.keypart_map ?
3837
                                 &mrr_cur_range.end_key : 0,
3838
                               test(mrr_cur_range.range_flag & EQ_RANGE),
3839
                               mrr_is_output_sorted);
3840
      if (result != HA_ERR_END_OF_FILE)
3841
        break;
3842
    }
3843
  }
3844
  while ((result == HA_ERR_END_OF_FILE) && !range_res);
3845
3846
  *range_info= mrr_cur_range.ptr;
3847
  DBUG_PRINT("exit",("handler::multi_range_read_next result %d", result));
3848
  DBUG_RETURN(result);
3849
}
3850
3851
3852
/* **************************************************************************
3853
 * DS-MRR implementation 
3854
 ***************************************************************************/
3855
3856
/**
3857
  DS-MRR: Initialize and start MRR scan
3858
3859
  Initialize and start the MRR scan. Depending on the mode parameter, this
3860
  may use default or DS-MRR implementation.
3861
3862
  @param h               Table handler to be used
3863
  @param key             Index to be used
3864
  @param seq_funcs       Interval sequence enumeration functions
3865
  @param seq_init_param  Interval sequence enumeration parameter
3866
  @param n_ranges        Number of ranges in the sequence.
3867
  @param mode            HA_MRR_* modes to use
3868
  @param buf             INOUT Buffer to use
3869
3870
  @retval 0     Ok, Scan started.
3871
  @retval other Error
3872
*/
3873
3874
int DsMrr_impl::dsmrr_init(handler *h, KEY *key,
3875
                           RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
3876
                           uint n_ranges, uint mode, HANDLER_BUFFER *buf)
3877
{
3878
  uint elem_size;
3879
  uint keyno;
3880
  Item *pushed_cond= NULL;
3881
  handler *new_h2;
3882
  DBUG_ENTER("DsMrr_impl::dsmrr_init");
3883
  keyno= h->active_index;
3884
  DBUG_ASSERT(h2 == NULL);
3885
  if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
3886
  {
3887
    use_default_impl= TRUE;
3888
    DBUG_RETURN(h->handler::multi_range_read_init(seq_funcs, seq_init_param,
3889
                                                  n_ranges, mode, buf));
3890
  }
3891
  rowids_buf= buf->buffer;
3892
  //psergey-todo: don't add key_length as it is not needed anymore
3893
  rowids_buf += key->key_length + h->ref_length;
3894
3895
  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
3896
  rowids_buf_end= buf->buffer_end;
3897
  
3898
  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
3899
  rowids_buf_last= rowids_buf + 
3900
                      ((rowids_buf_end - rowids_buf)/ elem_size)*
3901
                      elem_size;
3902
  rowids_buf_end= rowids_buf_last;
3903
3904
  /* Create a separate handler object to do rndpos() calls. */
3905
  THD *thd= current_thd;
3906
  if (!(new_h2= h->clone(thd->mem_root)) || 
3907
      new_h2->ha_external_lock(thd, F_RDLCK))
3908
  {
3909
    delete new_h2;
3910
    DBUG_RETURN(1);
3911
  }
3912
3913
  if (keyno == h->pushed_idx_cond_keyno)
3914
    pushed_cond= h->pushed_idx_cond;
3915
  if (h->ha_index_end())
3916
  {
3917
    new_h2= h2;
3918
    goto error;
3919
  }
3920
3921
  h2= new_h2;
3922
  table->prepare_for_position();
3923
  new_h2->extra(HA_EXTRA_KEYREAD);
3924
3925
  if (h2->ha_index_init(keyno, FALSE) || 
3926
      h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
3927
                                         mode, buf))
3928
    goto error;
3929
  use_default_impl= FALSE;
3930
  
3931
  if (pushed_cond)
3932
    h2->idx_cond_push(keyno, pushed_cond);
3933
  if (dsmrr_fill_buffer(new_h2))
3934
    goto error;
3935
3936
  /*
3937
    If the above call has scanned through all intervals in *seq, then
3938
    adjust *buf to indicate that the remaining buffer space will not be used.
3939
  */
3940
  if (dsmrr_eof) 
3941
    buf->end_of_used_area= rowids_buf_last;
3942
3943
  if (h->ha_rnd_init(FALSE))
3944
    goto error;
3945
  
3946
  DBUG_RETURN(0);
3947
error:
3948
  h2->ha_index_or_rnd_end();
3949
  h2->ha_external_lock(thd, F_UNLCK);
3950
  h2->close();
3951
  delete h2;
3952
  DBUG_RETURN(1);
3953
}
3954
3955
3956
void DsMrr_impl::dsmrr_close()
3957
{
3958
  DBUG_ENTER("DsMrr_impl::dsmrr_close");
3959
  if (h2)
3960
  {
3961
    h2->ha_external_lock(current_thd, F_UNLCK);
3962
    h2->close();
3963
    delete h2;
3964
    h2= NULL;
3965
  }
3966
  use_default_impl= TRUE;
3967
  DBUG_VOID_RETURN;
3968
}
3969
3970
3971
static int rowid_cmp(void *h, uchar *a, uchar *b)
3972
{
3973
  return ((handler*)h)->cmp_ref(a, b);
3974
}
3975
3976
3977
/**
3978
  DS-MRR: Fill the buffer with rowids and sort it by rowid
3979
3980
  {This is an internal function of DiskSweep MRR implementation}
3981
  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
3982
  buffer. When the buffer is full or scan is completed, sort the buffer by 
3983
  rowid and return.
3984
  
3985
  The function assumes that rowids buffer is empty when it is invoked. 
3986
  
3987
  @param h  Table handler
3988
3989
  @retval 0      OK, the next portion of rowids is in the buffer,
3990
                 properly ordered
3991
  @retval other  Error
3992
*/
3993
3994
int DsMrr_impl::dsmrr_fill_buffer(handler *unused)
3995
{
3996
  char *range_info;
3997
  int res;
3998
  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");
3999
4000
  rowids_buf_cur= rowids_buf;
4001
  while ((rowids_buf_cur < rowids_buf_end) && 
4002
         !(res= h2->handler::multi_range_read_next(&range_info)))
4003
  {
4004
    /* Put rowid, or {rowid, range_id} pair into the buffer */
4005
    h2->position(table->record[0]);
4006
    memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
4007
    rowids_buf_cur += h->ref_length;
4008
4009
    if (is_mrr_assoc)
4010
    {
4011
      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
4012
      rowids_buf_cur += sizeof(void*);
4013
    }
4014
  }
4015
4016
  if (res && res != HA_ERR_END_OF_FILE)
4017
    DBUG_RETURN(res); 
4018
  dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
4019
4020
  /* Sort the buffer contents by rowid */
4021
  uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
4022
  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
4023
  
4024
  my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
4025
            (void*)h);
4026
  rowids_buf_last= rowids_buf_cur;
4027
  rowids_buf_cur=  rowids_buf;
4028
  DBUG_RETURN(0);
4029
}
4030
4031
4032
/**
4033
  DS-MRR implementation: multi_range_read_next() function
4034
*/
4035
4036
int DsMrr_impl::dsmrr_next(handler *h, char **range_info)
4037
{
4038
  int res;
4039
  
4040
  if (use_default_impl)
4041
    return h->handler::multi_range_read_next(range_info);
4042
    
4043
  if (rowids_buf_cur == rowids_buf_last)
4044
  {
4045
    if (dsmrr_eof)
4046
    {
4047
      res= HA_ERR_END_OF_FILE;
4048
      goto end;
4049
    }
4050
    res= dsmrr_fill_buffer(h);
4051
    if (res)
4052
      goto end;
4053
  }
4054
  
4055
  /* Return EOF if there are no rowids in the buffer after re-fill attempt */
4056
  if (rowids_buf_cur == rowids_buf_last)
4057
  {
4058
    res= HA_ERR_END_OF_FILE;
4059
    goto end;
4060
  }
4061
4062
  res= h->rnd_pos(table->record[0], rowids_buf_cur);
4063
  rowids_buf_cur += h->ref_length;
4064
  if (is_mrr_assoc)
4065
  {
4066
    memcpy(range_info, rowids_buf_cur, sizeof(void*));
4067
    rowids_buf_cur += sizeof(void*);
4068
  }
4069
4070
end:
4071
  if (res)
4072
    dsmrr_close();
4073
  return res;
4074
}
4075
4076
4077
/**
4078
  DS-MRR implementation: multi_range_read_info() function
4079
*/
4080
int DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, uint *bufsz,
4081
                           uint *flags, COST_VECT *cost)
4082
{  
4083
  int res;
4084
  uint def_flags= *flags;
4085
  uint def_bufsz= *bufsz;
4086
4087
  /* Get cost/flags/mem_usage of default MRR implementation */
4088
  res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
4089
                                         &def_flags, cost);
4090
  DBUG_ASSERT(!res);
4091
4092
  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || 
4093
      choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
4094
  {
4095
    /* Default implementation is choosen */
4096
    DBUG_PRINT("info", ("Default MRR implementation choosen"));
4097
    *flags= def_flags;
4098
    *bufsz= def_bufsz;
4099
  }
4100
  else
4101
  {
4102
    DBUG_PRINT("info", ("DS-MRR implementation choosen"));
4103
  }
4104
  return 0;
4105
}
4106
4107
4108
/**
4109
  DS-MRR Implementation: multi_range_read_info_const() function
4110
*/
4111
4112
ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
4113
                                 void *seq_init_param, uint n_ranges, 
4114
                                 uint *bufsz, uint *flags, COST_VECT *cost)
4115
{
4116
  ha_rows rows;
4117
  uint def_flags= *flags;
4118
  uint def_bufsz= *bufsz;
4119
  /* Get cost/flags/mem_usage of default MRR implementation */
4120
  rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
4121
                                                n_ranges, &def_bufsz, 
4122
                                                &def_flags, cost);
4123
  if (rows == HA_POS_ERROR)
4124
  {
4125
    /* Default implementation can't perform MRR scan => we can't either */
4126
    return rows;
4127
  }
4128
4129
  /*
4130
    If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
4131
    use the default MRR implementation (we need it for UPDATE/DELETE).
4132
    Otherwise, make a choice based on cost and @@optimizer_use_mrr.
4133
  */
4134
  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
4135
      choose_mrr_impl(keyno, rows, flags, bufsz, cost))
4136
  {
4137
    DBUG_PRINT("info", ("Default MRR implementation choosen"));
4138
    *flags= def_flags;
4139
    *bufsz= def_bufsz;
4140
  }
4141
  else
4142
  {
4143
    *flags &= ~HA_MRR_USE_DEFAULT_IMPL;
4144
    DBUG_PRINT("info", ("DS-MRR implementation choosen"));
4145
  }
4146
  return rows;
4147
}
4148
4149
4150
/**
4151
  Check if key has partially-covered columns
4152
4153
  We can't use DS-MRR to perform range scans when the ranges are over
4154
  partially-covered keys, because we'll not have full key part values
4155
  (we'll have their prefixes from the index) and will not be able to check
4156
  if we've reached the end the range.
4157
4158
  @param keyno  Key to check
4159
4160
  @todo
4161
    Allow use of DS-MRR in cases where the index has partially-covered
4162
    components but they are not used for scanning.
4163
4164
  @retval TRUE   Yes
4165
  @retval FALSE  No
4166
*/
4167
4168
bool DsMrr_impl::key_uses_partial_cols(uint keyno)
4169
{
4170
  KEY_PART_INFO *kp= table->key_info[keyno].key_part;
4171
  KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
4172
  for (; kp != kp_end; kp++)
4173
  {
4174
    if (!kp->field->part_of_key.is_set(keyno))
4175
      return TRUE;
4176
  }
4177
  return FALSE;
4178
}
4179
4180
4181
/**
4182
  DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
4183
4184
  Make the choice between using Default MRR implementation and DS-MRR.
4185
  This function contains common functionality factored out of dsmrr_info()
4186
  and dsmrr_info_const(). The function assumes that the default MRR
4187
  implementation's applicability requirements are satisfied.
4188
4189
  @param keyno       Index number
4190
  @param rows        E(full rows to be retrieved)
4191
  @param flags  IN   MRR flags provided by the MRR user
4192
                OUT  If DS-MRR is choosen, flags of DS-MRR implementation
4193
                     else the value is not modified
4194
  @param bufsz  IN   If DS-MRR is choosen, buffer use of DS-MRR implementation
4195
                     else the value is not modified
4196
  @param cost   IN   Cost of default MRR implementation
4197
                OUT  If DS-MRR is choosen, cost of DS-MRR scan
4198
                     else the value is not modified
4199
4200
  @retval TRUE   Default MRR implementation should be used
4201
  @retval FALSE  DS-MRR implementation should be used
4202
*/
4203
4204
bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
4205
                                 uint *bufsz, COST_VECT *cost)
4206
{
4207
  COST_VECT dsmrr_cost;
4208
  bool res;
4209
  THD *thd= current_thd;
4210
  if ((thd->variables.optimizer_use_mrr == 2) || 
4211
      (*flags & HA_MRR_INDEX_ONLY) || (*flags & HA_MRR_SORTED) ||
4212
      (keyno == table->s->primary_key && 
4213
       h->primary_key_is_clustered()) || 
4214
       key_uses_partial_cols(keyno))
4215
  {
4216
    /* Use the default implementation */
4217
    *flags |= HA_MRR_USE_DEFAULT_IMPL;
4218
    return TRUE;
4219
  }
4220
  
4221
  uint add_len= table->key_info[keyno].key_length + h->ref_length; 
4222
  *bufsz -= add_len;
4223
  if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
4224
    return TRUE;
4225
  *bufsz += add_len;
4226
  
4227
  bool force_dsmrr;
4228
  /* 
4229
    If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
4230
    DS-MRR and Default implementations cost. This allows one to force use of
4231
    DS-MRR whenever it is applicable without affecting other cost-based
4232
    choices.
4233
  */
4234
  if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) &&
4235
      dsmrr_cost.total_cost() > cost->total_cost())
4236
    dsmrr_cost= *cost;
4237
4238
  if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
4239
  {
4240
    *flags &= ~HA_MRR_USE_DEFAULT_IMPL;  /* Use the DS-MRR implementation */
4241
    *flags &= ~HA_MRR_SORTED;          /* We will return unordered output */
4242
    *cost= dsmrr_cost;
4243
    res= FALSE;
4244
  }
4245
  else
4246
  {
4247
    /* Use the default MRR implementation */
4248
    res= TRUE;
4249
  }
4250
  return res;
4251
}
4252
4253
4254
static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost);
4255
4256
4257
/**
4258
  Get cost of DS-MRR scan
4259
4260
  @param keynr              Index to be used
4261
  @param rows               E(Number of rows to be scanned)
4262
  @param flags              Scan parameters (HA_MRR_* flags)
4263
  @param buffer_size INOUT  Buffer size
4264
  @param cost        OUT    The cost
4265
4266
  @retval FALSE  OK
4267
  @retval TRUE   Error, DS-MRR cannot be used (the buffer is too small
4268
                 for even 1 rowid)
4269
*/
4270
4271
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
4272
                                         uint *buffer_size, COST_VECT *cost)
4273
{
4274
  ulong max_buff_entries, elem_size;
4275
  ha_rows rows_in_full_step, rows_in_last_step;
4276
  uint n_full_steps;
4277
  double index_read_cost;
4278
4279
  elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
4280
  max_buff_entries = *buffer_size / elem_size;
4281
4282
  if (!max_buff_entries)
4283
    return TRUE; /* Buffer has not enough space for even 1 rowid */
4284
4285
  /* Number of iterations we'll make with full buffer */
4286
  n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
4287
  
4288
  /* 
4289
    Get numbers of rows we'll be processing in 
4290
     - non-last sweep, with full buffer 
4291
     - last iteration, with non-full buffer
4292
  */
4293
  rows_in_full_step= max_buff_entries;
4294
  rows_in_last_step= rows % max_buff_entries;
4295
  
4296
  /* Adjust buffer size if we expect to use only part of the buffer */
4297
  if (n_full_steps)
4298
  {
4299
    get_sort_and_sweep_cost(table, rows, cost);
4300
    cost->multiply(n_full_steps);
4301
  }
4302
  else
4303
  {
4304
    cost->zero();
4305
    *buffer_size= max(*buffer_size, 
4306
                      (size_t)(1.2*rows_in_last_step) * elem_size + 
4307
                      h->ref_length + table->key_info[keynr].key_length);
4308
  }
4309
  
4310
  COST_VECT last_step_cost;
4311
  get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
4312
  cost->add(&last_step_cost);
4313
 
4314
  if (n_full_steps != 0)
4315
    cost->mem_cost= *buffer_size;
4316
  else
4317
    cost->mem_cost= (double)rows_in_last_step * elem_size;
4318
  
4319
  /* Total cost of all index accesses */
4320
  index_read_cost= h->index_only_read_time(keynr, (double)rows);
4321
  cost->add_io(index_read_cost, 1 /* Random seeks */);
4322
  return FALSE;
4323
}
4324
4325
4326
/* 
4327
  Get cost of one sort-and-sweep step
4328
4329
  SYNOPSIS
4330
    get_sort_and_sweep_cost()
4331
      table       Table being accessed
4332
      nrows       Number of rows to be sorted and retrieved
4333
      cost   OUT  The cost
4334
4335
  DESCRIPTION
4336
    Get cost of these operations:
4337
     - sort an array of #nrows ROWIDs using qsort
4338
     - read #nrows records from table in a sweep.
4339
*/
4340
4341
static 
4342
void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
4343
{
4344
  if (nrows)
4345
  {
4346
    get_sweep_read_cost(table, nrows, FALSE, cost);
4347
    /* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
4348
    double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
4349
    if (cmp_op < 3)
4350
      cmp_op= 3;
4351
    cost->cpu_cost += cmp_op * log2(cmp_op);
4352
  }
4353
  else
4354
    cost->zero();
4355
}
4356
4357
4358
/**
4359
  Get cost of reading nrows table records in a "disk sweep"
4360
4361
  A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that made
4362
  for an ordered sequence of rowids.
4363
4364
  We assume hard disk IO. The read is performed as follows:
4365
4366
   1. The disk head is moved to the needed cylinder
4367
   2. The controller waits for the plate to rotate
4368
   3. The data is transferred
4369
4370
  Time to do #3 is insignificant compared to #2+#1.
4371
4372
  Time to move the disk head is proportional to head travel distance.
4373
4374
  Time to wait for the plate to rotate depends on whether the disk head
4375
  was moved or not. 
4376
4377
  If disk head wasn't moved, the wait time is proportional to distance
4378
  between the previous block and the block we're reading.
4379
4380
  If the head was moved, we don't know how much we'll need to wait for the
4381
  plate to rotate. We assume the wait time to be a variate with a mean of
4382
  0.5 of full rotation time.
4383
4384
  Our cost units are "random disk seeks". The cost of random disk seek is
4385
  actually not a constant, it depends one range of cylinders we're going
4386
  to access. We make it constant by introducing a fuzzy concept of "typical 
4387
  datafile length" (it's fuzzy as it's hard to tell whether it should
4388
  include index file, temp.tables etc). Then random seek cost is:
4389
4390
    1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
4391
4392
  We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
4393
4394
  @param table             Table to be accessed
4395
  @param nrows             Number of rows to retrieve
4396
  @param interrupted       TRUE <=> Assume that the disk sweep will be
4397
                           interrupted by other disk IO. FALSE - otherwise.
4398
  @param cost         OUT  The cost.
4399
*/
4400
4401
void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, 
4402
                         COST_VECT *cost)
4403
{
4404
  DBUG_ENTER("get_sweep_read_cost");
4405
4406
  cost->zero();
4407
  if (table->file->primary_key_is_clustered())
4408
  {
4409
    cost->io_count= table->file->read_time(table->s->primary_key,
4410
                                           (uint) nrows, nrows);
4411
  }
4412
  else
4413
  {
4414
    double n_blocks=
4415
      ceil(ulonglong2double(table->file->stats.data_file_length) / IO_SIZE);
4416
    double busy_blocks=
4417
      n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
4418
    if (busy_blocks < 1.0)
4419
      busy_blocks= 1.0;
4420
4421
    DBUG_PRINT("info",("sweep: nblocks=%g, busy_blocks=%g", n_blocks,
4422
                       busy_blocks));
4423
    cost->io_count= busy_blocks;
4424
4425
    if (!interrupted)
4426
    {
4427
      /* Assume reading is done in one 'sweep' */
4428
      cost->avg_io_cost= (DISK_SEEK_BASE_COST +
4429
                          DISK_SEEK_PROP_COST*n_blocks/busy_blocks);
4430
    }
4431
  }
4432
  DBUG_PRINT("info",("returning cost=%g", cost->total_cost()));
4433
  DBUG_VOID_RETURN;
4434
}
4435
4436
4437
/* **************************************************************************
4438
 * DS-MRR implementation ends
4439
 ***************************************************************************/
4440
4441
/**
4442
  Read first row between two ranges.
4443
4444
  @param start_key		Start key. Is 0 if no min range
4445
  @param end_key		End key.  Is 0 if no max range
4446
  @param eq_range_arg	        Set to 1 if start_key == end_key
4447
  @param sorted		Set to 1 if result should be sorted per key
4448
4449
  @note
4450
    Record is read into table->record[0]
4451
4452
  @retval
4453
    0			Found row
4454
  @retval
4455
    HA_ERR_END_OF_FILE	No rows in range
4456
  @retval
4457
    \#			Error code
4458
*/
4459
int handler::read_range_first(const key_range *start_key,
4460
			      const key_range *end_key,
4461
			      bool eq_range_arg,
4462
                              bool sorted /* ignored */)
4463
{
4464
  int result;
4465
  DBUG_ENTER("handler::read_range_first");
4466
4467
  eq_range= eq_range_arg;
4468
  end_range= 0;
4469
  if (end_key)
4470
  {
4471
    end_range= &save_end_range;
4472
    save_end_range= *end_key;
4473
    key_compare_result_on_equal= ((end_key->flag == HA_READ_BEFORE_KEY) ? 1 :
4474
				  (end_key->flag == HA_READ_AFTER_KEY) ? -1 : 0);
4475
  }
4476
  range_key_part= table->key_info[active_index].key_part;
4477
4478
  if (!start_key)			// Read first record
4479
    result= index_first(table->record[0]);
4480
  else
4481
    result= index_read_map(table->record[0],
4482
                           start_key->key,
4483
                           start_key->keypart_map,
4484
                           start_key->flag);
4485
  if (result)
4486
    DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) 
4487
		? HA_ERR_END_OF_FILE
4488
		: result);
4489
4490
  DBUG_RETURN (compare_key(end_range) <= 0 ? 0 : HA_ERR_END_OF_FILE);
4491
}
4492
4493
4494
/**
4495
  Read next row between two endpoints.
4496
4497
  @note
4498
    Record is read into table->record[0]
4499
4500
  @retval
4501
    0			Found row
4502
  @retval
4503
    HA_ERR_END_OF_FILE	No rows in range
4504
  @retval
4505
    \#			Error code
4506
*/
4507
int handler::read_range_next()
4508
{
4509
  int result;
4510
  DBUG_ENTER("handler::read_range_next");
4511
4512
  if (eq_range)
4513
  {
4514
    /* We trust that index_next_same always gives a row in range */
4515
    DBUG_RETURN(index_next_same(table->record[0],
4516
                                end_range->key,
4517
                                end_range->length));
4518
  }
4519
  result= index_next(table->record[0]);
4520
  if (result)
4521
    DBUG_RETURN(result);
4522
  DBUG_RETURN(compare_key(end_range) <= 0 ? 0 : HA_ERR_END_OF_FILE);
4523
}
4524
4525
4526
/**
4527
  Compare if found key (in row) is over max-value.
4528
4529
  @param range		range to compare to row. May be 0 for no range
4530
4531
  @seealso
4532
    key.cc::key_cmp()
4533
4534
  @return
4535
    The return value is SIGN(key_in_row - range_key):
4536
4537
    - 0   : Key is equal to range or 'range' == 0 (no range)
4538
    - -1  : Key is less than range
4539
    - 1   : Key is larger than range
4540
*/
4541
int handler::compare_key(key_range *range)
4542
{
4543
  int cmp;
4544
  if (!range || in_range_check_pushed_down)
4545
    return 0;					// No max range
4546
  cmp= key_cmp(range_key_part, range->key, range->length);
4547
  if (!cmp)
4548
    cmp= key_compare_result_on_equal;
4549
  return cmp;
4550
}
4551
4552
4553
/*
4554
  Same as compare_key() but doesn't check have in_range_check_pushed_down.
4555
  This is used by index condition pushdown implementation.
4556
*/
4557
4558
int handler::compare_key2(key_range *range)
4559
{
4560
  int cmp;
4561
  if (!range)
4562
    return 0;					// no max range
4563
  cmp= key_cmp(range_key_part, range->key, range->length);
4564
  if (!cmp)
4565
    cmp= key_compare_result_on_equal;
4566
  return cmp;
4567
}
4568
4569
int handler::index_read_idx_map(uchar * buf, uint index, const uchar * key,
4570
                                key_part_map keypart_map,
4571
                                enum ha_rkey_function find_flag)
4572
{
4573
  int error, error1;
4574
  error= index_init(index, 0);
4575
  if (!error)
4576
  {
4577
    error= index_read_map(buf, key, keypart_map, find_flag);
4578
    error1= index_end();
4579
  }
4580
  return error ?  error : error1;
4581
}
4582
4583
4584
/**
4585
  Returns a list of all known extensions.
4586
4587
    No mutexes, worst case race is a minor surplus memory allocation
4588
    We have to recreate the extension map if mysqld is restarted (for example
4589
    within libmysqld)
4590
4591
  @retval
4592
    pointer		pointer to TYPELIB structure
4593
*/
4594
/*
  plugin_foreach() callback: add the file extensions of one storage engine
  to the list passed in 'arg' (a List<char>), skipping extensions that are
  already in the list. Engines that are not enabled or cannot create a
  handler are ignored. Always returns FALSE.
*/
static my_bool exts_handlerton(THD *unused, plugin_ref plugin,
                               void *arg)
{
  List<char> *found_exts= (List<char> *) arg;
  handlerton *hton= plugin_data(plugin, handlerton *);

  if (hton->state != SHOW_OPTION_YES || !hton->create)
    return FALSE;

  /* A temporary handler is needed only to ask it for bas_ext() */
  handler *file= hton->create(hton, (TABLE_SHARE*) 0, current_thd->mem_root);
  if (!file)
    return FALSE;

  List_iterator_fast<char> it(*found_exts);
  for (const char **ext= file->bas_ext(); *ext; ext++)
  {
    const char *known;

    /* Scan the list from its head for an equal extension */
    it.rewind();
    while ((known= it++) && strcmp(known, *ext))
    {}

    if (!known)
      found_exts->push_back((char *) *ext);
  }
  delete file;
  return FALSE;
}
4622
4623
/*
  Return the TYPELIB holding the file extensions of all storage engines.

  The list is built on first use and rebuilt when mysys_usage_id differs
  from the id recorded at the last build. The name array is allocated with
  my_once_alloc() and is terminated by a 0 entry.
*/
TYPELIB *ha_known_exts(void)
{
  /* Reuse the cached list while it is still valid */
  if (known_extensions.type_names && mysys_usage_id == known_extensions_id)
    return &known_extensions;

  List<char> found_exts;

  known_extensions_id= mysys_usage_id;

  plugin_foreach(NULL, exts_handlerton,
                 MYSQL_STORAGE_ENGINE_PLUGIN, &found_exts);

  /* One slot per extension plus one for the terminating 0 */
  const char **names= (const char **) my_once_alloc(sizeof(char *)*
                                                    (found_exts.elements+1),
                                                    MYF(MY_WME | MY_FAE));

  DBUG_ASSERT(names != 0);
  known_extensions.count= found_exts.elements;
  known_extensions.type_names= names;

  List_iterator_fast<char> it(found_exts);
  const char **slot= names;
  const char *ext_name;
  while ((ext_name= it++))
    *slot++= ext_name;
  *slot= 0;

  return &known_extensions;
}
4650
4651
4652
/**
  Send one (type, file, status) row to the client through thd->protocol.

  Each of the three strings is stored with its explicit length using
  system_charset_info.

  @retval
    FALSE   the row was written
  @retval
    TRUE    Protocol::write() reported a failure
*/
static bool stat_print(THD *thd, const char *type, uint type_len,
                       const char *file, uint file_len,
                       const char *status, uint status_len)
{
  Protocol *out= thd->protocol;

  out->prepare_for_resend();
  out->store(type, type_len, system_charset_info);
  out->store(file, file_len, system_charset_info);
  out->store(status, status_len, system_charset_info);

  return out->write() ? TRUE : FALSE;
}
4665
4666
/**
  Send the status of one storage engine to the client.

  A three column result set header (Type, Name, Status) is sent first.
  If the handlerton provides a show_status callback it is invoked with
  stat_print as the row writer; an engine without the callback yields
  an empty result set.

  @param thd      connection the result is sent to
  @param db_type  handlerton whose status is requested
  @param stat     kind of status, passed through to the engine

  @retval
    FALSE   success, my_eof() has been called
  @retval
    TRUE    sending the header or the engine callback failed
*/
bool ha_show_status(THD *thd, handlerton *db_type, enum ha_stat_type stat)
{
  List<Item> columns;
  Protocol *out= thd->protocol;

  columns.push_back(new Item_empty_string("Type",10));
  columns.push_back(new Item_empty_string("Name",FN_REFLEN));
  columns.push_back(new Item_empty_string("Status",10));

  if (out->send_fields(&columns,
                       Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
    return TRUE;

  bool failed= FALSE;
  if (db_type->show_status)
    failed= db_type->show_status(db_type, thd, stat_print, stat) ? 1 : 0;

  if (!failed)
    my_eof(thd);
  return failed;
}
4687
4688
4689
/**
4690
  Check if the conditions for row-based binlogging is correct for the table.
4691
4692
  A row in the given table should be replicated if:
4693
  - Row-based replication is enabled in the current thread
4694
  - The binlog is enabled
4695
  - It is not a temporary table
4696
  - The binary log is open
4697
  - The database the table resides in shall be binlogged (binlog_*_db rules)
4698
  - table is not mysql.event
4699
*/
4700
4701
static bool check_table_binlog_row_based(THD *thd, TABLE *table)
4702
{
4703
  if (table->s->cached_row_logging_check == -1)
4704
  {
4705
    int const check(table->s->tmp_table == NO_TMP_TABLE &&
4706
                    binlog_filter->db_ok(table->s->db.str));
4707
    table->s->cached_row_logging_check= check;
4708
  }
4709
4710
  DBUG_ASSERT(table->s->cached_row_logging_check == 0 ||
4711
              table->s->cached_row_logging_check == 1);
4712
4713
  return (thd->current_stmt_binlog_row_based &&
4714
          table->s->cached_row_logging_check &&
4715
          (thd->options & OPTION_BIN_LOG) &&
4716
          mysql_bin_log.is_open());
4717
}
4718
4719
4720
/**
4721
   Write table maps for all (manually or automatically) locked tables
4722
   to the binary log.
4723
4724
   This function will generate and write table maps for all tables
4725
   that are locked by the thread 'thd'.  Either manually locked
4726
   (stored in THD::locked_tables) and automatically locked (stored
4727
   in THD::lock) are considered.
4728
4729
   @param thd     Pointer to THD structure
4730
4731
   @retval 0   All OK
4732
   @retval 1   Failed to write all table maps
4733
4734
   @sa
4735
       THD::lock
4736
       THD::locked_tables
4737
*/
4738
4739
static int write_locked_table_maps(THD *thd)
4740
{
4741
  DBUG_ENTER("write_locked_table_maps");
4742
  DBUG_PRINT("enter", ("thd: 0x%lx  thd->lock: 0x%lx  thd->locked_tables: 0x%lx  "
4743
                       "thd->extra_lock: 0x%lx",
4744
                       (long) thd, (long) thd->lock,
4745
                       (long) thd->locked_tables, (long) thd->extra_lock));
4746
4747
  if (thd->get_binlog_table_maps() == 0)
4748
  {
4749
    MYSQL_LOCK *locks[3];
4750
    locks[0]= thd->extra_lock;
4751
    locks[1]= thd->lock;
4752
    locks[2]= thd->locked_tables;
4753
    for (uint i= 0 ; i < sizeof(locks)/sizeof(*locks) ; ++i )
4754
    {
4755
      MYSQL_LOCK const *const lock= locks[i];
4756
      if (lock == NULL)
4757
        continue;
4758
4759
      TABLE **const end_ptr= lock->table + lock->table_count;
4760
      for (TABLE **table_ptr= lock->table ; 
4761
           table_ptr != end_ptr ;
4762
           ++table_ptr)
4763
      {
4764
        TABLE *const table= *table_ptr;
4765
        DBUG_PRINT("info", ("Checking table %s", table->s->table_name.str));
4766
        if (table->current_lock == F_WRLCK &&
4767
            check_table_binlog_row_based(thd, table))
4768
        {
4769
          int const has_trans= table->file->has_transactions();
4770
          int const error= thd->binlog_write_table_map(table, has_trans);
4771
          /*
4772
            If an error occurs, it is the responsibility of the caller to
4773
            roll back the transaction.
4774
          */
4775
          if (unlikely(error))
4776
            DBUG_RETURN(1);
4777
        }
4778
      }
4779
    }
4780
  }
4781
  DBUG_RETURN(0);
4782
}
4783
4784
4785
typedef bool Log_func(THD*, TABLE*, bool, const uchar*, const uchar*);
4786
4787
static int binlog_log_row(TABLE* table,
4788
                          const uchar *before_record,
4789
                          const uchar *after_record,
4790
                          Log_func *log_func)
4791
{
4792
  if (table->no_replicate)
4793
    return 0;
4794
  bool error= 0;
4795
  THD *const thd= table->in_use;
4796
4797
  if (check_table_binlog_row_based(thd, table))
4798
  {
4799
    DBUG_DUMP("read_set 10", (uchar*) table->read_set->bitmap,
4800
              (table->s->fields + 7) / 8);
4801
    /*
4802
      If there are no table maps written to the binary log, this is
4803
      the first row handled in this statement. In that case, we need
4804
      to write table maps for all locked tables to the binary log.
4805
    */
4806
    if (likely(!(error= write_locked_table_maps(thd))))
4807
    {
4808
      bool const has_trans= table->file->has_transactions();
4809
      error= (*log_func)(thd, table, has_trans, before_record, after_record);
4810
    }
4811
  }
4812
  return error ? HA_ERR_RBR_LOGGING_FAILED : 0;
4813
}
4814
4815
/**
  Call the engine's external_lock() and refresh the cached table flags.

  @param thd        thread requesting the lock change
  @param lock_type  passed unchanged to external_lock()

  @return the value returned by external_lock(); 0 means success
*/
int handler::ha_external_lock(THD *thd, int lock_type)
{
  DBUG_ENTER("handler::ha_external_lock");
  /*
    Holds for both lock and unlock: if get_auto_increment() was called
    (and so may have reserved intervals or taken a table lock), then
    ha_release_auto_increment() must have been called as well.
  */
  DBUG_ASSERT(next_insert_id == 0);

  MYSQL_EXTERNAL_LOCK(lock_type);

  int const rc= external_lock(thd, lock_type);
  /*
    The table flags are cached only when locking succeeded; on failure
    cached_table_flags is left untouched.
  */
  if (!rc)
    cached_table_flags= table_flags();
  DBUG_RETURN(rc);
}
4836
4837
4838
/**
4839
  Check handler usage and reset state of file to after 'open'
4840
*/
4841
int handler::ha_reset()
4842
{
4843
  DBUG_ENTER("ha_reset");
4844
  /* Check that we have called all proper deallocation functions */
4845
  DBUG_ASSERT((uchar*) table->def_read_set.bitmap +
4846
              table->s->column_bitmap_size ==
4847
              (uchar*) table->def_write_set.bitmap);
4848
  DBUG_ASSERT(bitmap_is_set_all(&table->s->all_set));
4849
  DBUG_ASSERT(table->key_read == 0);
4850
  /* ensure that ha_index_end / ha_rnd_end has been called */
4851
  DBUG_ASSERT(inited == NONE);
4852
  /* Free cache used by filesort */
4853
  free_io_cache(table);
4854
  /* reset the bitmaps to point to defaults */
4855
  table->default_column_bitmaps();
4856
  DBUG_RETURN(reset());
4857
}
4858
4859
4860
/**
  Insert a row through the storage engine and log it to the binary log.

  The transaction is marked read-write, the engine's write_row() is
  called, and on success the row is handed to binlog_log_row() as a
  write event.

  The MYSQL_INSERT_ROW_START probe is always paired with a
  MYSQL_INSERT_ROW_END probe, including when the engine or the binary
  log reports an error, so that tracing never sees an unmatched start.

  @param buf  row image to insert

  @retval 0       row written and, if applicable, logged
  @retval other   error code from write_row() or binlog_log_row()
*/
int handler::ha_write_row(uchar *buf)
{
  int error;
  Log_func *log_func= Write_rows_log_event::binlog_row_logging_function;
  DBUG_ENTER("handler::ha_write_row");
  MYSQL_INSERT_ROW_START();

  mark_trx_read_write();

  /* Only log the row if the engine accepted it. */
  if (likely(!(error= write_row(buf))))
    error= binlog_log_row(table, 0, buf, log_func);

  /* Fire the end probe on every exit path, success or failure. */
  MYSQL_INSERT_ROW_END();
  DBUG_RETURN(error);
}
4876
4877
4878
int handler::ha_update_row(const uchar *old_data, uchar *new_data)
4879
{
4880
  int error;
4881
  Log_func *log_func= Update_rows_log_event::binlog_row_logging_function;
4882
4883
  /*
4884
    Some storage engines require that the new record is in record[0]
4885
    (and the old record is in record[1]).
4886
   */
4887
  DBUG_ASSERT(new_data == table->record[0]);
4888
4889
  mark_trx_read_write();
4890
4891
  if (unlikely(error= update_row(old_data, new_data)))
4892
    return error;
4893
  if (unlikely(error= binlog_log_row(table, old_data, new_data, log_func)))
4894
    return error;
4895
  return 0;
4896
}
4897
4898
int handler::ha_delete_row(const uchar *buf)
4899
{
4900
  int error;
4901
  Log_func *log_func= Delete_rows_log_event::binlog_row_logging_function;
4902
4903
  mark_trx_read_write();
4904
4905
  if (unlikely(error= delete_row(buf)))
4906
    return error;
4907
  if (unlikely(error= binlog_log_row(table, buf, 0, log_func)))
4908
    return error;
4909
  return 0;
4910
}
4911
4912
4913
4914
/**
4915
  @details
4916
  use_hidden_primary_key() is called in case of an update/delete when
4917
  (table_flags() and HA_PRIMARY_KEY_REQUIRED_FOR_DELETE) is defined
4918
  but we don't have a primary key
4919
*/
4920
void handler::use_hidden_primary_key()
4921
{
4922
  /* fallback to use all columns in the table to identify row */
4923
  table->use_all_columns();
4924
}