~drizzle-trunk/drizzle/development

« back to all changes in this revision

Viewing changes to drizzled/utf8/core.h

Committer: Brian Aker
Date: 2010-10-10 02:00:34 UTC
mfrom: (1830.1.5 trunk-drizzle)
Revision ID: brian@tangent.org-20101010020034-d67x3d09fssxq1v6

Merge rollup of utf8 and table encapsulation.

files added:
drizzled/utf8/checked.h

drizzled/utf8/core.h

drizzled/utf8/unchecked.h

files modified:
drizzled/field/datetime.cc

drizzled/field/double.cc

drizzled/field/enum.cc

drizzled/field/int64_t.cc

drizzled/field/long.cc

drizzled/field/real.cc

drizzled/field/timestamp.cc

drizzled/include.am

drizzled/plugin/event_observer.cc

drizzled/statement/alter_table.cc

drizzled/statement/drop_index.cc

drizzled/table.cc

drizzled/table.h

drizzled/utf8/utf8.h

plugin/blitzdb/blitzcmp.cc

plugin/blitzdb/blitzdata.cc

plugin/blitzdb/ha_blitz.cc

plugin/haildb/haildb_engine.cc

plugin/haildb/plugin.ini

plugin/pbms/src/systab_backup_ms.cc

plugin/pbms/src/systab_cloud_ms.cc

plugin/pbms/src/systab_dump_ms.cc

plugin/pbms/src/systab_httpheader_ms.cc

plugin/pbms/src/systab_variable_ms.cc

plugin/pbms/src/system_table_ms.cc

plugin/pbxt/src/ha_pbxt.cc

plugin/pbxt/src/systab_xt.cc

Show diffs side-by-side

added added

removed removed

drizzled/utf8/core.h

Permission is hereby granted, free of charge, to any person or organization

obtaining a copy of the software and accompanying documentation covered by

this license (the "Software") to use, reproduce, display, distribute,

execute, and transmit the Software, and to prepare derivative works of the

Software, and to permit third-parties to whom the Software is furnished to

do so, all subject to the following:

The copyright notices in the Software and this entire statement, including

the above license grant, this restriction and the following disclaimer,

must be included in all copies of the Software, in whole or in part, and

all derivative works of the Software, unless such copies or derivative

works are solely in the form of machine-executable object code generated by

a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT

SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE

FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,

ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER

DEALINGS IN THE SOFTWARE.

#ifndef DRIZZLED_UTF8_CORE_H

#define DRIZZLED_UTF8_CORE_H

#include <stdint.h>

#include <iterator>

namespace drizzled

{

namespace utf8

{

// Helper code - not intended to be directly called by the library users. May be changed at any time

namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;

    // NOTE(review): the two offsets below are not referenced anywhere in this
    // header; presumably the UTF-16 conversion code in a sibling header uses
    // them - confirm before removing.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    /// Returns the low 8 bits of oc as an unsigned octet, so that an octet
    /// held in a signed type (e.g. plain char) compares as 0x00..0xff.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    /// Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    /// True when oc has the bit pattern 10xxxxxx (a UTF-8 continuation octet).
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    /// True when cp lies in the leading (high) surrogate range.
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    /// True when cp lies in the trailing (low) surrogate range.
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when cp is any surrogate, leading or trailing.
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when cp is accepted as a code point by this library: it must not
    /// exceed CODE_POINT_MAX, must not be a surrogate, and must not be
    /// 0xfffe or 0xffff.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    /// Returns the number of octets (1 to 4) announced by the lead octet at
    /// lead_it, or 0 when that octet is not a valid lead octet.
    /// lead_it is dereferenced unconditionally, so it must be dereferenceable.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        const uint8_t lead = mask8(*lead_it);

        if (lead < 0x80)            // 0xxxxxxx
            return 1;
        if ((lead >> 5) == 0x6)     // 110xxxxx
            return 2;
        if ((lead >> 4) == 0xe)     // 1110xxxx
            return 3;
        if ((lead >> 3) == 0x1e)    // 11110xxx
            return 4;

        return 0;
    }

    /// True when cp was decoded from a sequence of `length` octets although a
    /// shorter sequence could encode it. Code points of 0x10000 and above are
    /// never reported as overlong.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80)
            return length != 1;
        if (cp < 0x800)
            return length != 2;
        if (cp < 0x10000)
            return length != 3;

        return false;
    }

    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    /// get_sequence_x functions decode utf-8 sequences of the length x
    ///
    /// Shared contract:
    ///  - `it` must point at the lead octet; [it, end) is the readable range.
    ///  - On UTF8_OK the decoded value is stored through code_point (when it
    ///    is not null) and `it` is left ON the last octet of the sequence,
    ///    not past it.
    ///  - NOT_ENOUGH_ROOM: `end` was reached before the sequence was complete.
    ///  - INCOMPLETE_SEQUENCE: an octet that should have been a trail octet
    ///    was not one.
    ///  - On failure `it` is left wherever decoding stopped; callers that
    ///    need the original position must save it themselves.

    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (!(it != end))
            return NOT_ENOUGH_ROOM;

        if (code_point)
            *code_point = mask8(*it);
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (!(it != end))
            return NOT_ENOUGH_ROOM;
        uint32_t cp = mask8(*it);

        if (!(++it != end))
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (!(it != end))
            return NOT_ENOUGH_ROOM;
        uint32_t cp = mask8(*it);

        if (!(++it != end))
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);

        if (!(++it != end))
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        cp += (*it) & 0x3f;

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (!(it != end))
            return NOT_ENOUGH_ROOM;
        uint32_t cp = mask8(*it);

        if (!(++it != end))
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);

        if (!(++it != end))
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        cp += (mask8(*it) << 6) & 0xfff;

        if (!(++it != end))
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        cp += (*it) & 0x3f;

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    /// Decodes and validates the single UTF-8 sequence starting at `it`.
    ///
    /// @param it          in/out: start of the sequence. On UTF8_OK it is
    ///                    advanced one past the last octet of the sequence;
    ///                    on any error it is restored to its value on entry.
    /// @param end         one past the last readable octet.
    /// @param code_point  optional (may be null): receives the decoded code
    ///                    point, written only on UTF8_OK.
    /// @return UTF8_OK, or the first problem found: NOT_ENOUGH_ROOM (empty
    ///         range or truncated sequence), INVALID_LEAD,
    ///         INCOMPLETE_SEQUENCE, INVALID_CODE_POINT or OVERLONG_SEQUENCE.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        // An empty range has no lead octet to inspect. This must be checked
        // before sequence_length(), which dereferences `it` unconditionally.
        if (!(it != end))
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with i.e. stream iterators
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        const octet_difference_type length = sequence_length(it);
        if (length == 0)
            return INVALID_LEAD;

        // Now that we have a valid sequence length, get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 1:
                err = get_sequence_1(it, end, &cp);
                break;
            case 2:
                err = get_sequence_2(it, end, &cp);
                break;
            case 3:
                err = get_sequence_3(it, end, &cp);
                break;
            case 4:
                err = get_sequence_4(it, end, &cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (!is_code_point_valid(cp)) {
                err = INVALID_CODE_POINT;
            }
            else if (is_overlong_sequence(cp, length)) {
                err = OVERLONG_SEQUENCE;
            }
            else {
                // Passed! Return here.
                if (code_point)
                    *code_point = cp;
                ++it;   // get_sequence_x stops ON the last octet; step past it
                return UTF8_OK;
            }
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    /// Same as the three-argument validate_next, discarding the code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, static_cast<uint32_t*>(0));
    }

} // namespace internal

305

306

/// The library API - functions intended to be called by the users

307

308

// Byte order mark

309

// The three octets EF BB BF, i.e. U+FEFF encoded as UTF-8.
const uint8_t bom[3] = {0xEF, 0xBB, 0xBF};

310

311

template <typename octet_iterator>

312

octet_iterator find_invalid(octet_iterator start, octet_iterator end)

313

{

314

octet_iterator result = start;

315

while (result != end) {

316

internal::utf_error err_code = internal::validate_next(result, end);

317

if (err_code != internal::UTF8_OK)

318

return result;

319

}

320

return result;

321

}

322

323

/// True when find_invalid reports no invalid sequence in [start, end).
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    octet_iterator first_bad = find_invalid(start, end);
    return first_bad == end;
}

328

329

template <typename octet_iterator>

330

inline bool starts_with_bom (octet_iterator it, octet_iterator end)

331

{

332

return (

333

((it != end) && (internal::mask8(*it++)) == bom[0]) &&

334

((it != end) && (internal::mask8(*it++)) == bom[1]) &&

335

((it != end) && (internal::mask8(*it)) == bom[2])

336

);

337

}

338

339

//Deprecated in release 2.3

340

template <typename octet_iterator>

341

inline bool is_bom (octet_iterator it)

342

{

343

return (

344

(internal::mask8(*it++)) == bom[0] &&

345

(internal::mask8(*it++)) == bom[1] &&

346

(internal::mask8(*it)) == bom[2]

347

);

348

}

349

} // namespace utf8

350

} // namespace drizzled

351

352

#endif /* DRIZZLED_UTF8_CORE_H */

353

354

Older »