1
// Copyright 2006 Nemanja Trifunovic
4
Permission is hereby granted, free of charge, to any person or organization
5
obtaining a copy of the software and accompanying documentation covered by
6
this license (the "Software") to use, reproduce, display, distribute,
7
execute, and transmit the Software, and to prepare derivative works of the
8
Software, and to permit third-parties to whom the Software is furnished to
9
do so, all subject to the following:
11
The copyright notices in the Software and this entire statement, including
12
the above license grant, this restriction and the following disclaimer,
13
must be included in all copies of the Software, in whole or in part, and
14
all derivative works of the Software, unless such copies or derivative
15
works are solely in the form of machine-executable object code generated by
16
a source language processor.
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.
29
#ifndef DRIZZLED_UTF8_CHECKED_H
30
#define DRIZZLED_UTF8_CHECKED_H
32
#include "drizzled/utf8/core.h"
39
// Common base class for the exceptions declared in this header. It derives
// from std::exception, so library errors can be caught either as
// std::exception or, more narrowly, as this type.
40
class exception : public std::exception {
43
// Exceptions that may be thrown from the library functions.
//
// invalid_code_point: carries the offending 32-bit code point value.
// It is thrown by append() when internal::is_code_point_valid() rejects the
// value, and by next() when the validator reports INVALID_CODE_POINT.
44
class invalid_code_point : public exception {
47
// Stores the rejected code point so the handler can inspect it.
invalid_code_point(uint32_t cp_in) : cp(cp_in) {}
48
// Fixed, human-readable description of the error.
virtual const char* what() const throw() { return "Invalid code point"; }
49
// Returns the code point value passed to the constructor.
uint32_t code_point() const {return cp;}
52
// invalid_utf8: carries a single offending octet of a malformed UTF-8
// sequence. Thrown by next(), prior() and previous() with the octet the
// iterator refers to at the time of the error.
class invalid_utf8 : public exception {
55
// Stores the offending octet.
invalid_utf8 (uint8_t u) : u8(u) {}
56
// Fixed, human-readable description of the error.
virtual const char* what() const throw() { return "Invalid UTF-8"; }
57
// Returns the octet passed to the constructor.
uint8_t utf8_octet() const {return u8;}
60
// invalid_utf16: carries the offending 16-bit code unit of a malformed
// UTF-16 sequence. Thrown by utf16to8() for unpaired or mismatched
// surrogates.
class invalid_utf16 : public exception {
63
// Stores the offending 16-bit code unit.
invalid_utf16 (uint16_t u) : u16(u) {}
64
// Fixed, human-readable description of the error.
virtual const char* what() const throw() { return "Invalid UTF-16"; }
65
// Returns the code unit passed to the constructor.
uint16_t utf16_word() const {return u16;}
68
// not_enough_room: thrown by replace_invalid() and next() when the validator
// reports NOT_ENOUGH_ROOM. Carries no payload.
class not_enough_room : public exception {
70
// Fixed, human-readable description of the error.
virtual const char* what() const throw() { return "Not enough space"; }
73
/// The library API - functions intended to be called by the users
75
template <typename octet_iterator, typename output_iterator>
76
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
78
while (start != end) {
79
octet_iterator sequence_start = start;
80
internal::utf_error err_code = internal::validate_next(start, end);
82
case internal::UTF8_OK :
83
for (octet_iterator it = sequence_start; it != start; ++it)
86
case internal::NOT_ENOUGH_ROOM:
87
throw not_enough_room();
88
case internal::INVALID_LEAD:
89
append (replacement, out);
92
case internal::INCOMPLETE_SEQUENCE:
93
case internal::OVERLONG_SEQUENCE:
94
case internal::INVALID_CODE_POINT:
95
append (replacement, out);
97
// just one replacement mark for the sequence
98
while (internal::is_trail(*start) && start != end)
106
// Convenience overload of replace_invalid() that uses code point 0xfffd
// (passed through internal::mask16) as the replacement marker and forwards to
// the four-argument overload. Returns whatever that overload returns.
template <typename octet_iterator, typename output_iterator>
107
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
109
// Function-local constant: the marker substituted for invalid sequences.
static const uint32_t replacement_marker = internal::mask16(0xfffd);
110
return replace_invalid(start, end, out, replacement_marker);
113
// Encodes the code point `cp` as UTF-8 and writes the octets through `result`,
// post-incrementing it after every octet.
//
// cp      code point to encode
// result  output iterator that receives the octets
// Throws invalid_code_point when internal::is_code_point_valid(cp) is false.
// NOTE(review): the return statement is not visible in this view; given the
// return type, presumably the advanced `result` is returned - confirm.
template <typename octet_iterator>
114
octet_iterator append(uint32_t cp, octet_iterator result)
116
// Reject the value before anything is written.
if (!internal::is_code_point_valid(cp))
117
throw invalid_code_point(cp);
119
// The encoded length is chosen by the magnitude of cp. Each continuation
// octet carries six payload bits tagged with 0x80; the first octet carries
// the remaining high bits tagged with 0xc0, 0xe0 or 0xf0.
if (cp < 0x80) // one octet
120
*(result++) = static_cast<uint8_t>(cp);
121
else if (cp < 0x800) { // two octets
122
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
123
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
125
else if (cp < 0x10000) { // three octets
126
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
127
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
128
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
130
else { // four octets
131
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
132
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
133
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
134
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
139
// Validates the UTF-8 sequence starting at `it` via internal::validate_next()
// and maps the resulting error code onto this library's exceptions.
//
// it   iterator to the start of a sequence; taken by reference and handed to
//      the validator (presumably advanced past the sequence on success -
//      TODO confirm against core.h)
// end  one past the last octet that may be read
// Throws not_enough_room     on NOT_ENOUGH_ROOM
//        invalid_utf8        on INVALID_LEAD, INCOMPLETE_SEQUENCE and
//                            OVERLONG_SEQUENCE, carrying the octet at `it`
//        invalid_code_point  on INVALID_CODE_POINT, carrying the decoded value
// NOTE(review): the declaration of `cp`, the switch header and the final
// return are not visible in this view; presumably the decoded code point is
// returned - confirm.
template <typename octet_iterator>
140
uint32_t next(octet_iterator& it, octet_iterator end)
143
// The validator reports the decoded value through the &cp out-parameter.
internal::utf_error err_code = internal::validate_next(it, end, &cp);
145
case internal::UTF8_OK :
147
case internal::NOT_ENOUGH_ROOM :
148
throw not_enough_room();
149
// These three cases share one handler.
case internal::INVALID_LEAD :
150
case internal::INCOMPLETE_SEQUENCE :
151
case internal::OVERLONG_SEQUENCE :
152
throw invalid_utf8(*it);
153
case internal::INVALID_CODE_POINT :
154
throw invalid_code_point(cp);
159
// Returns the result of next(it, end) without moving the caller's iterator:
// `it` is taken by value, so only the local copy is passed to next().
// Propagates any exception thrown by next().
template <typename octet_iterator>
160
uint32_t peek_next(octet_iterator it, octet_iterator end)
162
return next(it, end);
165
// Moves `it` backwards to the nearest preceding octet that is not a trail
// octet and returns the result of decoding from there with next().
//
// it     taken by reference; on return it refers to the octet the backward
//        scan stopped on
// start  lower bound of the range being scanned
// Throws invalid_utf8 (see the note below), plus anything next() throws.
template <typename octet_iterator>
166
uint32_t prior(octet_iterator& it, octet_iterator start)
168
// Remember the original position: it bounds the forward decode below.
octet_iterator end = it;
169
// Pre-decrement, then test: step back while the octet is a trail octet.
while (internal::is_trail(*(--it)))
171
// NOTE(review): the condition guarding this throw is not visible in this
// view; presumably it checks `it` against `start` - confirm against the
// full file.
throw invalid_utf8(*it); // error - no lead byte in the sequence
172
// Decode through a copy so that `it` stays on the octet just found.
octet_iterator temp = it;
173
return next(temp, end);
176
/// Deprecated in versions that include "prior"
///
/// Moves `it` backwards to the nearest preceding octet that is not a trail
/// octet and returns the result of decoding from there with next().
///
/// it          taken by reference; on return it refers to the octet the
///             backward scan stopped on
/// pass_start  position at which the backward scan gives up
/// Throws invalid_utf8 when the scan reaches pass_start while still on a trail
/// octet, plus anything next() throws.
template <typename octet_iterator>
178
uint32_t previous(octet_iterator& it, octet_iterator pass_start)
180
// Remember the original position: it bounds the forward decode below.
octet_iterator end = it;
181
// Pre-decrement, then test: step back while the octet is a trail octet.
while (internal::is_trail(*(--it)))
182
if (it == pass_start)
183
throw invalid_utf8(*it); // error - no lead byte in the sequence
184
// Decode through a copy so that `it` stays on the octet just found.
octet_iterator temp = it;
185
return next(temp, end);
188
// Runs a counted loop of `n` iterations over the iterator `it`, which is
// taken by reference.
// NOTE(review): the loop body is not visible in this view; presumably each
// iteration calls next(it, end) to step over one code point - confirm.
template <typename octet_iterator, typename distance_type>
189
void advance (octet_iterator& it, distance_type n, octet_iterator end)
191
for (distance_type i = 0; i < n; ++i)
195
// Counts loop iterations while `first < last`, using the iterator's
// difference_type as the counter.
// The loop condition uses operator<, so octet_iterator must support ordering
// comparison.
// NOTE(review): the loop body and the return are not visible in this view;
// presumably each iteration calls next(first, last) and `dist` is returned,
// giving the number of code points in the range - confirm.
template <typename octet_iterator>
196
typename std::iterator_traits<octet_iterator>::difference_type
197
distance (octet_iterator first, octet_iterator last)
199
typename std::iterator_traits<octet_iterator>::difference_type dist;
200
for (dist = 0; first < last; ++dist)
205
// Converts the UTF-16 code units in [start, end) to UTF-8, writing the octets
// through `result` by way of append().
//
// start, end  input range of 16-bit code units
// result      output iterator that receives the UTF-8 octets
// Throws invalid_utf16 for a lead surrogate not followed by a trail
// surrogate and for a lone trail surrogate; also anything append() throws.
// NOTE(review): the final return is not visible in this view; presumably the
// advanced `result` is returned - confirm.
template <typename u16bit_iterator, typename octet_iterator>
206
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
208
while (start != end) {
209
// Read one code unit, masked to 16 bits, and step past it.
uint32_t cp = internal::mask16(*start++);
210
// Take care of surrogate pairs first
211
if (internal::is_lead_surrogate(cp)) {
213
uint32_t trail_surrogate = internal::mask16(*start++);
214
if (internal::is_trail_surrogate(trail_surrogate))
215
// Combine the two surrogates into a single code point.
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
217
// The unit after the lead surrogate is reported as the offender.
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
220
// The lead surrogate itself is reported as the offender.
throw invalid_utf16(static_cast<uint16_t>(cp));
223
// Lone trail surrogate
224
else if (internal::is_trail_surrogate(cp))
225
throw invalid_utf16(static_cast<uint16_t>(cp));
227
// Encode the code point and keep the advanced output iterator.
result = append(cp, result);
232
// Converts the UTF-8 octets in [start, end) to UTF-16 code units written
// through `result`.
//
// start, end  input octet range, decoded one code point at a time by next()
// result      output iterator that receives the 16-bit code units
// Propagates the exceptions thrown by next().
// NOTE(review): the final return is not visible in this view; presumably the
// advanced `result` is returned - confirm.
template <typename u16bit_iterator, typename octet_iterator>
233
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
235
while (start != end) {
236
uint32_t cp = next(start, end);
237
// Values above 0xffff do not fit one code unit: emit lead, then trail.
if (cp > 0xffff) { //make a surrogate pair
238
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
239
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
242
// Otherwise the code point is written as a single code unit.
*result++ = static_cast<uint16_t>(cp);
247
// Encodes 32-bit code points read from `start` as UTF-8 through `result`,
// using append(). Propagates invalid_code_point from append().
// NOTE(review): the enclosing loop and the return are not visible in this
// view; presumably this iterates over [start, end) and returns the advanced
// `result` - confirm.
template <typename octet_iterator, typename u32bit_iterator>
248
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
251
// Read one code point, step past it, and keep the advanced output iterator.
result = append(*(start++), result);
256
// Decodes UTF-8 from `start` with next() and writes each resulting value
// through `result`. Propagates the exceptions thrown by next().
// NOTE(review): the enclosing loop and the return are not visible in this
// view; presumably this iterates over [start, end) and returns the advanced
// `result` - confirm.
template <typename octet_iterator, typename u32bit_iterator>
257
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
260
// Store one decoded value and step the output iterator.
(*result++) = next(start, end);
265
// The iterator class
//
// Bidirectional iterator adaptor over an octet iterator. It is tagged
// std::bidirectional_iterator_tag with value type uint32_t; dereferencing
// decodes with next() and decrementing steps back with prior().
// The iterator also stores the bounds of the octet range it was created for.
// NOTE(review): the declaration of the member `it`, the access specifier and
// the bodies/returns of the increment operators are not visible in this view.
266
template <typename octet_iterator>
267
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
269
// Bounds of the underlying octet range, used for validation and comparison.
octet_iterator range_start;
270
octet_iterator range_end;
273
// Builds an iterator at octet_it within [range_start_in, range_end_in].
// Throws std::out_of_range when octet_it lies outside those bounds.
// The check uses operator< and operator>, so octet_iterator must support
// ordering comparison.
explicit iterator (const octet_iterator& octet_it,
274
const octet_iterator& range_start_in,
275
const octet_iterator& range_end_in) :
276
it(octet_it), range_start(range_start_in), range_end(range_end_in)
278
if (it < range_start || it > range_end)
279
throw std::out_of_range("Invalid utf-8 iterator position");
281
// the default "big three" are OK
282
// Returns a copy of the wrapped octet iterator.
octet_iterator base () const { return it; }
283
// Decodes at the current position through a temporary copy, so the
// iterator itself does not move. Propagates the exceptions of next().
uint32_t operator * () const
285
octet_iterator temp = it;
286
return next(temp, range_end);
288
// Equal when both wrap the same octet position. Comparing iterators built
// over different ranges throws std::logic_error.
bool operator == (const iterator& rhs) const
290
if (range_start != rhs.range_start || range_end != rhs.range_end)
291
throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
292
return (it == rhs.it);
294
// Negation of operator==, including its std::logic_error behaviour.
bool operator != (const iterator& rhs) const
296
return !(operator == (rhs));
298
// Pre-increment. NOTE(review): body not visible in this view.
iterator& operator ++ ()
303
// Post-increment: starts by copying the current state into `temp`.
// NOTE(review): the rest of the body is not visible in this view.
iterator operator ++ (int)
305
iterator temp = *this;
309
// Pre-decrement: steps the wrapped iterator back with prior(), bounded by
// range_start.
iterator& operator -- ()
311
prior(it, range_start);
314
// Post-decrement: copies the current state, then steps back with prior().
// NOTE(review): the return is not visible; presumably `temp` is returned.
iterator operator -- (int)
316
iterator temp = *this;
317
prior(it, range_start);
323
} // namespace drizzled
325
#endif /* DRIZZLED_UTF8_CHECKED_H */