// Copyright 2006 Nemanja Trifunovic

/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
28
#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29
#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
31
#include "drizzled/utf8/core.h"
36
// Exceptions that may be thrown from the library functions.
37
/// Exception carrying a 32-bit value that was rejected as a code point.
class invalid_code_point : public std::exception {
    uint32_t cp;   // the rejected value, as handed to the constructor
public:
    /// Stores the offending value so the handler can inspect it.
    invalid_code_point(uint32_t code_point_value) : cp(code_point_value) {}
    /// Fixed diagnostic text; never throws.
    virtual const char* what() const throw() { return "Invalid code point"; }
    /// Returns the value passed to the constructor.
    uint32_t code_point() const { return cp; }
};
45
/// Exception carrying a single octet that was rejected as UTF-8 input.
class invalid_utf8 : public std::exception {
    uint8_t u8;   // the rejected octet, as handed to the constructor
public:
    /// Stores the offending octet so the handler can inspect it.
    invalid_utf8 (uint8_t u) : u8(u) {}
    /// Fixed diagnostic text; never throws.
    virtual const char* what() const throw() { return "Invalid UTF-8"; }
    /// Returns the octet passed to the constructor.
    uint8_t utf8_octet() const { return u8; }
};
53
/// Exception carrying a single 16-bit unit that was rejected as UTF-16 input.
class invalid_utf16 : public std::exception {
    uint16_t u16;   // the rejected 16-bit unit, as handed to the constructor
public:
    /// Stores the offending unit so the handler can inspect it.
    invalid_utf16 (uint16_t u) : u16(u) {}
    /// Fixed diagnostic text; never throws.
    virtual const char* what() const throw() { return "Invalid UTF-16"; }
    /// Returns the 16-bit unit passed to the constructor.
    uint16_t utf16_word() const { return u16; }
};
61
/// Exception signalling that a range was too short for the requested operation.
class not_enough_room : public std::exception {
public:
    /// Fixed diagnostic text; never throws.
    virtual const char* what() const throw() { return "Not enough space"; }
};
66
/// The library API - functions intended to be called by the users
68
template <typename octet_iterator, typename output_iterator>
69
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
71
while (start != end) {
72
octet_iterator sequence_start = start;
73
internal::utf_error err_code = internal::validate_next(start, end);
76
for (octet_iterator it = sequence_start; it != start; ++it)
79
case internal::NOT_ENOUGH_ROOM:
80
throw not_enough_room();
81
case internal::INVALID_LEAD:
82
append (replacement, out);
85
case internal::INCOMPLETE_SEQUENCE:
86
case internal::OVERLONG_SEQUENCE:
87
case internal::INVALID_CODE_POINT:
88
append (replacement, out);
90
// just one replacement mark for the sequence
91
while (internal::is_trail(*start) && start != end)
99
template <typename octet_iterator, typename output_iterator>
100
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
102
static const uint32_t replacement_marker = internal::mask16(0xfffd);
103
return replace_invalid(start, end, out, replacement_marker);
106
template <typename octet_iterator>
107
octet_iterator append(uint32_t cp, octet_iterator result)
109
if (!internal::is_code_point_valid(cp))
110
throw invalid_code_point(cp);
112
if (cp < 0x80) // one octet
113
*(result++) = static_cast<uint8_t>(cp);
114
else if (cp < 0x800) { // two octets
115
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
116
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
118
else if (cp < 0x10000) { // three octets
119
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
120
*(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
121
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
123
else if (cp <= internal::CODE_POINT_MAX) { // four octets
124
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
125
*(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80);
126
*(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
127
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
130
throw invalid_code_point(cp);
135
template <typename octet_iterator>
136
uint32_t next(octet_iterator& it, octet_iterator end)
139
internal::utf_error err_code = internal::validate_next(it, end, &cp);
143
case internal::NOT_ENOUGH_ROOM :
144
throw not_enough_room();
145
case internal::INVALID_LEAD :
146
case internal::INCOMPLETE_SEQUENCE :
147
case internal::OVERLONG_SEQUENCE :
148
throw invalid_utf8(*it);
149
case internal::INVALID_CODE_POINT :
150
throw invalid_code_point(cp);
155
/// Same as next(), but works on a copy of `it`, so the caller's iterator is
/// left where it was.
///
/// @return the code point produced by next(); throws whatever next() throws.
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    return next(it, end);
}
161
template <typename octet_iterator>
162
uint32_t prior(octet_iterator& it, octet_iterator start)
164
octet_iterator end = it;
165
while (internal::is_trail(*(--it)))
167
throw invalid_utf8(*it); // error - no lead byte in the sequence
168
octet_iterator temp = it;
169
return next(temp, end);
172
/// Deprecated in versions that include "prior"
173
template <typename octet_iterator>
174
uint32_t previous(octet_iterator& it, octet_iterator pass_start)
176
octet_iterator end = it;
177
while (internal::is_trail(*(--it)))
178
if (it == pass_start)
179
throw invalid_utf8(*it); // error - no lead byte in the sequence
180
octet_iterator temp = it;
181
return next(temp, end);
184
/// Calls next(it, end) `n` times, moving `it` forward by `n` code points.
/// Does nothing when `n` is zero or negative. Propagates anything next() throws.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    for (distance_type i = 0; i < n; ++i)
        next(it, end);
}
191
/// Counts the code points in [first, last) by decoding them one at a time
/// with next(). The iterators must support `<`.
///
/// @return the number of next() calls needed to bring `first` up to `last`.
/// Propagates anything next() throws.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typename std::iterator_traits<octet_iterator>::difference_type dist;
    for (dist = 0; first < last; ++dist)
        next(first, last);
    return dist;
}
201
template <typename u16bit_iterator, typename octet_iterator>
202
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
204
while (start != end) {
205
uint32_t cp = internal::mask16(*start++);
206
// Take care of surrogate pairs first
207
if (internal::is_surrogate(cp)) {
209
uint32_t trail_surrogate = internal::mask16(*start++);
210
if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
211
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
213
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
216
throw invalid_utf16(static_cast<uint16_t>(*start));
219
result = append(cp, result);
224
template <typename u16bit_iterator, typename octet_iterator>
225
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
227
while (start != end) {
228
uint32_t cp = next(start, end);
229
if (cp > 0xffff) { //make a surrogate pair
230
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
231
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
234
*result++ = static_cast<uint16_t>(cp);
239
/// Converts the 32-bit code points in [start, end) to UTF-8 by passing each
/// one to append().
///
/// @return `result` advanced past the octets written.
/// Propagates anything append() throws (e.g. invalid_code_point).
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    while (start != end)
        result = append(*(start++), result);

    return result;
}
248
/// Converts the UTF-8 octets in [start, end) to 32-bit code points, writing
/// one value through `result` per next() call.
///
/// @return `result` advanced past the values written.
/// Propagates anything next() throws on invalid input.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    // `!=` matches the other conversion loops in this header and only needs
    // equality comparison from the iterator type.
    while (start != end)
        (*result++) = next(start, end);

    return result;
}
257
// The iterator class
258
/// Bidirectional iterator adaptor that walks an octet range one code point at
/// a time. Dereferencing decodes with next(); stepping uses next() / prior(),
/// so every operation can throw whatever those functions throw.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;            // current position inside the range
    octet_iterator range_start;   // lower bound handed to prior()
    octet_iterator range_end;     // upper bound handed to next()
public:
    /// Value-initializes all three positions.
    iterator () : it(), range_start(), range_end() {}

    /// Wraps `octet_it`, which must lie within [range_start, range_end].
    /// @throws std::out_of_range when it does not.
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& range_start,
                       const octet_iterator& range_end) :
        it(octet_it), range_start(range_start), range_end(range_end)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }
    // the default "big three" are OK

    /// Returns the underlying octet position.
    octet_iterator base () const { return it; }

    /// Decodes the code point at the current position without moving.
    uint32_t operator * () const
    {
        octet_iterator temp = it;
        return next(temp, range_end);
    }

    /// Compares positions.
    /// @throws std::logic_error when the two iterators were built over different ranges.
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }

    bool operator != (const iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    /// Pre-increment: steps forward one code point.
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }

    /// Post-increment: steps forward, returns the previous position.
    iterator operator ++ (int)
    {
        iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    /// Pre-decrement: steps back one code point.
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }

    /// Post-decrement: steps back, returns the previous position.
    iterator operator -- (int)
    {
        iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
}; // class iterator
316
#endif //header guard