~drizzle-trunk/drizzle/development

873.2.27 by Monty Taylor
Added in utf8cpp. We want this to be an external lib, but it needs packaged.
1
// Copyright 2006 Nemanja Trifunovic
2
3
/*
4
Permission is hereby granted, free of charge, to any person or organization
5
obtaining a copy of the software and accompanying documentation covered by
6
this license (the "Software") to use, reproduce, display, distribute,
7
execute, and transmit the Software, and to prepare derivative works of the
8
Software, and to permit third-parties to whom the Software is furnished to
9
do so, all subject to the following:
10
11
The copyright notices in the Software and this entire statement, including
12
the above license grant, this restriction and the following disclaimer,
13
must be included in all copies of the Software, in whole or in part, and
14
all derivative works of the Software, unless such copies or derivative
15
works are solely in the form of machine-executable object code generated by
16
a source language processor.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.
25
*/
26
1122.2.10 by Monty Taylor
Fixed all of the include guards.
27
#ifndef DRIZZLED_UTF8_CORE_H
28
#define DRIZZLED_UTF8_CORE_H
1823.4.3 by Monty Taylor
Merged in new lib version.
29
 
30
 #include <stdint.h>

 #include <iterator>
31
 
1823.4.2 by Monty Taylor
Cleaned things up a little bit namespace wise
32
namespace drizzled
{
namespace utf8
{

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // Precomputed terms for surrogate-pair arithmetic. They are not referenced
    // anywhere in this header; presumably the UTF-16 conversion code that
    // includes it uses them.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    /// Returns the low 8 bits of oc as an unsigned octet. This strips the sign
    /// extension that a (signed) char picks up when promoted to int.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    /// Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    /// True when oc is a UTF-8 continuation octet, i.e. has the form 10xxxxxx.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    /// True when cp lies in the leading (high) surrogate range.
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    /// True when cp lies in the trailing (low) surrogate range.
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when cp is any surrogate (the lead and trail ranges are adjacent,
    /// so a single range test covers both).
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when cp is accepted as a code point by this library: not above
    /// CODE_POINT_MAX, not a surrogate, and neither 0xfffe nor 0xffff.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    /// Returns the number of octets (1-4) that the lead octet at lead_it
    /// announces, or 0 when that octet cannot start a sequence.
    /// lead_it is dereferenced unconditionally, so it must point at a
    /// readable octet.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)                // 0xxxxxxx
            return 1;
        else if ((lead >> 5) == 0x6)    // 110xxxxx
            return 2;
        else if ((lead >> 4) == 0xe)    // 1110xxxx
            return 3;
        else if ((lead >> 3) == 0x1e)   // 11110xxx
            return 4;
        else
            return 0;
    }

    /// True when cp was decoded from more octets (length) than its shortest
    /// encoding needs. Code points >= 0x10000 are never reported as overlong.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }

        return false;
    }

    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    /// get_sequence_x functions decode utf-8 sequences of the length x
    ///
    /// Shared contract: "it" points at the lead octet on entry. On UTF8_OK it
    /// is left on the LAST octet of the sequence (not one past it) and the
    /// decoded value is stored through code_point when that pointer is
    /// non-null. On failure "it" is left wherever decoding stopped; the
    /// functions do not rewind it. NOT_ENOUGH_ROOM means the range ended
    /// early; INCOMPLETE_SEQUENCE means a non-continuation octet was found
    /// where a continuation octet was required.

    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        if (code_point)
            *code_point = mask8(*it);
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;
        uint32_t cp = mask8(*it);

        if (++it == end)
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        // 110yyyyy 10xxxxxx -> yyyyyxxxxxx
        cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;
        uint32_t cp = mask8(*it);

        if (++it == end)
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        // 1110zzzz 10yyyyyy ... -> zzzzyyyyyy000000
        cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);

        if (++it == end)
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        cp += (*it) & 0x3f;

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;
        uint32_t cp = mask8(*it);

        if (++it == end)
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        // 11110uuu 10zzzzzz ... -> uuuzzzzzz000000000000
        cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);

        if (++it == end)
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        cp += (mask8(*it) << 6) & 0xfff;

        if (++it == end)
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        cp += (*it) & 0x3f;

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    /// Decodes and validates the single UTF-8 sequence starting at "it".
    ///
    /// On UTF8_OK, "it" is advanced one past the sequence and the decoded
    /// value is stored through code_point when that pointer is non-null.
    /// On any error "it" is restored to its entry value and *code_point is
    /// left untouched. An empty range (it == end) yields NOT_ENOUGH_ROOM.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        // Nothing to decode. This must be checked before sequence_length(),
        // which dereferences the iterator unconditionally.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with i.e. stream iterators
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        octet_difference_type length = sequence_length(it);
        if (length == 0)
            return INVALID_LEAD;

        // Now that we have a valid sequence length, get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 1:
                err = get_sequence_1(it, end, &cp);
                break;
            case 2:
                err = get_sequence_2(it, end, &cp);
                break;
            case 3:
                err = get_sequence_3(it, end, &cp);
                break;
            case 4:
                err = get_sequence_4(it, end, &cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (is_code_point_valid(cp)) {
                if (!is_overlong_sequence(cp, length)) {
                    // Passed! Return here.
                    if (code_point)
                        *code_point = cp;
                    // get_sequence_x leaves "it" on the last octet; step past it.
                    ++it;
                    return UTF8_OK;
                }
                else
                    err = OVERLONG_SEQUENCE;
            }
            else
                err = INVALID_CODE_POINT;
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    /// Same as the three-argument overload, but discards the decoded value.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal

    /// The library API - functions intended to be called by the users

    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    /// Returns an iterator to the first octet in [start, end) at which
    /// validation fails, or end when every sequence in the range validates.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            // validate_next advances result on success and leaves it at the
            // start of the offending sequence on failure.
            internal::utf_error err_code = internal::validate_next(result, end);
            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
    }

    /// True when every sequence in [start, end) validates (an empty range is valid).
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (find_invalid(start, end) == end);
    }

    /// True when [it, end) begins with the three-octet byte order mark.
    /// Each read is guarded by an end check, so ranges shorter than three
    /// octets simply return false.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (internal::mask8(*it))   == bom[2])
           );
    }

    // Deprecated in release 2.3
    /// True when the octets starting at "it" match the byte order mark.
    /// There is no end iterator, so up to three octets are read with no
    /// bounds check; starts_with_bom is the range-checked form.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (internal::mask8(*it++)) == bom[0] &&
            (internal::mask8(*it++)) == bom[1] &&
            (internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8
} // namespace drizzled
873.2.27 by Monty Taylor
Added in utf8cpp. We want this to be an external lib, but it needs packaged.
351
1122.2.10 by Monty Taylor
Fixed all of the include guards.
352
#endif /* DRIZZLED_UTF8_CORE_H */
873.2.27 by Monty Taylor
Added in utf8cpp. We want this to be an external lib, but it needs packaged.
353
354