~drizzle-trunk/drizzle/development

873.2.27 by Monty Taylor
Added in utf8cpp. We want this to be an external lib, but it needs packaged.
1
// Copyright 2006 Nemanja Trifunovic

/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
26
2234 by Brian Aker
Mass removal of ifdef/endif in favor of pragma once.
27
#pragma once
1823.4.3 by Monty Taylor
Merged in new lib version.
28
 
29
 #include <iterator>
30
 
1823.4.2 by Monty Taylor
Cleaned things up a little bit namespace wise
31
namespace drizzled
{
namespace utf8
{

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // Derived values; none of the functions in this header read them.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    /// Returns the low 8 bits of oc as an unsigned octet.
    /// Masking first means a negative (signed char) input yields 0x80..0xff.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    /// Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    /// True when oc is a UTF-8 continuation octet, i.e. its top two bits are 10.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    /// True when cp lies in [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX].
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    /// True when cp lies in [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when cp is any surrogate (lead or trail): [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when cp is accepted as a code point by this library: not above
    /// CODE_POINT_MAX, not a surrogate, and neither 0xfffe nor 0xffff.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    /// Returns the length (1-4) of the UTF-8 sequence announced by the lead
    /// octet at lead_it, or 0 when that octet is not a valid lead.
    /// lead_it is dereferenced unconditionally, so it must be dereferenceable.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)                // 0xxxxxxx
            return 1;
        else if ((lead >> 5) == 0x6)    // 110xxxxx
            return 2;
        else if ((lead >> 4) == 0xe)    // 1110xxxx
            return 3;
        else if ((lead >> 3) == 0x1e)   // 11110xxx
            return 4;
        else
            return 0;
    }

    /// True when cp was decoded from a sequence of `length` octets although a
    /// shorter encoding exists. Code points >= 0x10000 are never reported.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }

        return false;
    }

    /// Result codes produced by the decoding/validation helpers below.
    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    /// get_sequence_x functions decode utf-8 sequences of the length x
    //
    // Shared contract:
    //  - `it` is advanced while decoding. On UTF8_OK it is left ON the last
    //    octet of the sequence (not past it); on failure it is left wherever
    //    decoding stopped. Callers that need rollback must save it themselves.
    //  - NOT_ENOUGH_ROOM: `end` was reached before the sequence was complete.
    //  - INCOMPLETE_SEQUENCE: an expected trail octet is not a trail octet.
    //  - `code_point` may be null; it is written only on UTF8_OK.
    //  - The lead octet is masked, not verified; overlong forms and invalid
    //    code points are not detected here.

    /// Decodes a one-octet sequence.
    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it != end) {
            if (code_point)
                *code_point = mask8(*it);
            return UTF8_OK;
        }
        return NOT_ENOUGH_ROOM;
    }

    /// Decodes a two-octet sequence (5 payload bits from the lead, 6 from the trail).
    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        utf_error ret_code = NOT_ENOUGH_ROOM;

        if (it != end) {
            uint32_t cp = mask8(*it);
            if (++it != end) {
                if (is_trail(*it)) {
                    // "& 0x7ff" drops the lead's marker bits after the shift.
                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);

                    if (code_point)
                        *code_point = cp;
                    ret_code = UTF8_OK;
                }
                else
                    ret_code = INCOMPLETE_SEQUENCE;
            }
            else
                ret_code = NOT_ENOUGH_ROOM;
        }

        return ret_code;
    }

    /// Decodes a three-octet sequence (4 + 6 + 6 payload bits).
    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        utf_error ret_code = NOT_ENOUGH_ROOM;

        if (it != end) {
            uint32_t cp = mask8(*it);
            if (++it != end) {
                if (is_trail(*it)) {
                    cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
                    if (++it != end) {
                        if (is_trail(*it)) {
                            cp += (*it) & 0x3f;

                            if (code_point)
                                *code_point = cp;
                            ret_code = UTF8_OK;
                        }
                        else
                            ret_code = INCOMPLETE_SEQUENCE;
                    }
                    else
                        ret_code = NOT_ENOUGH_ROOM;
                }
                else
                    ret_code = INCOMPLETE_SEQUENCE;
            }
            else
                ret_code = NOT_ENOUGH_ROOM;
        }

        return ret_code;
    }

    /// Decodes a four-octet sequence (3 + 6 + 6 + 6 payload bits).
    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        utf_error ret_code = NOT_ENOUGH_ROOM;

        if (it != end) {
            uint32_t cp = mask8(*it);
            if (++it != end) {
                if (is_trail(*it)) {
                    cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
                    if (++it != end) {
                        if (is_trail(*it)) {
                            cp += (mask8(*it) << 6) & 0xfff;
                            if (++it != end) {
                                if (is_trail(*it)) {
                                    cp += (*it) & 0x3f;

                                    if (code_point)
                                        *code_point = cp;
                                    ret_code = UTF8_OK;
                                }
                                else
                                    ret_code = INCOMPLETE_SEQUENCE;
                            }
                            else
                                ret_code = NOT_ENOUGH_ROOM;
                        }
                        else
                            ret_code = INCOMPLETE_SEQUENCE;
                    }
                    else
                        ret_code = NOT_ENOUGH_ROOM;
                }
                else
                    ret_code = INCOMPLETE_SEQUENCE;
            }
            else
                ret_code = NOT_ENOUGH_ROOM;
        }

        return ret_code;
    }

    /// Decodes and validates the UTF-8 sequence starting at `it`.
    ///
    /// On success returns UTF8_OK, stores the decoded value through
    /// `code_point` when it is non-null, and leaves `it` one past the last
    /// octet of the sequence. On any failure `it` is restored to its value on
    /// entry and the specific utf_error is returned.
    ///
    /// An empty range (it == end) yields NOT_ENOUGH_ROOM without
    /// dereferencing `it`.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        // Nothing to read: the lead octet must not be inspected in this case.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with i.e. stream iterators
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        octet_difference_type length = sequence_length(it);
        if (length == 0)
            return INVALID_LEAD;

        // Now that we have a valid sequence length, get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 1:
                err = get_sequence_1(it, end, &cp);
                break;
            case 2:
                err = get_sequence_2(it, end, &cp);
                break;
            case 3:
                err = get_sequence_3(it, end, &cp);
                break;
            case 4:
                err = get_sequence_4(it, end, &cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (is_code_point_valid(cp)) {
                if (!is_overlong_sequence(cp, length)){
                    // Passed! Return here.
                    if (code_point)
                        *code_point = cp;
                    // get_sequence_x stops on the last octet; step past it.
                    ++it;
                    return UTF8_OK;
                }
                else
                    err = OVERLONG_SEQUENCE;
            }
            else
                err = INVALID_CODE_POINT;
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    /// Same as the three-argument validate_next, discarding the decoded code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal

    /// The library API - functions intended to be called by the users

    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    /// Returns an iterator to the first octet of the first sequence in
    /// [start, end) that fails validation, or `end` when every sequence is valid.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            internal::utf_error err_code = internal::validate_next(result, end);
            // On failure validate_next has rewound result to the sequence start.
            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
    }

    /// True when [start, end) contains only valid sequences; an empty range is valid.
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (find_invalid(start, end) == end);
    }

    /// True when [it, end) begins with the three octets of `bom`.
    /// Ranges shorter than three octets yield false without reading past `end`.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (internal::mask8(*it))   == bom[2])
           );
    }

    //Deprecated in release 2.3
    /// True when the octets at `it` match `bom`. No end iterator is taken, so
    /// up to three octets are dereferenced without any bounds check; prefer
    /// starts_with_bom.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (internal::mask8(*it++)) == bom[0] &&
            (internal::mask8(*it++)) == bom[1] &&
            (internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8
} // namespace drizzled
873.2.27 by Monty Taylor
Added in utf8cpp. We want this to be an external lib, but it needs packaged.
350
351
352