~azzar1/unity/add-show-desktop-key

« back to all changes in this revision

Viewing changes to lib/common/util.py

  • Committer: wagrant
  • Date: 2008-08-20 00:04:12 UTC
  • Revision ID: svn-v3-trunk0:2b9c9e99-6f39-0410-b283-7f802c844ae2:trunk:1033
python-console: Move incomplete_utf8_sequence() into common.util, where it
                probably belongs.

Show diffs side-by-side

added added

removed removed

Lines of Context:
214
214
    # produced by json.js module when it encodes date objects.
215
215
    return datetime.datetime.strptime(str, "%Y-%m-%dT%H:%M:%SZ")
216
216
 
 
217
def incomplete_utf8_sequence(byteseq):
    r"""
    str/bytes/bytearray -> int
    Given a UTF-8-encoded byte sequence, returns the number of bytes at
    the end of the sequence which comprise an incomplete UTF-8 character
    sequence.

    If the sequence is empty or ends with a complete character OR INVALID
    sequence, returns 0.
    Otherwise, returns 1-3 indicating the number of bytes in the final
    incomplete character sequence.

    Items of byteseq may be either length-1 strings (as produced by
    iterating a str) or integers (as produced by iterating a bytearray, or
    a bytes object on Python 3).

    Does not check any bytes before the final sequence for correctness.
    Only the high-bit pattern of each byte is examined: overlong encodings
    and out-of-range lead bytes (e.g. 0xc0 or 0xf5) are not rejected.

    >>> incomplete_utf8_sequence("")
    0
    >>> incomplete_utf8_sequence("xy")
    0
    >>> incomplete_utf8_sequence("xy\xc3\xbc")
    0
    >>> incomplete_utf8_sequence("\xc3")
    1
    >>> incomplete_utf8_sequence("\xbc\xc3")
    1
    >>> incomplete_utf8_sequence("xy\xbc\xc3")
    1
    >>> incomplete_utf8_sequence("xy\xe0\xa0")
    2
    >>> incomplete_utf8_sequence("xy\xf4")
    1
    >>> incomplete_utf8_sequence("xy\xf4\x8f")
    2
    >>> incomplete_utf8_sequence("xy\xf4\x8f\xa0")
    3
    """
    count = 0
    expect = None
    # The scan below gives up after four bytes, so only the last four bytes
    # are reversed rather than copying the entire (possibly large) input.
    for b in byteseq[-4:][::-1]:
        if not isinstance(b, int):
            # Length-1 string item; integer items need no conversion.
            b = ord(b)
        count += 1
        if b & 0x80 == 0x0:
            # 0xxxxxxx (single-byte character)
            expect = 1
            break
        elif b & 0xc0 == 0x80:
            # 10xxxxxx (subsequent byte)
            pass
        elif b & 0xe0 == 0xc0:
            # 110xxxxx (start of 2-byte sequence)
            expect = 2
            break
        elif b & 0xf0 == 0xe0:
            # 1110xxxx (start of 3-byte sequence)
            expect = 3
            break
        elif b & 0xf8 == 0xf0:
            # 11110xxx (start of 4-byte sequence)
            expect = 4
            break
        else:
            # Invalid byte
            return 0

        if count >= 4:
            # Seen too many "subsequent bytes", invalid
            return 0

    if expect is None:
        # We never saw a "first byte", invalid
        return 0

    # We now know expect and count
    if count >= expect:
        # Complete, or we saw an invalid sequence
        return 0

    # Incomplete: count bytes of an expect-byte character have arrived
    return count