~azzar1/unity/add-show-desktop-key

« back to all changes in this revision

Viewing changes to scripts/python-console

  • Committer: wagrant
  • Date: 2008-08-20 00:04:12 UTC
  • Revision ID: svn-v3-trunk0:2b9c9e99-6f39-0410-b283-7f802c844ae2:trunk:1033
python-console: Move incomplete_utf8_sequence() into common.util, where it
                probably belongs.

Show diffs side-by-side

added added

removed removed

Lines of Context:
67
67
        '''Trim an incomplete UTF-8 character from the end of a string.
68
68
           Returns (trimmed_string, count_of_trimmed_bytes).
69
69
        '''
70
 
        tokill = incomplete_utf8_sequence(stuff)
 
70
        tokill = common.util.incomplete_utf8_sequence(stuff)
71
71
        if tokill == 0:
72
72
            return (stuff, tokill)
73
73
        else:
367
367
    list = list + traceback.format_exception_only(etype, value)
368
368
    return ''.join(list)
369
369
 
370
 
def incomplete_utf8_sequence(byteseq):
    r"""
    str -> int
    Given a UTF-8-encoded byte sequence, returns the number of bytes at
    the end of the sequence which comprise an incomplete UTF-8 character
    sequence.

    byteseq may be a byte string whose items are 1-character strings, or a
    bytes/bytearray object whose items are integers; both are handled.

    If the sequence is empty or ends with a complete character OR INVALID
    sequence, returns 0.
    Otherwise, returns 1-3 indicating the number of bytes in the final
    incomplete (but valid) character sequence.

    Does not check any bytes before the final sequence for correctness.
    Only the lead-byte bit pattern is inspected; overlong or out-of-range
    encodings are not detected.

    >>> incomplete_utf8_sequence("")
    0
    >>> incomplete_utf8_sequence("xy")
    0
    >>> incomplete_utf8_sequence("xy\xc3\xbc")
    0
    >>> incomplete_utf8_sequence("\xc3")
    1
    >>> incomplete_utf8_sequence("\xbc\xc3")
    1
    >>> incomplete_utf8_sequence("xy\xbc\xc3")
    1
    >>> incomplete_utf8_sequence("xy\xe0\xa0")
    2
    >>> incomplete_utf8_sequence("xy\xf4")
    1
    >>> incomplete_utf8_sequence("xy\xf4\x8f")
    2
    >>> incomplete_utf8_sequence("xy\xf4\x8f\xa0")
    3
    """
    count = 0
    # A UTF-8 character is at most 4 bytes long, so only the last 4 bytes
    # can matter. Slicing first avoids reversing (copying) the whole input.
    for b in reversed(byteseq[-4:]):
        if not isinstance(b, int):
            # Item is a 1-character string; convert it to its byte value.
            b = ord(b)
        count += 1
        if b & 0x80 == 0x00:
            # 0xxxxxxx (single-byte character): the sequence ends with a
            # complete character, or with stray continuation bytes after
            # one; either way nothing is pending.
            return 0
        elif b & 0xc0 == 0x80:
            # 10xxxxxx (subsequent byte): keep scanning for the lead byte.
            continue
        elif b & 0xe0 == 0xc0:
            # 110xxxxx (start of 2-byte sequence)
            expect = 2
        elif b & 0xf0 == 0xe0:
            # 1110xxxx (start of 3-byte sequence)
            expect = 3
        elif b & 0xf8 == 0xf0:
            # 11110xxx (start of 4-byte sequence)
            expect = 4
        else:
            # Invalid byte
            return 0

        # Found the lead byte. If fewer bytes are present than it promises,
        # the character is incomplete; otherwise it is complete (or has too
        # many continuation bytes, which is invalid).
        if count < expect:
            return count
        return 0

    # Either the input was empty, or we saw only "subsequent bytes" (up to
    # four of them) and never a "first byte": nothing valid is pending.
    return 0
448
370
 
449
371
# Takes an object and returns a flattened version suitable for JSON
450
372
def flatten(object):