67
67
'''Trim an incomplete UTF-8 character from the end of a string.
68
68
Returns (trimmed_string, count_of_trimmed_bytes).
70
tokill = incomplete_utf8_sequence(stuff)
70
tokill = common.util.incomplete_utf8_sequence(stuff)
72
72
return (stuff, tokill)
367
367
list = list + traceback.format_exception_only(etype, value)
368
368
return ''.join(list)
370
def incomplete_utf8_sequence(byteseq):
373
Given a UTF-8-encoded byte sequence (str), returns the number of bytes at
374
the end of the string which comprise an incomplete UTF-8 character
377
If the string is empty or ends with a complete character OR INVALID
379
Otherwise, returns 1-3 indicating the number of bytes in the final
380
incomplete (but valid) character sequence.
382
Does not check any bytes before the final sequence for correctness.
384
>>> incomplete_utf8_sequence("")
386
>>> incomplete_utf8_sequence("xy")
388
>>> incomplete_utf8_sequence("xy\xc3\xbc")
390
>>> incomplete_utf8_sequence("\xc3")
392
>>> incomplete_utf8_sequence("\xbc\xc3")
394
>>> incomplete_utf8_sequence("xy\xbc\xc3")
396
>>> incomplete_utf8_sequence("xy\xe0\xa0")
398
>>> incomplete_utf8_sequence("xy\xf4")
400
>>> incomplete_utf8_sequence("xy\xf4\x8f")
402
>>> incomplete_utf8_sequence("xy\xf4\x8f\xa0")
407
for b in byteseq[::-1]:
411
# 0xxxxxxx (single-byte character)
414
elif b & 0xc0 == 0x80:
415
# 10xxxxxx (subsequent byte)
417
elif b & 0xe0 == 0xc0:
418
# 110xxxxx (start of 2-byte sequence)
421
elif b & 0xf0 == 0xe0:
422
# 1110xxxx (start of 3-byte sequence)
425
elif b & 0xf8 == 0xf0:
426
# 11110xxx (start of 4-byte sequence)
434
# Seen too many "subsequent bytes", invalid
438
# We never saw a "first byte", invalid
441
# We now know expect and count
443
# Complete, or we saw an invalid sequence
449
371
# Takes an object and returns a flattened version suitable for JSON
450
372
def flatten(object):