214
214
# produced by json.js module when it encodes date objects.
215
215
return datetime.datetime.strptime(str, "%Y-%m-%dT%H:%M:%SZ")
217
def incomplete_utf8_sequence(byteseq):
220
Given a UTF-8-encoded byte sequence (str), returns the number of bytes at
221
the end of the string which comprise an incomplete UTF-8 character
224
If the string is empty or ends with a complete character OR INVALID
226
Otherwise, returns 1-3 indicating the number of bytes in the final
227
incomplete (but valid) character sequence.
229
Does not check any bytes before the final sequence for correctness.
231
>>> incomplete_utf8_sequence("")
233
>>> incomplete_utf8_sequence("xy")
235
>>> incomplete_utf8_sequence("xy\xc3\xbc")
237
>>> incomplete_utf8_sequence("\xc3")
239
>>> incomplete_utf8_sequence("\xbc\xc3")
241
>>> incomplete_utf8_sequence("xy\xbc\xc3")
243
>>> incomplete_utf8_sequence("xy\xe0\xa0")
245
>>> incomplete_utf8_sequence("xy\xf4")
247
>>> incomplete_utf8_sequence("xy\xf4\x8f")
249
>>> incomplete_utf8_sequence("xy\xf4\x8f\xa0")
254
for b in byteseq[::-1]:
258
# 0xxxxxxx (single-byte character)
261
elif b & 0xc0 == 0x80:
262
# 10xxxxxx (subsequent byte)
264
elif b & 0xe0 == 0xc0:
265
# 110xxxxx (start of 2-byte sequence)
268
elif b & 0xf0 == 0xe0:
269
# 1110xxxx (start of 3-byte sequence)
272
elif b & 0xf8 == 0xf0:
273
# 11110xxx (start of 4-byte sequence)
281
# Seen too many "subsequent bytes", invalid
285
# We never saw a "first byte", invalid
288
# We now know expect and count
290
# Complete, or we saw an invalid sequence