4
4
"""Character encoding utilities"""
9
'escape_nonascii_uniquely',
10
from htmlentitydefs import codepoint2name
11
16
from cStringIO import StringIO
13
__all__ = ['guess', 'ascii_smash']
16
19
(codecs.BOM_UTF16_BE, 'utf_16_be'),
17
20
(codecs.BOM_UTF16_LE, 'utf_16_le'),
151
154
return unicode(s, 'ISO-8859-1', 'replace')
154
# def unicode_to_unaccented_str(text):
155
# """Converts a unicode string into an ascii-only str, converting accented
156
# characters to their plain equivalents.
158
# >>> unicode_to_unaccented_str(u'')
160
# >>> unicode_to_unaccented_str(u'foo bar 123')
162
# >>> unicode_to_unaccented_str(u'viva S\xe3o Carlos!')
165
# assert isinstance(text, unicode)
168
# charnum = ord(char)
169
# codepoint = codepoint2name.get(charnum)
170
# if codepoint is not None:
171
# strchar = codepoint[0]
174
# strchar = char.encode('ascii')
175
# except UnicodeEncodeError:
181
157
def ascii_smash(unicode_string):
182
158
"""Attempt to convert the Unicode string, possibly containing accents,
183
159
to an ASCII string.
370
346
if match is not None:
371
347
return match.group(1)
373
# Something we can"t represent. Return empty string.
349
# Something we can't represent. Return empty string.
353
def escape_nonascii_uniquely(bogus_string):
    """Replace non-ascii characters with a hex representation.

    This is mainly for preventing emails with invalid characters from causing
    oopses. The nonascii characters could have been removed or just converted
    to "?", but this provides some insight into what the bogus data was, and
    it prevents the message-id from two unrelated emails matching because
    all the nonascii characters have been replaced with the same ascii
    character.

    Every character in the range \\200-\\377 (0x80-0xff) is replaced by a
    backslash, an "x" and the character's lowercase hex value; all other
    characters are returned unchanged.

    Unfortunately, all the strings below are actually part of this
    function's docstring, so python processes the backslash once before
    doctest, and then python processes it again when doctest runs the
    test. This makes it confusing, since four backslashes will get
    converted into a single ascii character.

    >>> print len('\xa9'), len('\\xa9'), len('\\\\xa9')
    1 1 4
    >>> print escape_nonascii_uniquely('hello \xa9')
    hello \\xa9
    >>> print escape_nonascii_uniquely('hello \\xa9')
    hello \\xa9

    This string only has ascii characters, so escape_nonascii_uniquely()
    actually has no effect.

    >>> print escape_nonascii_uniquely('hello \\\\xa9')
    hello \\xa9
    """
    nonascii_regex = re.compile(r'[\200-\377]')

    # By encoding the invalid ascii with a backslash, x, and then the
    # hex value, it makes it easy to decode it by pasting into a python
    # interpreter. quopri() is not used, since that could cause the
    # decoding of an email to fail.
    def quote(match):
        # Called by sub() once per offending character; match.group(0) is
        # that single character.
        return '\\x%x' % ord(match.group(0))

    return nonascii_regex.sub(quote, bogus_string)