~launchpad-pqm/launchpad/devel

« back to all changes in this revision

Viewing changes to lib/canonical/encoding.py

Committer: Leonard Richardson
Date: 2010-08-25 14:42:52 UTC
mfrom: (11437 launchpad)
mto: (11441.1.1 yet-again-optimized-length)
mto: This revision was merged to the branch mainline in revision 11452.
Revision ID: leonard.richardson@canonical.com-20100825144252-kagpu1pcz8dzon8x

Merge with trunk.

files added:
lib/lp/archiveuploader/tests/data/ed_0.2-20_i386.changes.binary-only

lib/lp/archiveuploader/tests/data/ed_0.2-20_source.changes

lib/lp/bugs/javascript/bug_subscription_widget.js

lib/lp/bugs/javascript/tests/test_bug_subscription_widget.html

lib/lp/bugs/javascript/tests/test_bug_subscription_widget.js

lib/lp/bugs/mail/newbug.py

lib/lp/bugs/subscribers/bugtask.py

lib/lp/translations/browser/tests/test_potemplate_navigation.py

files removed:
lib/canonical/launchpad/emailtemplates/notify-unhandled-email.txt

lib/lp/buildmaster/model/buildbase.py

lib/lp/buildmaster/tests/test_buildbase.py

files modified:
bootstrap.py

lib/canonical/buildd/buildrecipe

lib/canonical/buildd/debian/changelog

lib/canonical/buildd/sbuild-package

lib/canonical/encoding.py

lib/canonical/launchpad/components/decoratedresultset.py

lib/canonical/launchpad/doc/canonical_url_examples.txt

lib/canonical/launchpad/icing/print.css

lib/canonical/launchpad/icing/shipit.css

lib/canonical/launchpad/icing/style-3-0.css.in

lib/canonical/launchpad/icing/yui_2.7.0b/build/calendar/assets/skins/sam/calendar-skin.css

lib/canonical/launchpad/mailnotification.py

lib/canonical/launchpad/pagetests/basics/notfound-traversals.txt

lib/canonical/launchpad/security.py

lib/canonical/launchpad/subscribers/karma.py

lib/canonical/launchpad/templates/oops-veryplain.pt

lib/canonical/launchpad/webapp/error.py

lib/canonical/launchpad/xmlrpc/mailinglist.py

lib/devscripts/ec2test/remote.py

lib/devscripts/ec2test/tests/test_remote.py

lib/lp/app/templates/root-index.pt

lib/lp/archivepublisher/utils.py

lib/lp/archiveuploader/dscfile.py

lib/lp/archiveuploader/nascentupload.py

lib/lp/archiveuploader/tests/data/ed_0.2-20_i386.changes.mismatched-arch-unsigned

lib/lp/archiveuploader/tests/data/suite/foo_1.0-2_multi_binary/foo_1.0-2_i386.changes

lib/lp/archiveuploader/tests/nascentupload-announcements.txt

lib/lp/archiveuploader/tests/nascentupload.txt

lib/lp/archiveuploader/tests/nascentuploadfile.txt

lib/lp/archiveuploader/tests/test_dscfile.py

lib/lp/archiveuploader/tests/test_nascentupload_documentation.py

lib/lp/archiveuploader/tests/test_nascentuploadfile.py

lib/lp/blueprints/templates/specificationbranch-status.pt

lib/lp/bugs/configure.zcml

lib/lp/bugs/doc/bugnotification-email.txt

lib/lp/bugs/doc/bugnotification-sending.txt

lib/lp/bugs/doc/bugsubscription.txt

lib/lp/bugs/doc/externalbugtracker-bugzilla.txt

lib/lp/bugs/externalbugtracker/bugzilla.py

lib/lp/bugs/scripts/bugnotification.py

lib/lp/bugs/subscribers/bug.py

lib/lp/bugs/subscribers/bugcreation.py

lib/lp/buildmaster/doc/builder.txt

lib/lp/buildmaster/interfaces/buildbase.py

lib/lp/buildmaster/model/builder.py

lib/lp/buildmaster/model/packagebuild.py

lib/lp/buildmaster/tests/test_builder.py

lib/lp/buildmaster/tests/test_packagebuild.py

lib/lp/code/browser/branchlisting.py

lib/lp/code/browser/branchvisibilitypolicy.py

lib/lp/code/browser/configure.zcml

lib/lp/code/browser/sourcepackagerecipe.py

lib/lp/code/browser/sourcepackagerecipelisting.py

lib/lp/code/browser/tests/test_branchlisting.py

lib/lp/code/browser/tests/test_branchmergeproposal.py

lib/lp/code/browser/tests/test_branchmergeproposallisting.py

lib/lp/code/browser/tests/test_product.py

lib/lp/code/model/sourcepackagerecipebuild.py

lib/lp/code/model/tests/test_sourcepackagerecipebuild.py

lib/lp/code/stories/branches/xx-branch-edit.txt

lib/lp/code/stories/branches/xx-branch-listings.txt

lib/lp/code/stories/branches/xx-branch-url-validation.txt

lib/lp/code/stories/branches/xx-creating-branches.txt

lib/lp/code/stories/branches/xx-source-package-branches-listing.txt

lib/lp/code/stories/branches/xx-subscribing-branches.txt

lib/lp/code/stories/sourcepackagerecipes/xx-recipe-listings.txt

lib/lp/code/templates/branch-summary-listing.pt

lib/lp/code/templates/distributionsourcepackage-branches-grouped.pt

lib/lp/hardwaredb/model/hwdb.py

lib/lp/registry/browser/distribution.py

lib/lp/registry/browser/product.py

lib/lp/registry/browser/productseries.py

lib/lp/registry/browser/sourcepackage.py

lib/lp/registry/browser/team.py

lib/lp/registry/browser/tests/productseries-views.txt

lib/lp/registry/doc/message-holds-xmlrpc.txt

lib/lp/registry/model/distroseries.py

lib/lp/registry/model/mailinglist.py

lib/lp/registry/model/product.py

lib/lp/registry/stories/productseries/xx-productseries-link-branch.txt

lib/lp/registry/templates/productseries-codesummary.pt

lib/lp/registry/tests/test_mailinglist.py

lib/lp/registry/tests/test_product.py

lib/lp/registry/vocabularies.py

lib/lp/soyuz/browser/tests/distroseriesqueue-views.txt

lib/lp/soyuz/doc/distroseriesqueue.txt

lib/lp/soyuz/doc/package-diff.txt

lib/lp/soyuz/model/binarypackagebuild.py

lib/lp/soyuz/scripts/initialise_distroseries.py

lib/lp/soyuz/scripts/tests/test_initialise_distroseries.py

lib/lp/soyuz/tests/test_binarypackagebuild.py

lib/lp/testing/__init__.py

lib/lp/testing/fakelibrarian.py

lib/lp/testing/matchers.py

lib/lp/testing/menu.py

lib/lp/testing/tests/test_fakelibrarian.py

lib/lp/testing/views.py

lib/lp/translations/browser/potemplate.py

lib/lp/translations/browser/tests/potemplate-views.txt

lib/lp/translations/interfaces/potemplate.py

lib/lp/translations/model/potemplate.py

lib/lp/translations/utilities/tests/test_file_importer.py

utilities/apidoc-index.pt

versions.cfg

Show diffs side-by-side

added added

removed removed

lib/canonical/encoding.py

"""Character encoding utilities"""

__metaclass__ = type

__all__ = [

'ascii_smash',

'escape_nonascii_uniquely',

'guess',

]

import re

import codecs

import unicodedata

from htmlentitydefs import codepoint2name

from cStringIO import StringIO

__all__ = ['guess', 'ascii_smash']

_boms = [

(codecs.BOM_UTF16_BE, 'utf_16_be'),

(codecs.BOM_UTF16_LE, 'utf_16_le'),

151

154

return unicode(s, 'ISO-8859-1', 'replace')

152

155

153

156

154

# def unicode_to_unaccented_str(text):

155

# """Converts a unicode string into an ascii-only str, converting accented

156

# characters to their plain equivalents.

157

158

# >>> unicode_to_unaccented_str(u'')

159

# ''

160

# >>> unicode_to_unaccented_str(u'foo bar 123')

161

# 'foo bar 123'

162

# >>> unicode_to_unaccented_str(u'viva S\xe3o Carlos!')

163

# 'viva Sao Carlos!'

164

# """

165

# assert isinstance(text, unicode)

166

# L = []

167

# for char in text:

168

# charnum = ord(char)

169

# codepoint = codepoint2name.get(charnum)

170

# if codepoint is not None:

171

# strchar = codepoint[0]

172

# else:

173

# try:

174

# strchar = char.encode('ascii')

175

# except UnicodeEncodeError:

176

# strchar = ''

177

# L.append(strchar)

178

# return ''.join(L)

179

180

181

157

def ascii_smash(unicode_string):

182

158

"""Attempt to convert the Unicode string, possibly containing accents,

183

159

to an ASCII string.

370

346

if match is not None:

371

347

return match.group(1)

372

348

373

# Something we can"t represent. Return empty string.

349

# Something we can't represent. Return empty string.

374

350

return ""

375

351

352

353

# Matches any single character in the range 0x80-0xFF (octal \200-\377),
# i.e. everything a byte string can hold that is outside 7-bit ASCII.
# Compiled once at import time rather than on every call.
_nonascii_regex = re.compile(r'[\200-\377]')


def escape_nonascii_uniquely(bogus_string):
    """Replace non-ascii characters with a hex representation.

    This is mainly for preventing emails with invalid characters from causing
    oopses. The nonascii characters could have been removed or just converted
    to "?", but this provides some insight into what the bogus data was, and
    it prevents the message-id from two unrelated emails matching because
    all the nonascii characters have been replaced with the same ascii
    character.

    Every character in the range 0x80-0xFF is replaced by a backslash, the
    letter "x" and the character's value in lowercase hex. Characters
    outside that range are left untouched, so a string that is already
    pure ascii is returned unchanged.

    :param bogus_string: The string to sanitise (presumably a byte string
        taken from an email header -- confirm against callers).
    :return: A copy of `bogus_string` with each matched character escaped.

    Unfortunately, all the strings below are actually part of this
    function's docstring, so python processes the backslash once before
    doctest, and then python processes it again when doctest runs the
    test. This makes it confusing, since four backslashes will get
    converted into a single ascii character.

    >>> print len('\xa9'), len('\\xa9'), len('\\\\xa9')
    1 1 4
    >>> print escape_nonascii_uniquely('hello \xa9')
    hello \\xa9
    >>> print escape_nonascii_uniquely('hello \\xa9')
    hello \\xa9

    This string only has ascii characters, so escape_nonascii_uniquely()
    actually has no effect.

    >>> print escape_nonascii_uniquely('hello \\\\xa9')
    hello \\xa9
    """
    # By encoding the invalid ascii with a backslash, x, and then the
    # hex value, it makes it easy to decode it by pasting into a python
    # interpreter. quopri() is not used, since that could cause the
    # decoding of an email to fail.
    def quote(match):
        return '\\x%x' % ord(match.group(0))

    return _nonascii_regex.sub(quote, bogus_string)

Older »