~launchpad-pqm/launchpad/devel

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# Copyright 2009-2010 Canonical Ltd.  This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).

# Python 2 idiom: classes defined in this module without an explicit base
# become new-style classes.
__metaclass__ = type
# Names exported by `from <this module> import *`.
__all__ = [
    'MixedNewlineMarkersError',
    'sanitize_translations_from_webui',
    'sanitize_translations_from_import',
    ]


class MixedNewlineMarkersError(ValueError):
    """A text combines more than one newline convention.

    The sanitization code raises this when a msgid or msgstr contains
    newline markers of different styles (windows '\\r\\n', mac '\\r',
    unix '\\n') at the same time.
    """


class Sanitizer(object):
    """Sanitize translation texts against one English singular msgid.

    An instance is built from the English singular string.  `sanitize`
    then adjusts a translation so that its dot characters, newline style
    and leading/trailing whitespace agree with that English string.
    """

    # There are three different kinds of newlines:
    windows_style = u'\r\n'
    mac_style = u'\r'
    unix_style = u'\n'
    # Sentinel returned by _getNewlineStyle for a text that uses more than
    # one newline style.
    mixed_style = object()

    # The visual marker that stands in for a normal space in the web
    # interface; users may copy & paste it into their translations.
    dot_char = u'\u2022'

    def __init__(self, english_singular):
        """Extract information from the English singular.

        :param english_singular: The text of the singular MsgId that the
            translations to sanitize are for.
        """
        # Does the dot character appear in the English singular?
        self.has_dots = self.dot_char in english_singular
        # Find out if there is leading or trailing whitespace in the English
        # singular.
        stripped_singular_text = english_singular.strip()
        self.is_empty_stripped = stripped_singular_text == ""
        if len(stripped_singular_text) != len(english_singular):
            # There is whitespace that we should copy to the 'text'
            # after stripping it.
            # Note: for a whitespace-only singular, lstrip() is empty, so
            # the slice below is [:-0] == [:0] and the prefix is empty; all
            # of the whitespace then ends up in the postfix.
            self.prefix = english_singular[:-len(english_singular.lstrip())]
            self.postfix = english_singular[len(english_singular.rstrip()):]
        else:
            self.prefix = ''
            self.postfix = ''
        # Get the newline style that is used in the English Singular.
        self.newline_style = self._getNewlineStyle(english_singular)

    @classmethod
    def _getNewlineStyle(cls, text):
        """Find out which newline style is used in text.

        :param text: The text to inspect.
        :return: One of `windows_style`, `mac_style` or `unix_style` if
            exactly one style is used, `mixed_style` if more than one is
            used, or None if the text contains no newlines.
        """
        style = None
        # To avoid confusing the single-character newline styles for mac and
        # unix with the two-character windows one, remove the windows-style
        # newlines from the text and use that text to search for the other
        # two.
        stripped_text = text.replace(cls.windows_style, u'')
        if text != stripped_text:
            # Text contains windows style new lines.
            style = cls.windows_style

        for one_char_style in (cls.mac_style, cls.unix_style):
            if one_char_style in stripped_text:
                if style is not None:
                    # A second, different style was found.
                    return cls.mixed_style
                style = one_char_style

        return style

    def sanitize(self, translation_text):
        """Return 'translation_text' or None after doing some sanitization.

        The text is sanitized through the following filters, in this order:

          self.convertDotToSpace
          self.normalizeNewlines
          self.normalizeWhitespace

        If the resulting string after these operations is an empty string,
        it returns None.

        :param translation_text: A unicode text that needs to be sanitized,
            or None (in which case None is returned).
        :raises MixedNewlineMarkersError: if the English singular contains
            newlines and the translation mixes different newline styles.
        """
        if translation_text is None:
            return None

        # Fix the visual point that users copy & paste from the web interface.
        new_text = self.convertDotToSpace(translation_text)
        # Now, fix the newline chars.
        new_text = self.normalizeNewlines(new_text)
        # Finally, set the same whitespace at the start/end of the string.
        new_text = self.normalizeWhitespace(new_text)
        # Also, if it's an empty string, replace it with None.
        if new_text == '':
            new_text = None

        return new_text

    def convertDotToSpace(self, translation_text):
        """Return 'translation_text' with the 'dot' char exchanged with a
        normal space.

        If the English singular contains that character, 'translation_text'
        is returned without changes as it's a valid char instead of our way
        to represent a normal space to the user.
        """
        if self.has_dots or self.dot_char not in translation_text:
            return translation_text

        # Replace the same character that the checks above look for, so
        # that an overridden `dot_char` is honoured consistently.
        return translation_text.replace(self.dot_char, ' ')

    def normalizeWhitespace(self, translation_text):
        """Return 'translation_text' with the same trailing and leading
        whitespace that the English singular has.

        If 'translation_text' has only whitespace but the English singular
        has other characters, the empty string is returned to note it as an
        untranslated string.  None is passed through unchanged.
        """
        if translation_text is None:
            return None

        stripped_translation_text = translation_text.strip()

        if not self.is_empty_stripped and len(stripped_translation_text) == 0:
            return ''

        return '%s%s%s' % (
            self.prefix, stripped_translation_text, self.postfix)

    def normalizeNewlines(self, translation_text):
        """Return 'translation_text' with newlines sync with the English
        singular.

        If the English singular has no newlines, or the translation has
        none, the text is returned unchanged.

        :raises MixedNewlineMarkersError: if the English singular contains
            newlines and the translation mixes different newline styles.
        """
        if self.newline_style is None:
            # No newlines in the English singular, so we have nothing to do.
            return translation_text

        # Get the style that is used in the given text.
        translation_newline_style = self._getNewlineStyle(translation_text)

        if translation_newline_style is self.mixed_style:
            # The translation has mixed newlines in it; that is not allowed.
            raise MixedNewlineMarkersError(
                "Translations text (%r) mixes different newline markers." %
                    translation_text)

        if translation_newline_style is None:
            # The translation text doesn't contain any newlines, so there is
            # nothing for us to do.
            return translation_text

        if self.newline_style is self.mixed_style:
            # The original has mixed newlines (some very old data are like
            # this, new data with mixed newlines are rejected), so we're just
            # going to punt and normalize to unix style.
            return translation_text.replace(
                translation_newline_style, self.unix_style)
        else:
            # Otherwise the translation text should be normalized to use the
            # same newline style as the original.
            return translation_text.replace(
                translation_newline_style, self.newline_style)


def sanitize_translations(
        english_singular, translations, pluralforms):
    """Return a dict of sanitized `translations` keyed by plural form.

    Each given translation is passed through a `Sanitizer` built from
    `english_singular`.  Expected plural forms that are missing from
    `translations` are added with the value None.  Translations for
    plural forms beyond the expected number are sanitized and kept.

    :param english_singular: The text of the singular MsgId that these
        translations are for.
    :param translations: A dictionary of plural forms, with the
        integer plural form number as the key and the translation as the
        value.  A list or tuple is also accepted and is keyed by position.
    :param pluralforms: The number of expected pluralforms; None is
        treated as 2.
    :return: A new dictionary mapping plural form number to the sanitized
        translation, or to None.
    """
    # Lists and tuples are keyed by index so they can be handled as dicts.
    if isinstance(translations, (list, tuple)):
        translations = dict(enumerate(translations))

    cleaner = Sanitizer(english_singular)
    # Plural forms beyond the expected count are kept too: the plural form
    # data may be incorrect, and keeping them lets us reactivate them once
    # the plural information for the language is fixed.
    result = dict(
        (form, cleaner.sanitize(text))
        for form, text in translations.items())

    # Every expected plural form must be present; absent ones become None.
    expected_forms = pluralforms
    if expected_forms is None:
        expected_forms = 2
    for form in range(expected_forms):
        result.setdefault(form, None)

    return result


def sanitize_translations_from_import(
        english_singular, translations, pluralforms):
    """Sanitize `translations`, rejecting an English text with mixed newlines.

    Takes the same arguments and returns the same result as
    `sanitize_translations`.

    :raises MixedNewlineMarkersError: if `english_singular` mixes different
        newline styles.
    """
    # At import time the English singular itself must use one newline style.
    original_style = Sanitizer._getNewlineStyle(english_singular)
    if original_style is not Sanitizer.mixed_style:
        return sanitize_translations(
            english_singular, translations, pluralforms)
    raise MixedNewlineMarkersError(
        "Original text (%r) mixes different newline markers." %
            english_singular)


def sanitize_translations_from_webui(
        english_singular, translations, pluralforms):
    """Sanitize `translations` by delegating to `sanitize_translations`.

    The arguments are passed through unchanged and no additional check is
    made on `english_singular` here.
    """
    return sanitize_translations(english_singular, translations, pluralforms)