~launchpad-pqm/launchpad/devel

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# Copyright 2009 Canonical Ltd.  This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).

"""Translation-related formatting functions."""

# Python 2 idiom: classes defined in this module without an explicit base
# are created with `type` as their metaclass, i.e. as new-style classes.
__metaclass__ = type

# Names exported by a star-import of this module.
# NOTE(review): UnrecognisedCFormatString is not listed although
# parse_cformat_string raises it -- confirm that this is intended.
__all__ = [
    'contract_rosetta_escapes',
    'convert_newlines_to_web_form',
    'count_lines',
    'expand_rosetta_escapes',
    'parse_cformat_string',
    'text_to_html',
    ]

from math import ceil
import re
from xml.sax.saxutils import escape as xml_escape

from canonical.launchpad import helpers
from lp.translations.interfaces.translations import TranslationConstants


class UnrecognisedCFormatString(ValueError):
    """Raised when a C-style format string cannot be parsed.

    This is a `ValueError` subclass, so handlers for that broader type
    catch it as well.
    """


def contract_rosetta_escapes(text):
    """Turn Rosetta escape sequences in `text` back into real characters.

    The bracketed markers ``[tab]``, ``[nbsp]`` and ``[nnbsp]`` are mapped
    to a tab, a no-break space (U+00A0) and a narrow no-break space
    (U+202F).  A marker preceded by a backslash is mapped to the bare
    marker text instead.

    :param text: the text containing Rosetta escape sequences.
    :return: the result of `helpers.text_replaced` for that mapping.
    """
    # Every marker is listed right before its backslash-protected form.
    replacements = {
        '[tab]': '\t',
        r'\[tab]': '[tab]',
        '[nbsp]': u'\u00a0',
        r'\[nbsp]': '[nbsp]',
        '[nnbsp]': u'\u202f',
        r'\[nnbsp]': '[nnbsp]',
    }
    return helpers.text_replaced(text, replacements)


def expand_rosetta_escapes(unicode_text):
    """Replace characters that need a Rosetta escape sequence.

    Tab, no-break space (U+00A0) and narrow no-break space (U+202F) are
    mapped to the corresponding `TranslationConstants` ``*_CHAR`` values.
    The literal texts ``[tab]``, ``[nbsp]`` and ``[nnbsp]`` are mapped to
    the matching ``*_CHAR_ESCAPED`` values.

    :param unicode_text: the text to process.
    :return: the result of `helpers.text_replaced` for that mapping.
    """
    constants = TranslationConstants
    replacements = {
        u'\t': constants.TAB_CHAR,
        u'[tab]': constants.TAB_CHAR_ESCAPED,
        u'\u00a0': constants.NO_BREAK_SPACE_CHAR,
        u'[nbsp]': constants.NO_BREAK_SPACE_CHAR_ESCAPED,
        u'\u202f': constants.NARROW_NO_BREAK_SPACE_CHAR,
        u'[nnbsp]': constants.NARROW_NO_BREAK_SPACE_CHAR_ESCAPED,
    }
    return helpers.text_replaced(unicode_text, replacements)


def text_to_html(text, flags, space=TranslationConstants.SPACE_CHAR,
               newline=TranslationConstants.NEWLINE_CHAR):
    """Render a unicode text as HTML markup.

    The text is XML-escaped and split into lines.  On every line each
    leading and each trailing space is replaced by `space`, and the lines
    are joined again with `newline`.  When 'c-format' is among `flags`,
    printf()-style interpolations are wrapped in <code> tags.  The joined
    text is finally passed through `expand_rosetta_escapes`.

    :param text: the text to convert, or None.
    :param flags: container of flag names; only 'c-format' is looked for.
    :param space: markup substituted for each leading or trailing space.
    :param newline: markup used to join the lines.
    :return: the HTML representation, or None if `text` is None.
    """
    if text is None:
        return None

    # One separator is used for the whole text: CR/LF takes precedence
    # over a lone CR, which takes precedence over LF.
    separator = u'\n'
    for candidate in (u'\r\n', u'\r'):
        if candidate in text:
            separator = candidate
            break

    # Pattern:
    # - group 1: leading spaces (possibly none)
    # - group 2: the longest middle part that neither begins nor ends
    #   with a space
    # - group 3: trailing spaces (possibly none)
    edge_spaces = re.compile(u'^( *)((?: *[^ ]+)*)( *)$')

    lines = []
    for raw_line in xml_escape(text).split(separator):
        match = edge_spaces.match(raw_line)
        if match is None:
            raise AssertionError(
                "A regular expression that should always match didn't.")
        leading, body, trailing = match.groups()
        lines.append(space * len(leading) + body + space * len(trailing))

    if 'c-format' in flags:
        # Mark up the c-format sequences line by line.
        for index, line in enumerate(lines):
            try:
                segments = parse_cformat_string(line)
            except UnrecognisedCFormatString:
                # A line that fails to parse is left unformatted.
                continue

            pieces = []
            for kind, content in segments:
                if kind == 'interpolation':
                    pieces.append(u'<code>%s</code>' % content)
                elif kind == 'string':
                    pieces.append(content)

            lines[index] = ''.join(pieces)

    return expand_rosetta_escapes(newline.join(lines))


def convert_newlines_to_web_form(unicode_text):
    """Convert Unicode string to CR/LF line endings as used in web forms.

    Any style of line endings is accepted: MacOS-style CR, MS-DOS-style
    CR/LF, or rest-of-world-style LF.

    Only one style is handled per call, checked in this order: a text
    containing CR/LF is returned untouched; otherwise, if it contains LF,
    LF is mapped to CR/LF; otherwise CR is mapped to CR/LF.  A text that
    mixes styles is therefore not fully normalised.

    :param unicode_text: a unicode string, or None.
    :return: None if `unicode_text` is None, the text itself if it already
        contains CR/LF, else the result of `helpers.text_replaced`.
    :raises AssertionError: if the text is neither None nor unicode.
    """
    if unicode_text is None:
        return None

    assert isinstance(unicode_text, unicode), (
        "The given text must be unicode instead of %s" % type(unicode_text))

    if u'\r\n' in unicode_text:
        # The text is already using the windows newline chars
        return unicode_text
    if u'\n' in unicode_text:
        return helpers.text_replaced(unicode_text, {u'\n': u'\r\n'})
    # No LF anywhere: any line breaks present must be lone CRs.
    return helpers.text_replaced(unicode_text, {u'\r': u'\r\n'})


def count_lines(text):
    """Estimate how many display lines a text occupies.

    The text is split on LF.  Each piece counts as one line per started
    group of 60 characters, and an empty piece still counts as one line.
    The result is therefore never smaller than the number of LF-separated
    lines in the text.

    :param text: the text to measure, or None.
    :return: the line count as an int; 0 if `text` is None.
    """
    if text is None:
        return 0

    CHARACTERS_PER_LINE = 60

    # max() gives an empty line a height of one; any non-empty line already
    # rounds up to at least one.
    return sum(
        max(1, int(ceil(len(line) / float(CHARACTERS_PER_LINE))))
        for line in text.split(u'\n'))


def parse_cformat_string(string):
    """Split a C-style format string into plain and interpolated segments.

    The result is a list of ``(type, content)`` tuples.  ``type`` is
    "string" for a piece of plain text and "interpolation" for a
    printf()-style substitution such as ``%s`` or ``%.3d``.  ``content`` is
    the exact part of the input that makes up the segment, so joining all
    contents gives back the input.

    As in printf(), a doubled percent sign (%%) is treated as plain text.

    :param string: the format string to parse.
    :return: list of (type, content) tuples; empty for an empty input.
    :raises UnrecognisedCFormatString: if some part of the input matches
        neither the plain-text nor the interpolation pattern.
    """
    segments = []
    if not string:
        return segments

    # Plain text: anything without a percent sign, plus the '%%' pair.
    plain_re = re.compile('(%%|[^%])+')
    # Interpolation: a percent sign up to the first conversion character.
    interpolation_re = re.compile('%[^diouxXeEfFgGcspmn]*[diouxXeEfFgGcspmn]')
    # Tried in this order at every position.
    matchers = (
        ('string', plain_re),
        ('interpolation', interpolation_re),
        )

    position = 0
    length = len(string)
    while position < length:
        for kind, pattern in matchers:
            match = pattern.match(string, position)
            if match:
                break
        else:
            # Neither pattern applies here.  Give up.
            raise UnrecognisedCFormatString(string)

        segments.append((kind, match.group(0)))
        position = match.end()

    return segments