~launchpad-pqm/launchpad/devel

1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup parses a (possibly invalid) XML or HTML document into a
tree representation. It provides methods and Pythonic idioms that make
it easy to navigate, search, and modify the tree.

A well-structured XML/HTML document yields a well-behaved data
structure. An ill-structured XML/HTML document yields a
correspondingly ill-behaved data structure. If your document is only
locally well-structured, you can use this library to find and process
the well-structured part of it.

Beautiful Soup works with Python 2.2 and up. It has no external
dependencies, but you'll have more success at converting data to UTF-8
if you also install these three packages:

* chardet, for auto-detecting character encodings
  http://chardet.feedparser.org/
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
  by stock Python.
  http://cjkpython.i18n.org/

Beautiful Soup defines classes for two main parsing strategies:

 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
   language that kind of looks like XML.

 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
   or invalid. This class has web browser-like heuristics for
   obtaining a sensible parse tree in the face of common HTML errors.

Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
the encoding of an HTML or XML document, and converting it to
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed
Parser.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/documentation.html
"""
44
from __future__ import generators
45
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
46
__author__ = "Leonard Richardson (crummy.com)"
47
__contributors__ = ["Sam Ruby (intertwingly.net)",
48
                    "the unwitting Mark Pilgrim (diveintomark.org)",
49
                    "http://www.crummy.com/software/BeautifulSoup/AUTHORS.html"]
50
__version__ = "3.0.3"
51
__copyright__ = "Copyright (c) 2004-2006 Leonard Richardson"
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
52
__license__ = "PSF"
53
54
from sgmllib import SGMLParser, SGMLParseError
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
55
import codecs
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
56
import types
57
import re
58
import sgmllib
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
59
from htmlentitydefs import name2codepoint
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
60
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
61
# This RE makes Beautiful Soup able to parse XML with namespaces:
# it replaces sgmllib's tag-name pattern with one that also accepts
# ':' (and '-', '_', '.') inside a tag name.
sgmllib.tagfind = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*')

# This RE makes Beautiful Soup capable of recognizing numeric character
# references that use hexadecimal (&#x41;) as well as decimal (&#65;).
# Raw strings keep the backslash escapes for the regex engine.
sgmllib.charref = re.compile(r'&#(\d+|x[0-9a-fA-F]+);')

# Default 'encoding' argument of the __str__/__repr__ methods below.
DEFAULT_OUTPUT_ENCODING = "utf-8"
69
70
# First, the classes that represent markup elements.
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
71
72
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element is a member of two doubly-linked chains:

     * previous / next follow the order in which elements were parsed
       (document order), descending into children.
     * previousSibling / nextSibling link the children of one parent.

    The tree-editing methods (replaceWith, extract, insert) keep both
    chains consistent. They locate elements by identity, never by ==,
    because == may be overridden by subclasses to mean structural
    equality."""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        parent -- the element that will contain this one, or None.
        previous -- the element parsed immediately before this one.

        If the parent already has children, this element is linked as
        the next sibling of the last one. It is NOT appended to
        parent.contents; that is left to the caller."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _childPosition(self, child):
        """Returns the index of 'child' in self.contents, or None if
        it is not there. The child is matched by identity so that a
        different but equal-comparing sibling is never picked."""
        position = 0
        for candidate in self.contents:
            if candidate is child:
                return position
            position = position + 1
        return None

    def replaceWith(self, replaceWith):
        """Puts 'replaceWith' where this element is in its parent's
        list of children, and extracts this element from the tree.

        Raises ValueError if this element is not among its parent's
        contents."""
        oldParent = self.parent
        myIndex = oldParent._childPosition(self)
        if myIndex is None:
            raise ValueError("element is not in its parent's contents")
        self.extract()
        # insert() already compensates when the replacement is an
        # earlier sibling that has to be moved, so the index is passed
        # through unchanged.
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            position = self.parent._childPosition(self)
            # An element missing from its parent's contents is
            # tolerated, as it always has been.
            if position is not None:
                del self.parent.contents[position]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Makes 'newChild' the child at index 'position' of this
        element, fixing up all navigation links.

        position -- target index; values past the end mean "append".
        newChild -- an element, or a plain string which is wrapped in
                    a NavigableString. An element that already has a
                    parent is extracted from it first.

        When 'newChild' is already a child of this element, 'position'
        is read against the list as it is before the move: the child
        ends up just before the element currently at 'position'."""
        # Anything that is already a PageElement needs no wrapping.
        if (not isinstance(newChild, PageElement)
            and isinstance(newChild, basestring)):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        # getattr with a default: a freshly created NavigableString
        # has no 'parent' attribute until it is inserted.
        if getattr(newChild, 'parent', None) is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self._childPosition(newChild)
                if index is not None and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The element following the new child in document order
            # is the next sibling of the nearest ancestor that has
            # one; None means this is the last element in the document.
            parent = self
            parentsNextSibling = None
            while parentsNextSibling is None and parent is not None:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling is not None:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    # NOTE: the shared 'attrs={}' defaults below are only passed on to
    # the search machinery by this class; they must never be mutated.

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None if there is no such parent."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given find-all style 'method' with a limit of 1
        and returns its single result, or None if nothing matched."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        'name' may be a ready-made SoupStrainer; otherwise one is
        built from name, attrs, text and kwargs. Collection stops
        once 'limit' matches (if given) have been found."""
        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The generators finish with a None; skip falsy items.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #
    #Each one yields the elements reached by repeatedly following one
    #link, and then yields a final None when the chain runs out. They
    #test for None explicitly so that a falsy element (an empty
    #string) does not end the walk early.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Returns 'str' with every %SOUP-ENCODING% placeholder
        replaced by the encoding name ("utf-8" if none is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to
        Unicode if no encoding is given. Objects that are not strings
        are converted with str() or unicode() first."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s  = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
354
355
class NavigableString(unicode, PageElement):
    """A piece of text in the parse tree: a unicode string that also
    has PageElement's navigation and search methods."""

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        Any other missing attribute raises AttributeError."""
        if attr == 'string':
            return self
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, attr))

    def __unicode__(self):
        """Returns the Unicode rendering of this string, i.e. what
        __str__ produces when no output encoding is requested."""
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns this string encoded in 'encoding', or the Unicode
        string itself if 'encoding' is None (or otherwise false)."""
        if encoding:
            return self.encode(encoding)
        else:
            return self
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
374
        
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
375
class CData(NavigableString):
    """A string that is rendered inside a CDATA section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, rendered by NavigableString.__str__ with
        the given encoding, wrapped in <![CDATA[ ... ]]>."""
        text = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % text
379
380
class ProcessingInstruction(NavigableString):
    """A string that is rendered as a processing instruction."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text wrapped in <? ... ?>. A %SOUP-ENCODING%
        placeholder in the text is first replaced with the name of
        the output encoding."""
        if "%SOUP-ENCODING%" in self:
            body = self.substituteEncoding(self, encoding)
        else:
            body = self
        rendered = self.toEncoding(body, encoding)
        return "<?%s?>" % rendered
386
387
class Comment(NavigableString):
    """A string that is rendered as a markup comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, rendered by NavigableString.__str__ with
        the given encoding, wrapped in <!-- ... -->."""
        text = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % text
390
391
class Declaration(NavigableString):
    """A string that is rendered as a markup declaration."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, rendered by NavigableString.__str__ with
        the given encoding, wrapped in <! ... >."""
        text = NavigableString.__str__(self, encoding)
        return "<!%s>" % text
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
394
395
class Tag(PageElement):
396
    """Represents a found HTML tag with its attributes and contents."""
397
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
398
    # Maps the bare name of an XML entity (no '&' or ';') to the
    # character it stands for.
    XML_ENTITIES_TO_CHARS = { 'apos' : "'",
                              "quot" : '"',
                              "amp" : "&",
                              "lt" : "<",
                              "gt" : ">"
                              }
    # An RE for finding ampersands that aren't the start of a
    # numeric (decimal or hexadecimal) or named entity reference.
    BARE_AMPERSAND = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
407
408
    def __init__(self, parser, name, attrs=None, parent=None,
409
                 previous=None):
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
410
        "Basic constructor."
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
411
412
        # We don't actually store the parser object: that lets extracted
413
        # chunks be garbage-collected
414
        self.parserClass = parser.__class__
415
        self.isSelfClosing = parser.isSelfClosingTag(name)
416
        self.convertHTMLEntities = parser.convertHTMLEntities
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
417
        self.name = name
418
        if attrs == None:
419
            attrs = []
420
        self.attrs = attrs
421
        self.contents = []
422
        self.setup(parent, previous)
423
        self.hidden = False
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
424
        self.containsSubstitutions = False
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
425
426
    def get(self, key, default=None):
        """Looks up the attribute named 'key' on this tag and returns
        its value; when the tag has no such attribute, 'default' is
        returned instead."""
        attributes = self._getAttrMap()
        return attributes.get(key, default)
431
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
432
    def has_key(self, key):
        """Returns true if this tag has an attribute named 'key'."""
        # The 'in' operator replaces the deprecated dict.has_key().
        return key in self._getAttrMap()
434
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
435
    def __getitem__(self, key):
436
        """tag[key] returns the value of the 'key' attribute for the tag,
437
        and throws an exception if it's not there."""
438
        return self._getAttrMap()[key]
439
440
    def __iter__(self):
441
        "Iterating over a tag iterates over its contents."
442
        return iter(self.contents)
443
444
    def __len__(self):
445
        "The length of a tag is the length of its list of contents."
446
        return len(self.contents)
447
448
    def __contains__(self, x):
449
        return x in self.contents
450
451
    def __nonzero__(self):
452
        "A tag is non-None even if it has no contents."
453
        return True
454
455
    def __setitem__(self, key, value):        
456
        """Setting tag[key] sets the value of the 'key' attribute for the
457
        tag."""
458
        self._getAttrMap()
459
        self.attrMap[key] = value
460
        found = False
461
        for i in range(0, len(self.attrs)):
462
            if self.attrs[i][0] == key:
463
                self.attrs[i] = (key, value)
464
                found = True
465
        if not found:
466
            self.attrs.append((key, value))
467
        self._getAttrMap()[key] = value
468
469
    def __delitem__(self, key):
470
        "Deleting tag[key] deletes all 'key' attributes for the tag."
471
        for item in self.attrs:
472
            if item[0] == key:
473
                self.attrs.remove(item)
474
                #We don't break because bad HTML can define the same
475
                #attribute multiple times.
476
            self._getAttrMap()
477
            if self.attrMap.has_key(key):
478
                del self.attrMap[key]
479
480
    def __call__(self, *args, **kwargs):
481
        """Calling a tag like a function is the same as calling its
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
482
        findAll() method. Eg. tag('a') returns a list of all the A tags
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
483
        found within this tag."""
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
484
        return apply(self.findAll, args, kwargs)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
485
486
    def __getattr__(self, tag):
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
487
        #print "Getattr %s.%s" % (self.__class__, tag)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
488
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
489
            return self.find(tag[:-3])
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
490
        elif tag.find('__') != 0:
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
491
            return self.find(tag)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
492
493
    def __eq__(self, other):
494
        """Returns true iff this tag has the same name, the same attributes,
495
        and the same contents (recursively) as the given tag.
496
497
        NOTE: right now this will return false if two tags have the
498
        same attributes in a different order. Should this be fixed?"""
499
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
500
            return False
501
        for i in range(0, len(self.contents)):
502
            if self.contents[i] != other.contents[i]:
503
                return False
504
        return True
505
506
    def __ne__(self, other):
507
        """Returns true iff this tag is not identical to the other tag,
508
        as defined in __eq__."""
509
        return not self == other
510
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
511
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """The repr of a tag is its string rendering in the given
        encoding, exactly as produced by __str__."""
        rendered = self.__str__(encoding)
        return rendered
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
514
515
    def __unicode__(self):
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
516
        return self.__str__(None)
517
518
    def _convertEntities(self, match):
519
        x = match.group(1)
520
        if x in name2codepoint:
521
            return unichr(name2codepoint[x])            
522
        elif "&" + x + ";" in self.XML_ENTITIES_TO_CHARS:
523
            return '&%s;' % x
524
        else:
525
            return '&amp;%s;' % x
526
527
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        If prettyPrint is true, the opening and closing tags are put
        on lines of their own, indented by indentLevel-1 spaces, and
        the contents are rendered one indent level deeper.

        A hidden tag renders as its contents only, with no markup of
        its own.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        tagName = self.toEncoding(self.name, encoding)

        renderedAttrs = []
        if self.attrs:
            for key, val in self.attrs:
                # Double quotes are the default delimiter.
                template = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # Choosing the delimiter for the value:
                    #
                    # * No embedded double quotes (with or without
                    #   single quotes): double quotes work.
                    # * Embedded double quotes but no single quotes:
                    #   switch to single quotes.
                    # * Both kinds embedded. This can't happen
                    #   naturally, but it can happen if you modify an
                    #   attribute value after parsing. Keep the double
                    #   quote delimiter and escape the embedded double
                    #   quotes as &quot;.
                    if '"' in val:
                        if "'" in val:
                            val = val.replace('"', "&quot;")
                        else:
                            template = "%s='%s'"

                    # Optionally convert any HTML entities
                    if self.convertHTMLEntities:
                        val = re.sub("&(\w+);", self._convertEntities, val)

                    # The value might also contain angle brackets, or
                    # ampersands that aren't part of entities. Those
                    # are escaped to XML entities too.
                    val = val.replace("<", "&lt;").replace(">", "&gt;")
                    val = self.BARE_AMPERSAND.sub("&amp;", val)

                renderedAttrs.append(template % (self.toEncoding(key, encoding),
                                                 self.toEncoding(val, encoding)))

        # A self-closing tag gets " /" inside its opening tag and no
        # closing tag; any other tag gets a regular closing tag.
        openSuffix = ''
        closingMarkup = ''
        if self.isSelfClosing:
            openSuffix = ' /'
        else:
            closingMarkup = '</%s>' % tagName

        childIndent = 0
        if prettyPrint:
            # 'padding' is only ever used on the prettyPrint paths.
            padding = ' ' * (indentLevel - 1)
            childIndent = indentLevel + 1
        body = self.renderContents(encoding, prettyPrint, childIndent)
        if self.hidden:
            return body

        attributeString = ''
        if renderedAttrs:
            attributeString = ' ' + ' '.join(renderedAttrs)

        pieces = []
        if prettyPrint:
            pieces.append(padding)
        pieces.append('<%s%s%s>' % (tagName, attributeString, openSuffix))
        if prettyPrint:
            pieces.append("\n")
        pieces.append(body)
        if prettyPrint:
            if body and body[-1] != "\n":
                pieces.append("\n")
            if closingMarkup:
                pieces.append(padding)
        pieces.append(closingMarkup)
        if prettyPrint and closingMarkup and self.nextSibling:
            pieces.append("\n")
        return ''.join(pieces)
618
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
619
    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag through __str__ with pretty-printing
        switched on, in the given encoding."""
        prettyPrint = True
        return self.__str__(encoding, prettyPrint)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
621
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
622
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.

        When prettyPrint is true, each piece of text is stripped,
        indented by indentLevel-1 spaces and followed by a newline;
        text that is empty after stripping is left out."""
        chunks = []
        for child in self:
            if not isinstance(child, NavigableString):
                # Child tags render themselves, indentation included.
                # Anything that is neither text nor a tag is skipped.
                if isinstance(child, Tag):
                    chunks.append(child.__str__(encoding, prettyPrint,
                                                indentLevel))
                continue

            text = child.__str__(encoding)
            if text and prettyPrint:
                text = text.strip()
            if not text:
                continue
            if prettyPrint:
                chunks.append(" " * (indentLevel-1))
                chunks.append(text)
                chunks.append("\n")
            else:
                chunks.append(text)
        return ''.join(chunks)
642
643
    #Soup methods
644
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
645
    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria, or None if nothing matches.

        This is findAll with a limit of 1; all the criteria are
        passed through to it unchanged."""
        matches = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if matches:
            return matches[0]
        return None
    findChild = find
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
655
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
656
    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With recursive true (the default) the candidates come from
        recursiveChildGenerator; otherwise they come from
        childGenerator. The matching itself is done by _findAll."""
        if recursive:
            candidates = self.recursiveChildGenerator
        else:
            candidates = self.childGenerator
        return self._findAll(name, attrs, text, limit, candidates, **kwargs)
    findChildren = findAll
672
673
    # Pre-3.x compatibility methods
674
    # Older spellings of findAll and find, bound to the same methods.
    fetch = findAll
    first = find
676
    
677
    def fetchText(self, text=None, recursive=True, limit=None):
        """Calls findAll with only the text, recursive and limit
        criteria, each passed as a keyword argument."""
        criteria = {'text': text, 'recursive': recursive, 'limit': limit}
        return self.findAll(**criteria)
679
680
    def firstText(self, text=None, recursive=True):
        """Calls find with only the text and recursive criteria, each
        passed as a keyword argument."""
        criteria = {'text': text, 'recursive': recursive}
        return self.find(**criteria)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
682
    
683
    #Utility methods
684
685
    def append(self, tag):
        """Adds the given tag at the end of this tag's contents list."""
        children = self.contents
        children.append(tag)
688
689
    #Private methods
690
691
    def _getAttrMap(self):
692
        """Initializes a map representation of this tag's attributes,
693
        if not already initialized."""
694
        if not getattr(self, 'attrMap'):
695
            self.attrMap = {}
696
            for (key, value) in self.attrs:
697
                self.attrMap[key] = value 
698
        return self.attrMap
699
700
    #Generator methods
701
    def childGenerator(self):
        """Generator over the items of self.contents, in order.

        The number of items is sampled once, when iteration starts,
        and the items are then fetched by index.

        The generator finishes by running off the end of the function.
        An explicit 'raise StopIteration' here would be converted to
        RuntimeError on Python 3.7+ (PEP 479); on older versions the
        two ways of finishing are equivalent."""
        for i in range(0, len(self.contents)):
            yield self.contents[i]
705
    
706
    def recursiveChildGenerator(self):
        """Generator over everything below this tag, depth first: each
        child is yielded, and if it is a Tag its own contents follow
        before the next sibling.

        An explicit stack of (tag, index to resume from) pairs is used
        instead of recursion.

        The generator finishes by running off the end of the function.
        An explicit 'raise StopIteration' here would be converted to
        RuntimeError on Python 3.7+ (PEP 479); on older versions the
        two ways of finishing are equivalent."""
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        # Remember where to resume in this tag (if
                        # anything is left), then descend into the
                        # child, which is processed first because it
                        # is pushed last.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
720
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
721
# Next, a couple classes to represent queries and their results.
722
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Records the matching criteria.

        name  -- criterion for the tag name.
        attrs -- map of attribute name to criterion. As a shortcut, a
                 plain string is taken as the criterion for the
                 'class' attribute.
        text  -- criterion for text elements.

        Any extra keyword arguments are further attribute criteria.
        They are merged into a copy of attrs, so the map passed in
        (or the shared default) is not modified."""
        self.name = name
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        """Returns the text criterion if there is one, otherwise
        "name|attrs"."""
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Checks a tag against the name and attribute criteria.

        markupName  -- either a Tag object or a tag name.
        markupAttrs -- the tag's attributes, as an object with a 'get'
                       method or as a sequence of (key, value) pairs.
                       Replaced by the Tag itself when markupName is
                       a Tag.

        Returns what was passed as markupName on a match, and None
        otherwise."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable name criterion is called with the raw name and
        # attributes, but only when we were not handed a Tag object.
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Build the attribute lookup lazily, only if
                        # there is at least one criterion to check.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k, v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Checks a piece of markup against the criteria and returns
        the matching element, or None.

        markup may be a list of elements (scanned for the first
        matching text element), a Tag, or a string.

        Raises Exception for any other kind of object."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            # Call form of raise: same exception and message as the
            # old "raise Exception, msg" statement, but also valid
            # syntax on newer Python versions.
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Tests one value (a Tag, a string, or None) against one
        criterion and returns a true or false value.

        matchAgainst may be:
         * True: matches anything that is not None.
         * a callable: called with markup; its result is returned.
         * an object with a 'match' method (a regular expression):
           searched for in the string.
         * a list or a map: matches if the string is one of its
           items or keys.
         * anything else: compared with the string for equality."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that markup is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # It's a map: match on its keys. (This used to call
                # has_key on markup, which is a string or None at this
                # point and so could only raise AttributeError.)
                result = markup in matchAgainst
            elif matchAgainst and isString(markup):
                # Bring the criterion to the same string type as the
                # markup before the equality test below.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
840
841
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Creates an empty result list and records in self.source the
        object that produced it."""
        # Initialize this list itself; list.__init__([]) would only
        # initialize a temporary list and leave self untouched.
        list.__init__(self, [])
        self.source = source
847
848
# Now, some helper functions.
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
849
850
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.

    Anything with an __iter__ attribute counts as listlike, and so
    does any object whose exact type is list or tuple."""
    if hasattr(l, '__iter__'):
        return True
    return type(l) in (types.ListType, types.TupleType)
855
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
856
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike.

    Where the unicode and basestring builtins exist they are used for
    the test. If either name is missing, the NameError is caught and
    the test falls back to a plain str check."""
    try:
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        return isinstance(s, str)
863
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
864
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.

    Entries of a map are copied as they are. Every item of a list,
    and every scalar, becomes a key mapped to 'default'. Later
    arguments overwrite earlier ones on the same key."""
    tagMap = {}
    for part in args:
        if hasattr(part, 'items'):
            # A map: merge its entries unchanged.
            for key, value in part.items():
                tagMap[key] = value
            continue
        # A list contributes each of its items as a key; anything
        # else is a single scalar key.
        if isList(part):
            keys = part
        else:
            keys = [part]
        for key in keys:
            tagMap[key] = default
    return tagMap
882
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
883
# Now, the parser classes.
884
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
885
class BeautifulStoneSoup(Tag, SGMLParser):
886
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
887
    """This class contains the basic parser and search code. It defines
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
888
    a parser that knows nothing about tag behavior except for the
889
    following:
890
   
891
      You can't close a tag without closing all the tags it encloses.
892
      That is, "<foo><bar></foo>" actually means
893
      "<foo><bar></bar></foo>".
894
895
    [Another possible explanation is "<foo><bar /></foo>", but since
896
    this class defines no SELF_CLOSING_TAGS, it will never use that
897
    explanation.]
898
899
    This class is useful for parsing XML or made-up markup languages,
900
    or when BeautifulSoup makes an assumption counter to what you were
901
    expecting."""
902
903
    SELF_CLOSING_TAGS = {}
904
    NESTABLE_TAGS = {}
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
905
    RESET_NESTING_TAGS = {}
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
906
    QUOTE_TAGS = {}
907
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
908
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
909
                       lambda x: x.group(1) + ' />'),
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
910
                      (re.compile('<!\s+([^<>]*)>'),
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
911
                       lambda x: '<!' + x.group(1) + '>')
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
912
                      ]
913
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
914
    ROOT_TAG_NAME = u'[document]'
915
916
    HTML_ENTITIES = "html"
917
    XML_ENTITIES = "xml"
918
    ALL_ENTITIES = [HTML_ENTITIES, XML_ENTITIES]
919
920
    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want.

        convertEntities may be a single value or a list of values; it
        is compared against HTML_ENTITIES and XML_ENTITIES to decide
        which kinds of entities get converted."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding

        if convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
        else:
            self.smartQuotesTo = smartQuotesTo

        if isList(convertEntities):
            convertHTML = self.HTML_ENTITIES in convertEntities
            convertXML = self.XML_ENTITIES in convertEntities
        else:
            convertHTML = self.HTML_ENTITIES == convertEntities
            convertXML = self.XML_ENTITIES == convertEntities
        self.convertHTMLEntities = convertHTML
        self.convertXMLEntities = convertXML

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        # A file-type object is read in full right away.
        if hasattr(markup, 'read'):
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed()
        except StopParsing:
            # Parsing was stopped on purpose; keep what was parsed.
            pass
        # The markup can now be GCed
        self.markup = None
977
978
    def _feed(self, inDocumentEncoding=None):
        """Converts self.markup to Unicode, applies the markup massage
        if enabled, resets the parser and runs the whole document
        through it.

        inDocumentEncoding is tried, after self.fromEncoding, when the
        markup has to be converted to Unicode."""
        markup = self.markup
        if not isinstance(markup, unicode):
            # Convert the document to Unicode.
            dammit = UnicodeDammit(
                markup, [self.fromEncoding, inDocumentEncoding],
                smartQuotesTo=self.smartQuotesTo)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
        elif not hasattr(self, 'originalEncoding'):
            self.originalEncoding = None

        if markup and self.markupMassage:
            # Any non-list true value selects the default massage.
            if not isList(self.markupMassage):
                self.markupMassage = self.MARKUP_MASSAGE
            for pattern, replacement in self.markupMassage:
                markup = pattern.sub(replacement, markup)
        self.reset()

        SGMLParser.feed(self, markup or "")
        SGMLParser.close(self)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1004
1005
    def __getattr__(self, methodName):
1006
        """This method routes method call requests to either the SGMLParser
1007
        superclass or the Tag superclass, depending on the method name."""
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1008
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1009
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1010
        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1011
               or methodName.find('do_') == 0:
1012
            return SGMLParser.__getattr__(self, methodName)
1013
        elif methodName.find('__') != 0:
1014
            return Tag.__getattr__(self, methodName)
1015
        else:
1016
            raise AttributeError
1017
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1018
    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser.

        Both the class-wide SELF_CLOSING_TAGS map and this instance's
        instanceSelfClosingTags map are consulted. The 'in' operator
        is used instead of has_key(), which is deprecated and does
        not exist on Python 3 dictionaries."""
        return name in self.SELF_CLOSING_TAGS \
               or name in self.instanceSelfClosingTags
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1023
            
1024
    def reset(self):
        """Re-initializes this object as an empty, hidden root tag
        with fresh parser state and an empty tag stack holding only
        the root."""
        # The soup acts as its own parser and as the root tag.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentTag = None
        self.currentData = []
        self.quoteStack = []
        self.tagStack = []
        # There is no current tag at this point, so the root is only
        # pushed on the stack, not appended to anything.
        self.pushTag(self)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1033
    
1034
    def popTag(self):
        """Removes the top of the tag stack, makes the new top (if
        any) the current tag, and returns the current tag."""
        self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        closing = self.currentTag
        children = closing.contents
        if len(children) == 1 and isinstance(children[0], NavigableString):
            closing.string = children[0]

        #print "Pop", closing.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag
1047
1048
    def pushTag(self, tag):
        """Appends the tag to the current tag (if there is one),
        pushes it on the tag stack and makes it the current tag."""
        #print "Push", tag.name
        parent = self.currentTag
        if parent:
            parent.append(tag)
        stack = self.tagStack
        stack.append(tag)
        self.currentTag = stack[-1]
1054
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1055
    def endData(self, containerClass=NavigableString):
        """Turns the text collected in self.currentData into one
        containerClass object, links it after self.previous and
        appends it to the current tag's contents.

        Does nothing if no text has been collected. Text that is all
        whitespace is reduced to a single newline (if it contains one)
        or a single space. With a parseOnlyThese strainer in force,
        text seen while at most one tag is on the stack is dropped
        unless the strainer has a text criterion that it matches."""
        if not self.currentData:
            return

        text = ''.join(self.currentData)
        if text.endswith('<') and self.convertHTMLEntities:
            text = text[:-1] + '&lt;'
        if not text.strip():
            if '\n' in text:
                text = '\n'
            else:
                text = ' '
        self.currentData = []

        strainer = self.parseOnlyThese
        if strainer and len(self.tagStack) <= 1 and \
               (not strainer.text or not strainer.search(text)):
            return

        node = containerClass(text)
        node.setup(self.currentTag, self.previous)
        if self.previous:
            self.previous.next = node
        self.previous = node
        self.currentTag.contents.append(node)
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1076
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1077
1078
    def _popToTag(self, name, inclusivePop=True):
1079
        """Pops the tag stack up to and including the most recent
1080
        instance of the given tag. If inclusivePop is false, pops the tag
1081
        stack up to but *not* including the most recent instqance of
1082
        the given tag."""
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1083
        #print "Popping to %s" % name
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1084
        if name == self.ROOT_TAG_NAME:
1085
            return            
1086
1087
        numPops = 0
1088
        mostRecentTag = None
1089
        for i in range(len(self.tagStack)-1, 0, -1):
1090
            if name == self.tagStack[i].name:
1091
                numPops = len(self.tagStack)-i
1092
                break
1093
        if not inclusivePop:
1094
            numPops = numPops - 1
1095
1096
        for i in range(0, numPops):
1097
            mostRecentTag = self.popTag()
1098
        return mostRecentTag    
1099
1100
    def _smartPop(self, name):
1101
1102
        """We need to pop up to the previous tag of this type, unless
1103
        one of this tag's nesting reset triggers comes between this
1104
        tag and the previous tag of this type, OR unless this tag is a
1105
        generic nesting trigger and another generic nesting trigger
1106
        comes between this tag and the previous tag of this type.
1107
1108
        Examples:
1109
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1110
         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
1111
         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
1112
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1113
1114
         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1115
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1116
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1117
        """
1118
1119
        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1120
        isNestable = nestingResetTriggers != None
1121
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1122
        popTo = None
1123
        inclusive = True
1124
        for i in range(len(self.tagStack)-1, 0, -1):
1125
            p = self.tagStack[i]
1126
            if (not p or p.name == name) and not isNestable:
1127
                #Non-nestable tags get popped to the top or to their
1128
                #last occurance.
1129
                popTo = name
1130
                break
1131
            if (nestingResetTriggers != None
1132
                and p.name in nestingResetTriggers) \
1133
                or (nestingResetTriggers == None and isResetNesting
1134
                    and self.RESET_NESTING_TAGS.has_key(p.name)):
1135
                
1136
                #If we encounter one of the nesting reset triggers
1137
                #peculiar to this tag, or we encounter another tag
1138
                #that causes nesting to reset, pop up to but not
1139
                #including that tag.
1140
                popTo = p.name
1141
                inclusive = False
1142
                break
1143
            p = p.parent
1144
        if popTo:
1145
            self._popToTag(popTo, inclusive)
1146
1147
    def unknown_starttag(self, name, attrs, selfClosing=0):
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1148
        #print "Start tag %s: %s" % (name, attrs)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1149
        if self.quoteStack:
1150
            #This is not a real tag.
1151
            #print "<%s> is not real!" % name
1152
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1153
            self.currentData.append('<%s%s>' % (name, attrs))
1154
            return        
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1155
        self.endData()
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1156
1157
        if not self.isSelfClosingTag(name) and not selfClosing:
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1158
            self._smartPop(name)
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1159
1160
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
1161
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1162
            return
1163
1164
        tag = Tag(self, name, attrs, self.currentTag, self.previous)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1165
        if self.previous:
1166
            self.previous.next = tag
1167
        self.previous = tag
1168
        self.pushTag(tag)
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1169
        if selfClosing or self.isSelfClosingTag(name):
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1170
            self.popTag()                
1171
        if name in self.QUOTE_TAGS:
1172
            #print "Beginning quote (%s)" % name
1173
            self.quoteStack.append(name)
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1174
            self.literal = 1
1175
        return tag
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1176
1177
    def unknown_endtag(self, name):
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1178
        #print "End tag %s" % name
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1179
        if self.quoteStack and self.quoteStack[-1] != name:
1180
            #This is not a real end tag.
1181
            #print "</%s> is not real!" % name
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1182
            self.currentData.append('</%s>' % name)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1183
            return
1184
        self.endData()
1185
        self._popToTag(name)
1186
        if self.quoteStack and self.quoteStack[-1] == name:
1187
            self.quoteStack.pop()
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1188
            self.literal = (len(self.quoteStack) > 0)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1189
1190
    def handle_data(self, data):
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1191
        if self.convertHTMLEntities:
1192
            if data[0] == '&':
1193
                data = self.BARE_AMPERSAND.sub("&amp;",data)
1194
            else:
1195
                data = data.replace('&','&amp;') \
1196
                           .replace('<','&lt;') \
1197
                           .replace('>','&gt;')
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1198
        self.currentData.append(data)
1199
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1200
    def _toStringSubclass(self, text, subclass):
1201
        """Adds a certain piece of text to the tree as a NavigableString
1202
        subclass."""
1203
        self.endData()
1204
        self.handle_data(text)
1205
        self.endData(subclass)
1206
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1207
    def handle_pi(self, text):
        """Store a processing instruction in the tree as a
        ProcessingInstruction object. An instruction starting with
        "xml" is replaced by a canonical XML declaration carrying a
        %SOUP-ENCODING% slot into which an encoding will be plugged
        later."""
        isXMLDeclaration = (text[:3] == "xml")
        if isXMLDeclaration:
            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1214
1215
    def handle_comment(self, text):
        """Store the text of a comment in the tree as a Comment
        object."""
        self._toStringSubclass(text, Comment)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1218
1219
    def handle_charref(self, ref):
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1220
        "Handle character references as data."
1221
        if ref[0] == 'x':
1222
            data = unichr(int(ref[1:],16))
1223
        else:
1224
            data = unichr(int(ref))
1225
        
1226
        if u'\x80' <= data <= u'\x9F':
1227
            data = UnicodeDammit.subMSChar(chr(ord(data)), self.smartQuotesTo)
1228
        elif not self.convertHTMLEntities and not self.convertXMLEntities:
1229
            data = '&#%s;' % ref
1230
1231
        self.handle_data(data)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1232
1233
    def handle_entityref(self, ref):
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1234
        """Handle entity references as data, possibly converting known
1235
        HTML entity references to the corresponding Unicode
1236
        characters."""
1237
        replaceWithXMLEntity = self.convertXMLEntities and \
1238
                               self.XML_ENTITIES_TO_CHARS.has_key(ref)
1239
        if self.convertHTMLEntities or replaceWithXMLEntity:
1240
            try:
1241
                data = unichr(name2codepoint[ref])
1242
            except KeyError:
1243
                if replaceWithXMLEntity:
1244
                    data = self.XML_ENTITIES_TO_CHARS.get(ref)
1245
                else:
1246
                    data="&amp;%s" % ref
1247
        else:
1248
            data = '&%s;' % ref
1249
        self.handle_data(data)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1250
        
1251
    def handle_decl(self, data):
        """Store a declaration (a DOCTYPE or the like) in the tree as
        a Declaration object."""
        self._toStringSubclass(data, Declaration)
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1254
1255
    def parse_declaration(self, i):
        """Parse the declaration starting at offset `i` of
        self.rawdata and return the offset at which parsing should
        resume.

        A CDATA section becomes a CData object. Anything else goes to
        SGMLParser.parse_declaration; if that raises SGMLParseError
        the rest of the document is treated as raw data."""
        if self.rawdata[i:i+9] != '<![CDATA[':
            try:
                return SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Bogus declaration: swallow everything that is left
                # as plain text.
                remainder = self.rawdata[i:]
                self.handle_data(remainder)
                return i + len(remainder)

        end = self.rawdata.find(']]>', i)
        if end == -1:
            # Unterminated section: it runs to the end of the data.
            end = len(self.rawdata)
        self._toStringSubclass(self.rawdata[i+9:end], CData)
        # Resume just past the three-character ']]>' terminator.
        return end + 3
1274
1275
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Same arguments as BeautifulStoneSoup, except that
        smartQuotesTo defaults to self.HTML_ENTITIES when the caller
        does not pass it."""
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)")

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning.

        attrs is the list of (name, value) pairs of the META tag; the
        'content' pair may be rewritten in place so that its charset
        becomes a %SOUP-ENCODING% slot. Raises StopParsing after the
        document has been re-fed with a newly declared encoding."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if getattr(self, 'declaredHTMLEncoding', None) or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag.
                    #
                    # Substitute into contentType, not the loop
                    # variable `value`: the latter holds whichever
                    # attribute came last, which need not be 'content'.
                    newAttr = self.CHARSET_RE.sub\
                              (lambda match: match.group(1) +
                               "%SOUP-ENCODING%", contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
1421
1422
class StopParsing(Exception):
    """Raised by BeautifulSoup.start_meta once the document has been
    re-fed with the encoding declared in a META tag, to abandon the
    current parsing pass."""
1424
   
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1425
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Each tag is listed once; the resulting NESTABLE_TAGS map is the
    # same as with the duplicated 'strong' and 'big' entries.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1460
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1461
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # The first argument of buildTagMap is the value each tag maps
    # to, as in the other calls in this file; the tag names follow.
    # NOTE(review): buildTagMap('noscript') passed the tag name in the
    # default slot, which presumably built an empty map - confirm
    # against buildTagMap.
    RESET_NESTING_TAGS = buildTagMap(None, 'noscript')
    NESTABLE_TAGS = {}
1473
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1474
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Pop the current tag, first copying its single string child
        (if that is all it contains) onto the parent as an attribute
        named after the tag. An attribute the parent already has is
        never overwritten. Returns whatever the base-class popTag
        returns."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                tag.name not in parent.attrMap):
                parent[tag.name] = tag.contents[0]
        # Pass the result on: _popToTag uses the value of popTag().
        return BeautifulStoneSoup.popTag(self)
1504
1505
#Enterprise class names! It has come to our attention that some people
1506
#think the names of the Beautiful Soup parser classes are too silly
1507
#and "unprofessional" for use in enterprise screen-scraping. We feel
1508
#your pain! For such-minded folk, the Beautiful Soup Consortium And
1509
#All-Night Kosher Bakery recommends renaming this file to
1510
#"RobustParser.py" (or, in cases of extreme enterprisitude,
1511
#"RobustParserBeanInterface.class") and using the following
1512
#enterprise-friendly class aliases:
1513
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
1515
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
1517
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1519
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1521
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1523
3701 by Steve Alexander
make the onezerostatus page actually work, and update BeautifulSoup.py to version 3
1524
######################################################
1525
#
1526
# Bonus library: Unicode, Dammit
1527
#
1528
# This class forces XML data into a standard format (usually to UTF-8
1529
# or Unicode).  It is heavily based on code from Mark Pilgrim's
1530
# Universal Feed Parser. It does not rewrite the XML or HTML to
1531
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1532
# (XML) and BeautifulSoup.start_meta (HTML).
1533
1534
# Autodetects character encodings.
1535
# Download from http://chardet.feedparser.org/
1536
try:
    import chardet
except ImportError:
    # chardet is optional. Code that uses it checks for None first.
    chardet = None
# NOTE(review): an unconditional "chardet = None" used to follow this
# block and discarded a successful import. If auto-detection was
# switched off on purpose, restore it with a comment saying why.
1543
1544
# cjkcodecs and iconv_codec make Python know about more character encodings.
1545
# Both are available from http://cjkpython.i18n.org/
1546
# They're built in if you use Python 2.4.
1547
# Both packages are optional: a missing one is ignored, but any other
# failure while importing them is no longer silently swallowed.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass
1555
1556
class UnicodeDammit:
1557
    """A class for detecting the encoding of a *ML document and
1558
    converting it to a Unicode string. If the source encoding is
1559
    windows-1252, can replace MS smart quotes with their HTML or XML
1560
    equivalents."""
1561
1562
    # This dictionary maps commonly seen values for "charset" in HTML
1563
    # meta tags to the corresponding Python codec names. It only covers
1564
    # values that aren't in Python's aliases and can't be determined
1565
    # by the heuristics in find_codec.
1566
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
1567
                        "x-sjis" : "shift-jis" }
1568
    
1569
    def __init__(self, markup, overrideEncodings=[],
1570
                 smartQuotesTo='xml'):
1571
        self.markup, documentEncoding, sniffedEncoding = \
1572
                     self._detectEncoding(markup)
1573
        self.smartQuotesTo = smartQuotesTo
1574
        self.triedEncodings = []
1575
        if isinstance(markup, unicode):
1576
            return markup
1577
1578
        u = None
1579
        for proposedEncoding in overrideEncodings:
1580
            u = self._convertFrom(proposedEncoding)
1581
            if u: break
1582
        if not u:
1583
            for proposedEncoding in (documentEncoding, sniffedEncoding):
1584
                u = self._convertFrom(proposedEncoding)
1585
                if u: break
1586
                
1587
        # If no luck and we have auto-detection library, try that:
1588
        if not u and chardet and not isinstance(self.markup, unicode):
1589
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1590
1591
        # As a last resort, try utf-8 and windows-1252:
1592
        if not u:
1593
            for proposed_encoding in ("utf-8", "windows-1252"):
1594
                u = self._convertFrom(proposed_encoding)
1595
                if u: break
1596
        self.unicode = u
1597
        if not u: self.originalEncoding = None
1598
1599
    def subMSChar(orig, smartQuotesTo):
1600
        """Changes a MS smart quote character to an XML or HTML
1601
        entity."""
1602
        sub = UnicodeDammit.MS_CHARS.get(orig)
1603
        if type(sub) == types.TupleType:
1604
            if smartQuotesTo == 'xml':
1605
                sub = '&#x%s;' % sub[1]
1606
            elif smartQuotesTo == 'html':
1607
                sub = '&%s;' % sub[0]
1608
            else:
1609
                sub = unichr(int(sub[1],16))
1610
        return sub            
1611
    subMSChar = staticmethod(subMSChar)
1612
1613
    def _convertFrom(self, proposed):        
1614
        proposed = self.find_codec(proposed)
1615
        if not proposed or proposed in self.triedEncodings:
1616
            return None
1617
        self.triedEncodings.append(proposed)
1618
        markup = self.markup
1619
1620
        # Convert smart quotes to HTML if coming from an encoding
1621
        # that might have them.
1622
        if self.smartQuotesTo and proposed in("windows-1252",
1623
                                              "ISO-8859-1",
1624
                                              "ISO-8859-2"):
1625
            markup = re.compile("([\x80-\x9f])").sub \
1626
                     (lambda(x): self.subMSChar(x.group(1),self.smartQuotesTo),
1627
                      markup)
1628
1629
        try:
1630
            # print "Trying to convert document to %s" % proposed
1631
            u = self._toUnicode(markup, proposed)
1632
            self.markup = u       
1633
            self.originalEncoding = proposed
1634
        except Exception, e:
1635
            # print "That didn't work!"
1636
            # print e
1637
            return None        
1638
        #print "Correct encoding: %s" % proposed
1639
        return self.markup
1640
1641
    def _toUnicode(self, data, encoding):
1642
        '''Given a string and its encoding, decodes the string into Unicode.
1643
        %encoding is a string recognized by encodings.aliases'''
1644
1645
        # strip Byte Order Mark (if present)
1646
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1647
               and (data[2:4] != '\x00\x00'):
1648
            encoding = 'utf-16be'
1649
            data = data[2:]
1650
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1651
                 and (data[2:4] != '\x00\x00'):
1652
            encoding = 'utf-16le'
1653
            data = data[2:]
1654
        elif data[:3] == '\xef\xbb\xbf':
1655
            encoding = 'utf-8'
1656
            data = data[3:]
1657
        elif data[:4] == '\x00\x00\xfe\xff':
1658
            encoding = 'utf-32be'
1659
            data = data[4:]
1660
        elif data[:4] == '\xff\xfe\x00\x00':
1661
            encoding = 'utf-32le'
1662
            data = data[4:]
1663
        newdata = unicode(data, encoding)
1664
        return newdata
1665
    
1666
    def _detectEncoding(self, xml_data):
1667
        """Given a document, tries to detect its XML encoding."""
1668
        xml_encoding = sniffed_xml_encoding = None
1669
        try:
1670
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
1671
                # EBCDIC
1672
                xml_data = self._ebcdic_to_ascii(xml_data)
1673
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
1674
                # UTF-16BE
1675
                sniffed_xml_encoding = 'utf-16be'
1676
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1677
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1678
                     and (xml_data[2:4] != '\x00\x00'):
1679
                # UTF-16BE with BOM
1680
                sniffed_xml_encoding = 'utf-16be'
1681
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1682
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
1683
                # UTF-16LE
1684
                sniffed_xml_encoding = 'utf-16le'
1685
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1686
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1687
                     (xml_data[2:4] != '\x00\x00'):
1688
                # UTF-16LE with BOM
1689
                sniffed_xml_encoding = 'utf-16le'
1690
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1691
            elif xml_data[:4] == '\x00\x00\x00\x3c':
1692
                # UTF-32BE
1693
                sniffed_xml_encoding = 'utf-32be'
1694
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1695
            elif xml_data[:4] == '\x3c\x00\x00\x00':
1696
                # UTF-32LE
1697
                sniffed_xml_encoding = 'utf-32le'
1698
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1699
            elif xml_data[:4] == '\x00\x00\xfe\xff':
1700
                # UTF-32BE with BOM
1701
                sniffed_xml_encoding = 'utf-32be'
1702
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1703
            elif xml_data[:4] == '\xff\xfe\x00\x00':
1704
                # UTF-32LE with BOM
1705
                sniffed_xml_encoding = 'utf-32le'
1706
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1707
            elif xml_data[:3] == '\xef\xbb\xbf':
1708
                # UTF-8 with BOM
1709
                sniffed_xml_encoding = 'utf-8'
1710
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1711
            else:
1712
                sniffed_xml_encoding = 'ascii'
1713
                pass
1714
            xml_encoding_match = re.compile \
1715
                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1716
                                 .match(xml_data)
1717
        except:
1718
            xml_encoding_match = None
1719
        if xml_encoding_match:
1720
            xml_encoding = xml_encoding_match.groups()[0].lower()
1721
            if sniffed_xml_encoding and \
1722
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1723
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1724
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1725
                                 'utf16', 'u16')):
1726
                xml_encoding = sniffed_xml_encoding
1727
        return xml_data, xml_encoding, sniffed_xml_encoding
1728
1729
1730
    def find_codec(self, charset):
        """Return a usable codec name for charset.

        Candidates are tried in order: the CHARSET_ALIASES entry for
        charset (or charset itself when it has no alias), charset with
        every "-" removed, and charset with every "-" turned into "_".
        The first candidate accepted by _codec is returned.  If none is
        accepted, or charset is empty/None, charset is returned as is.
        """
        aliased = self.CHARSET_ALIASES.get(charset, charset)
        found = self._codec(aliased)
        if found:
            return found
        if charset:
            for candidate in (charset.replace("-", ""),
                              charset.replace("-", "_")):
                found = self._codec(candidate)
                if found:
                    return found
        return charset
1735
1736
    def _codec(self, charset):
1737
        if not charset: return charset 
1738
        codec = None
1739
        try:
1740
            codecs.lookup(charset)
1741
            codec = charset
1742
        except LookupError:
1743
            pass
1744
        return codec
1745
1746
    EBCDIC_TO_ASCII_MAP = None
1747
    def _ebcdic_to_ascii(self, s):
1748
        c = self.__class__
1749
        if not c.EBCDIC_TO_ASCII_MAP:
1750
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1751
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1752
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1753
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1754
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1755
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1756
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1757
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1758
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1759
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1760
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1761
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1762
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1763
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1764
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1765
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1766
                    250,251,252,253,254,255)
1767
            import string
1768
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1769
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1770
        return s.translate(c.EBCDIC_TO_ASCII_MAP)
1771
1772
    # Replacements for the single characters '\x80' through '\x9f'
    # (presumably the Windows-1252 "smart" punctuation range -- confirm
    # against the code that consumes this table).  Each value is either
    # a plain replacement string or a pair of
    # (entity name, hexadecimal code point).
    MS_CHARS = {
        '\x80': ('euro', '20AC'),
        '\x81': ' ',
        '\x82': ('sbquo', '201A'),
        '\x83': ('fnof', '192'),
        '\x84': ('bdquo', '201E'),
        '\x85': ('hellip', '2026'),
        '\x86': ('dagger', '2020'),
        '\x87': ('Dagger', '2021'),
        '\x88': ('circ', '2C6'),
        '\x89': ('permil', '2030'),
        '\x8A': ('Scaron', '160'),
        '\x8B': ('lsaquo', '2039'),
        '\x8C': ('OElig', '152'),
        '\x8D': '?',
        '\x8E': ('#x17D', '17D'),
        '\x8F': '?',
        '\x90': '?',
        '\x91': ('lsquo', '2018'),
        '\x92': ('rsquo', '2019'),
        '\x93': ('ldquo', '201C'),
        '\x94': ('rdquo', '201D'),
        '\x95': ('bull', '2022'),
        '\x96': ('ndash', '2013'),
        '\x97': ('mdash', '2014'),
        '\x98': ('tilde', '2DC'),
        '\x99': ('trade', '2122'),
        '\x9a': ('scaron', '161'),
        '\x9b': ('rsaquo', '203A'),
        '\x9c': ('oelig', '153'),
        '\x9d': '?',
        '\x9e': ('#x17E', '17E'),
        '\x9f': ('Yuml', '178'),
    }
1804
1805
#######################################################################
1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1806
1807
1808
#By default, act as an HTML pretty-printer.
1809
if __name__ == '__main__':
1810
    import sys
1811
    soup = BeautifulSoup(sys.stdin.read())
1812
    print soup.prettify()