~launchpad-pqm/launchpad/devel

1976 by Canonical.com Patch Queue Manager
dyson product release creator [r=dsilvers]
1
"""Beautiful Soup
2
Elixir and Tonic
3
"The Screen-Scraper's Friend"
4
v2.1.0
5
http://www.crummy.com/software/BeautifulSoup/
6
7
Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
8
into a tree representation. It provides methods and Pythonic idioms
9
that make it easy to search and modify the tree.
10
11
A well-formed XML/HTML document will yield a well-formed data
12
structure. An ill-formed XML/HTML document will yield a
13
correspondingly ill-formed data structure. If your document is only
14
locally well-formed, you can use this library to find and process the
15
well-formed part of it. The BeautifulSoup class has heuristics for
16
obtaining a sensible parse tree in the face of common HTML errors.
17
18
Beautiful Soup has no external dependencies. It works with Python 2.2
19
and up.
20
21
Beautiful Soup defines classes for four different parsing strategies:
22
23
 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
24
   language that kind of looks like XML.
25
26
 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
27
   or invalid.
28
29
 * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
30
   that trips up BeautifulSoup.
31
32
 * BeautifulSOAP, for making it easier to parse XML documents that use
33
   lots of subelements containing a single string, where you'd prefer
34
   they put that string into an attribute (such as SOAP messages).
35
36
You can subclass BeautifulStoneSoup or BeautifulSoup to create a
37
parsing strategy specific to an XML schema or a particular bizarre
38
HTML document. Typically your subclass would just override
39
SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
40
"""
41
from __future__ import generators
42
43
__author__ = "Leonard Richardson (leonardr@segfault.org)"
44
__version__ = "2.1.0"
45
__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
46
__license__ = "PSF"
47
48
from sgmllib import SGMLParser, SGMLParseError
49
import types
50
import re
51
import sgmllib
52
53
#This code makes Beautiful Soup able to parse XML with namespaces
54
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
55
56
class NullType(object):

    """Type of the 'Null' singleton, a do-nothing stand-in for None.

    Unlike None, Null tolerates any use: calling it, reading an
    attribute of it or subscripting it gives back Null itself,
    assignments to it are silently dropped, and it looks like an
    empty, false container.

    Examples:
    >>> Null("send", "a", "message")("and one more",
    ...      "and what you get still") is Null
    True
    """

    def __new__(cls):
        # Never build a second instance: always hand back the singleton.
        return Null

    def __call__(self, *args, **kwargs):
        return Null

    def __getattr__(self, attr):
        return Null

    def __getitem__(self, item):
        return Null

    def __setattr__(self, attr, value):
        pass

    def __setitem__(self, item, value):
        pass

    def __len__(self):
        # A zero length is also what makes Null evaluate as false.
        return 0

    def __iter__(self):
        # Must be explicit: without it iteration falls back to
        # __getitem__, which never raises IndexError, so
        # ``for x in Null: pass`` would never terminate.
        return iter([])

    def __contains__(self, item):
        return False

    def __repr__(self):
        return "Null"

# The one and only instance. It is built with object.__new__ because
# NullType.__new__ merely returns this very name.
Null = object.__new__(NullType)
81
82
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element is linked to its 'parent', to the elements parsed
    right before and after it ('previous' / 'next') and to its
    neighbours under the same parent ('previousSibling' /
    'nextSibling'). setup() initialises every link to the falsy Null
    singleton unless a real element is supplied."""

    def setup(self, parent=Null, previous=Null):
        """Sets up the initial relations between this element and
        other elements.

        If 'parent' already has contents, its current last child
        becomes this element's previous sibling (and this element
        becomes that child's next sibling)."""
        self.parent = parent
        self.previous = previous
        self.next = Null
        self.previousSibling = Null
        self.nextSibling = Null
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def findNext(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document, or Null if there is
        none."""
        return self._first(self.fetchNext, name, attrs, text)
    firstNext = findNext

    def fetchNext(self, name=None, attrs={}, text=None, limit=None):
        """Returns a list of all items that match the given criteria
        and appear after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextGenerator)

    def findNextSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document, or
        Null if there is none."""
        return self._first(self.fetchNextSiblings, name, attrs, text)
    firstNextSibling = findNextSibling

    def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
        """Returns a list of the siblings of this Tag that match the
        given criteria and appear after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)

    def findPrevious(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document, or Null if there is
        none."""
        return self._first(self.fetchPrevious, name, attrs, text)

    def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
        """Returns a list of all items that match the given criteria
        and appear before this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.previousGenerator)
    firstPrevious = findPrevious

    def findPreviousSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document, or
        Null if there is none."""
        return self._first(self.fetchPreviousSiblings, name, attrs, text)
    firstPreviousSibling = findPreviousSibling

    def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
                              limit=None):
        """Returns a list of the siblings of this Tag that match the
        given criteria and appear before this Tag in the document."""
        return self._fetch(name, attrs, text, limit,
                           self.previousSiblingGenerator)

    def findParent(self, name=None, attrs={}):
        """Returns the closest parent of this Tag that matches the given
        criteria, or Null if there is none."""
        r = Null
        l = self.fetchParents(name, attrs, 1)
        if l:
            r = l[0]
        return r
    firstParent = findParent

    def fetchParents(self, name=None, attrs={}, limit=None):
        """Returns a list of the parents of this Tag that match the
        given criteria."""
        return self._fetch(name, attrs, None, limit, self.parentGenerator)

    #These methods do the real heavy lifting.

    def _first(self, method, name, attrs, text):
        """Calls the given fetch method with a limit of 1 and returns
        the single result, or Null when nothing matched."""
        r = Null
        l = method(name, attrs, text, 1)
        if l:
            r = l[0]
        return r

    def _fetch(self, name, attrs, text, limit, generator):
        """Iterates over a generator looking for things that match.

        'generator' is a zero-argument callable returning the items to
        examine. A Tag is a hit when no 'text' criterion was given,
        its name matches 'name' (if any) and every entry of 'attrs'
        matches the corresponding attribute. Anything that is not a
        Tag is a hit when it matches 'text'. At most 'limit' hits are
        collected when 'limit' is true. Returns the list of hits."""
        if not hasattr(attrs, 'items'):
            #Anything that is not a map is shorthand for a criterion
            #on the 'class' attribute.
            attrs = {'class' : attrs}

        results = []
        for i in generator():
            found = None
            if isinstance(i, Tag):
                if not text and (not name or self._matches(i, name)):
                    found = i
                    for attr, matchAgainst in attrs.items():
                        if not self._matches(i.get(attr), matchAgainst):
                            found = None
                            break
            elif text and self._matches(i, text):
                found = i
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
        return results

    #Generators that can be used to navigate starting from both
    #NavigableTexts and Tags. Each one follows a single link until it
    #reaches a false value; note that this final false value (for
    #instance Null) is itself yielded as the last item.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    def _matches(self, chunk, howToMatch):
        """Returns a true value iff 'chunk' satisfies 'howToMatch'.

        'chunk' may be a Tag (matched by its name, except that a
        callable receives the Tag itself), a string, None (a missing
        attribute value) or a list, which matches when one of the
        NavigableTexts it contains matches. 'howToMatch' may be a
        callable, a regular expression object (its search() result is
        returned), a list or a map (membership test), or any other
        value, which is compared as a string."""
        #print 'looking for %s in %s' % (howToMatch, chunk)
        #
        # If given a list of items, return true if the list contains a
        # text element that matches.
        if isList(chunk) and not isinstance(chunk, Tag):
            for tag in chunk:
                if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
                    return True
            return False
        if callable(howToMatch):
            return howToMatch(chunk)
        if isinstance(chunk, Tag):
            #Custom match methods take the tag as an argument, but all other
            #ways of matching match the tag name as a string
            chunk = chunk.name
        if chunk is None:
            #A missing value must not be turned into the string 'None':
            #that would let regexps and strings match tags that lack
            #the attribute altogether. It only matches a criterion
            #that is itself None.
            return howToMatch is None
        #Make sure chunk is a string. isinstance (rather than an exact
        #type test) leaves NavigableUnicodeString objects alone, so
        #non-ASCII text is never pushed through str().
        if not isinstance(chunk, types.StringTypes):
            chunk = str(chunk)
        if hasattr(howToMatch, 'match'):
            # It's a regexp object.
            return howToMatch.search(chunk)
        if isList(howToMatch):
            return chunk in howToMatch
        if hasattr(howToMatch, 'items'):
            return howToMatch.has_key(chunk)
        #It's just a string, or something to compare by its string form.
        if not isinstance(howToMatch, types.StringTypes):
            howToMatch = str(howToMatch)
        return howToMatch == chunk
261
262
class NavigableText(PageElement):

    """Mixin giving a piece of document text the navigation members
    of PageElement."""

    def __getattr__(self, attr):
        "For backwards compatibility, text.string gives you text"
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'"
                                 % (self.__class__.__name__, attr))
        return self
270
        
271
class NavigableString(str, NavigableText):
    """A str that is also a NavigableText."""
273
274
class NavigableUnicodeString(unicode, NavigableText):
    """A unicode string that is also a NavigableText."""
276
277
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    'attrs' is a list of (key, value) pairs, kept in document order;
    'contents' is the list of child elements. A map view of the
    attributes is built on demand by _getAttrMap()."""

    def __init__(self, name, attrs=None, parent=Null, previous=Null):
        "Basic constructor."
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "'x in tag' tests whether x is one of the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag. Every existing (key, ...) pair in the attribute list is
        overwritten; if there is none, a new pair is appended."""
        self._getAttrMap()[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        #Bad HTML can define the same attribute multiple times, so
        #every occurrence has to go. The list is rebuilt in place:
        #calling remove() while looping over it skips the element that
        #follows each removal and leaves adjacent duplicates behind.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        fetch() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.fetch(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.fooTag and tag.foo both return the first 'foo' tag found
        within this tag (Null if there is none).

        Names starting with a double underscore that do not end in
        'Tag' raise AttributeError, so that Python's special-method
        lookups see a missing attribute instead of a bogus None."""
        if len(tag) > 3 and tag.endswith('Tag'):
            return self.first(tag[:-3])
        if not tag.startswith('__'):
            return self.first(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') \
               or not hasattr(other, 'contents') \
               or self.name != other.name or self.attrs != other.attrs \
               or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self):
        """Renders this tag as a string."""
        return str(self)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(1)

    def __str__(self, needUnicode=None, showStructureIndent=None):
        """Returns a string or Unicode representation of this tag and
        its contents.

        'needUnicode' true forces a Unicode result and False forces a
        plain string; None leaves the result as it comes out.
        'showStructureIndent' is None for a compact rendering, or the
        indentation level (a number of spaces) at which this tag is
        to be pretty-printed.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                attrs.append('%s="%s"' % (key, val))
        close = ''
        closeTag = ''
        if self.isSelfClosing():
            close = ' /'
        else:
            closeTag = '</%s>' % self.name
        indentIncrement = None
        space = ''
        if showStructureIndent is not None:
            #A hidden tag prints nothing of its own, so its children
            #stay at its own level.
            indentIncrement = showStructureIndent
            if not self.hidden:
                indentIncrement += 1
            #Computed for every non-None level, including 0, because
            #the closing tag below needs it at level 0 too.
            space = '\n%s' % (' ' * showStructureIndent)
        contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if showStructureIndent:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            s.append(contents)
            if closeTag and showStructureIndent is not None:
                s.append(space)
            s.append(closeTag)
            s = ''.join(s)
        isUnicode = type(s) == types.UnicodeType
        if needUnicode and not isUnicode:
            s = unicode(s)
        elif isUnicode and needUnicode==False:
            s = str(s)
        return s

    def prettify(self, needUnicode=None):
        "Renders this tag with one tag per line, indented by depth."
        return self.__str__(needUnicode, showStructureIndent=True)

    def renderContents(self, showStructureIndent=None, needUnicode=None):
        """Renders the contents of this tag as a (possibly Unicode)
        string. When 'showStructureIndent' is not None, a single
        trailing newline is stripped from each piece of text."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
                text = unicode(c)
            elif isinstance(c, Tag):
                s.append(c.__str__(needUnicode, showStructureIndent))
            elif needUnicode:
                text = unicode(c)
            else:
                text = str(c)
            if text:
                if showStructureIndent is not None:
                    if text[-1] == '\n':
                        text = text[:-1]
                s.append(text)
        return ''.join(s)

    #Soup methods

    def firstText(self, text, recursive=True):
        """Convenience method to retrieve the first piece of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.first(recursive=recursive, text=text)

    def fetchText(self, text, recursive=True, limit=None):
        """Convenience method to retrieve all pieces of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.fetch(recursive=recursive, text=text, limit=limit)

    def first(self, name=None, attrs={}, recursive=True, text=None):
        """Return only the first child of this
        Tag matching the given criteria, or Null if nothing matches."""
        r = Null
        l = self.fetch(name, attrs, recursive, text, 1)
        if l:
            r = l[0]
        return r
    findChild = first

    def fetch(self, name=None, attrs={}, recursive=True, text=None,
              limit=None):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With 'recursive' false only the direct children are examined;
        'limit', when true, caps the number of results."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._fetch(name, attrs, text, limit, generator)
    fetchChildren = fetch

    #Utility methods

    def isSelfClosing(self):
        """Returns true iff this is a self-closing tag as defined in the HTML
        standard.

        TODO: This is specific to BeautifulSoup and its subclasses, but it's
        used by __str__"""
        return self.name in BeautifulSoup.SELF_CLOSING_TAGS

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized, and returns it. An empty map is
        rebuilt from 'attrs' on every call."""
        #Look in the instance dictionary directly: a plain attribute
        #probe would be answered by __getattr__, which treats unknown
        #names as child tag names and would hand back a child tag
        #called 'attrMap' if the document had one.
        attrMap = self.__dict__.get('attrMap')
        if not attrMap:
            attrMap = {}
            for (key, value) in self.attrs:
                attrMap[key] = value
            self.attrMap = attrMap
        return attrMap

    #Generator methods
    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields every element below this tag, depth first, in
        document order."""
        #Each stack entry is (tag, index of the next child to visit).
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        #Descend into the child now and come back for
                        #the remaining siblings afterwards.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
539
540
541
def isList(l):
    """Tells whether or not something is listlike, in a way that
    works with all 2.x versions of Python: it is listlike if it has
    an __iter__ member or is exactly a list or a tuple."""
    if hasattr(l, '__iter__'):
        return True
    return type(l) in (types.ListType, types.TupleType)
546
547
def buildTagMap(default, *args):
    """Merges any number of maps, lists and scalars into one new map.

    Maps are copied entry by entry; every item of a list, and every
    scalar, becomes a key mapped to 'default'. Later arguments win
    over earlier ones. Used to build the SELF_CLOSING_TAGS and
    NESTABLE_TAGS maps out of lists and partial maps."""
    tagMap = {}
    for piece in args:
        if hasattr(piece, 'items'):
            #A map: merge it as it is.
            for key, value in piece.items():
                tagMap[key] = value
            continue
        if isList(piece):
            keys = piece
        else:
            #A scalar is handled as a list of one.
            keys = [piece]
        for key in keys:
            tagMap[key] = default
    return tagMap
565
566
class BeautifulStoneSoup(Tag, SGMLParser):
567
568
    """This class contains the basic parser and fetch code. It defines
569
    a parser that knows nothing about tag behavior except for the
570
    following:
571
   
572
      You can't close a tag without closing all the tags it encloses.
573
      That is, "<foo><bar></foo>" actually means
574
      "<foo><bar></bar></foo>".
575
576
    [Another possible explanation is "<foo><bar /></foo>", but since
577
    this class defines no SELF_CLOSING_TAGS, it will never use that
578
    explanation.]
579
580
    This class is useful for parsing XML or made-up markup languages,
581
    or when BeautifulSoup makes an assumption counter to what you were
582
    expecting."""
583
584
    SELF_CLOSING_TAGS = {}
585
    NESTABLE_TAGS = {}
586
    QUOTE_TAGS = {}
587
588
    #As a public service we will by default silently replace MS smart quotes
589
    #and similar characters with their HTML or ASCII equivalents.
590
    #Maps each Windows-1252 byte in the \x80-\x9f range to the text
    #that PARSER_MASSAGE wraps as '&' + value + ';'. Values must
    #therefore NOT carry a ';' of their own ('lt;' used to come out
    #as '&lt;;').
    #NOTE(review): values such as ' ', '%', '?', 'Z' and 'z' are not
    #entity names and still come out as '& ;', '&%;' and so on;
    #fixing that needs a change to the PARSER_MASSAGE substitution.
    MS_CHARS = { '\x80' : 'euro',
                 '\x81' : ' ',
                 '\x82' : 'sbquo',
                 '\x83' : 'fnof',
                 '\x84' : 'bdquo',
                 '\x85' : 'hellip',
                 '\x86' : 'dagger',
                 '\x87' : 'Dagger',
                 '\x88' : 'caret',
                 '\x89' : '%',
                 '\x8A' : 'Scaron',
                 '\x8B' : 'lt',
                 '\x8C' : 'OElig',
                 '\x8D' : '?',
                 '\x8E' : 'Z',
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : 'lsquo',
                 '\x92' : 'rsquo',
                 '\x93' : 'ldquo',
                 '\x94' : 'rdquo',
                 '\x95' : 'bull',
                 '\x96' : 'ndash',
                 '\x97' : 'mdash',
                 '\x98' : 'tilde',
                 '\x99' : 'trade',
                 '\x9a' : 'scaron',
                 '\x9b' : 'gt',
                 '\x9c' : 'oelig',
                 '\x9d' : '?',
                 '\x9e' : 'z',
                 '\x9f' : 'Yuml',}
622
623
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
624
                       lambda(x):x.group(1) + ' />'),
625
                      (re.compile('<!\s+([^<>]*)>'),
626
                       lambda(x):'<!' + x.group(1) + '>'),
627
                      (re.compile("([\x80-\x9f])", re.M),
628
                       lambda(x): '&' + BeautifulStoneSoup.MS_CHARS.get(x.group(1)) + ';')
629
                      ]
630
631
    ROOT_TAG_NAME = '[document]'
632
633
    def __init__(self, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        """Sets this object up as the 'root tag' of the document and
        feeds any initial text to the parser.

        'text' may be a string or a file-type object (anything with a
        read() method). If 'initialTextIsEverything' is true, done()
        is called once the text has been fed.

        NOTE about avoidParserProblems: sgmllib copes with most bad
        HTML, and BeautifulSoup has tricks for some HTML that kills
        sgmllib, but Beautiful Soup can still choke or lose data when
        self-closing tags or declarations are used incorrectly. By
        default the input is sanitized to avoid the vast majority of
        these problems. They are relatively rare, even in bad HTML,
        so pass in False if they don't apply to you and you'll get
        better performance; the default is on only to keep the number
        of tech support questions down.

        The two most common instances of invalid HTML that will choke
        sgmllib are fixed by the default parser massage techniques:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can instead pass in your own list of (RE object, replace
        method) tuples to have the input scrubbed the way you want."""
        Tag.__init__(self, self.ROOT_TAG_NAME)

        # Any true value that is not a list selects the stock fixes.
        useDefaultMassage = avoidParserProblems \
                            and not isList(avoidParserProblems)
        if useDefaultMassage:
            avoidParserProblems = self.PARSER_MASSAGE
        self.avoidParserProblems = avoidParserProblems

        SGMLParser.__init__(self)
        self.quoteStack = []
        self.hidden = 1
        self.reset()

        if hasattr(text, 'read'):
            # A file-type object: parse whatever it holds.
            text = text.read()
        if text:
            self.feed(text)
        if initialTextIsEverything:
            self.done()
675
676
    def __getattr__(self, methodName):
        """Routes the lookup of an unknown attribute by its name:
        'start_', 'end_' and 'do_' names go to the SGMLParser
        superclass, other names not starting with '__' go to the Tag
        superclass, and the rest raise AttributeError."""
        for prefix in ('start_', 'end_', 'do_'):
            if methodName.startswith(prefix):
                return SGMLParser.__getattr__(self, methodName)
        if methodName.startswith('__'):
            raise AttributeError
        return Tag.__getattr__(self, methodName)
686
687
    def feed(self, text):
        """Runs 'text' through every (RE object, replace method) pair
        in avoidParserProblems, if any, hands the result to
        SGMLParser.feed and then flushes the pending text data."""
        massage = self.avoidParserProblems
        if massage:
            for pattern, replacement in massage:
                text = pattern.sub(replacement, text)
        SGMLParser.feed(self, text)
        self.endData()
693
694
    def done(self):
        """Called when parsing is over: pops every tag that is still
        open, so that the unclosed tags are correctly processed and
        the root tag is the current tag again."""
        while True:
            if self.currentTag.name == self.ROOT_TAG_NAME:
                break
            self.popTag()
699
            
700
    def reset(self):
        """Resets the underlying SGMLParser and starts over with no
        pending text and a tag stack holding only this root object."""
        SGMLParser.reset(self)
        self.tagStack = []
        self.currentTag = None
        self.currentData = []
        self.pushTag(self)
706
    
707
    def popTag(self):
        """Removes the top of the tag stack and returns the new
        current tag (the new top of the stack, if anything is left)."""
        self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        closing = self.currentTag
        children = closing.contents
        if len(children) == 1 and isinstance(children[0], NavigableText):
            closing.string = children[0]

        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag
720
721
    def pushTag(self, tag):
        """Appends 'tag' to the current tag, if there is one, pushes
        it on the tag stack and makes it the current tag."""
        enclosing = self.currentTag
        if enclosing:
            enclosing.append(tag)
        self.tagStack.append(tag)
        self.currentTag = tag
727
728
    def endData(self):
        """Turns the text collected in currentData into one navigable
        string, appended to the current tag and linked after the
        previously created element, then empties currentData."""
        data = ''.join(self.currentData)
        if data:
            if not data.strip():
                # Whitespace only: collapse it to a single newline
                # or, if it contains no newline, a single space.
                if '\n' in data:
                    data = '\n'
                else:
                    data = ' '
            if type(data) == types.UnicodeType:
                textClass = NavigableUnicodeString
            else:
                textClass = NavigableString
            node = textClass(data)
            node.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = node
            self.previous = node
            self.currentTag.contents.append(node)
        self.currentData = []
746
747
    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the
        tag stack up to but *not* including the most recent instance
        of the given tag.

        The bottom entry of the stack is never considered. Returns
        the result of the last popTag() call, or None if nothing was
        popped."""
        if name == self.ROOT_TAG_NAME:
            return

        stack = self.tagStack
        depth = len(stack)
        popCount = 0
        index = depth - 1
        while index > 0:
            if stack[index].name == name:
                popCount = depth - index
                break
            index = index - 1
        if not inclusivePop:
            popCount = popCount - 1

        lastPopped = None
        while popCount > 0:
            lastPopped = self.popTag()
            popCount = popCount - 1
        return lastPopped
768
769
    def _smartPop(self, name):

        """Decides how far the tag stack must be popped before a new
        'name' tag is opened, and pops it.

        We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        triggers = self.NESTABLE_TAGS.get(name)
        nestable = triggers != None
        resetsNesting = self.RESET_NESTING_TAGS.has_key(name)
        target = None
        inclusive = True

        #Walk the stack from the innermost open tag outwards; the
        #bottom entry is never examined.
        index = len(self.tagStack) - 1
        while index > 0:
            openTag = self.tagStack[index]
            index = index - 1
            if not nestable and (not openTag or openTag.name == name):
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                target = name
                break
            if nestable:
                stopHere = openTag.name in triggers
            else:
                stopHere = resetsNesting \
                           and self.RESET_NESTING_TAGS.has_key(openTag.name)
            if stopHere:
                #We ran into one of the nesting reset triggers
                #peculiar to this tag, or into another tag that
                #causes nesting to reset: pop up to but not
                #including that tag.
                target = openTag.name
                inclusive = False
                break
        if target:
            self._popToTag(target, inclusive)
816
817
    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle a start tag.

        name is the tag name and attrs a sequence of (name, value)
        pairs. If selfClosing is true, or the name is listed in
        SELF_CLOSING_TAGS, the tag is popped again immediately after
        being pushed.

        While the quote stack is non-empty the tag is not treated as
        markup: it is turned back into text and passed to handle_data."""
        if self.quoteStack:
            #This is not a real tag.
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()
        isSelfClosing = selfClosing or name in self.SELF_CLOSING_TAGS
        if not isSelfClosing:
            self._smartPop(name)
        tag = Tag(name, attrs, self.currentTag, self.previous)
        # Link the new tag after the most recently created element.
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if isSelfClosing:
            self.popTag()
        if name in self.QUOTE_TAGS:
            # Everything up to the matching end tag is now treated as text.
            self.quoteStack.append(name)

    def unknown_endtag(self, name):
        """Handle an end tag.

        If the quote stack is non-empty and its top entry is not this
        tag name, the end tag is passed to handle_data as text.
        Otherwise pending data is flushed, the tag stack is popped
        through the most recent tag of this name, and the tag is
        removed from the top of the quote stack if it is there."""
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()

    def handle_data(self, data):
        "Append a piece of text to the pending data in self.currentData."
        self.currentData.append(data)

    def handle_pi(self, text):
        "Propagate processing instructions right through."
        self.handle_data("<?%s>" % text)

    def handle_comment(self, text):
        "Propagate comments right through."
        self.handle_data("<!--%s-->" % text)

    def handle_charref(self, ref):
        "Propagate char refs right through."
        self.handle_data('&#%s;' % ref)

    def handle_entityref(self, ref):
        "Propagate entity refs right through."
        self.handle_data('&%s;' % ref)

    def handle_decl(self, data):
        "Propagate DOCTYPEs and the like right through."
        self.handle_data('<!%s>' % data)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as regular data.

        i is the index into self.rawdata at which the declaration
        starts. Returns the index just past the text that was
        consumed."""
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA section: everything that is left
                # is treated as its content.
                k = len(self.rawdata)
            self.handle_data(self.rawdata[i+9:k])
            # Step over the ']]>' terminator. For an unterminated
            # section this index lies beyond the end of rawdata.
            return k+3
        try:
            return SGMLParser.parse_declaration(self, i)
        except SGMLParseError:
            # The base parser rejected the declaration, so hand the
            # rest of the input through as plain data.
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            return i + len(toHandle)

class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup before writing your own
    subclass."""

    #Tags that are popped again as soon as they have been pushed.
    SELF_CLOSING_TAGS = buildTagMap(None, ['br', 'hr', 'input', 'img', 'meta',
                                           'spacer', 'link', 'frame'])

    #Tags whose contents are treated as text until the matching end tag.
    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : ['tr', 'td'], #Not sure about this one.
                           'tr' : ['table'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    #Maps each nestable tag to its nesting reset triggers.
    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close (eg.) a 'b'
    tag than to actually use nested 'b' tags, and the BeautifulSoup
    class handles the common case. This class handles the
    not-so-common case: where you can't believe someone wrote what
    they did, but it's valid HTML and BeautifulSoup screwed up by
    assuming it wouldn't be.

    If this doesn't do what you need, try subclassing this class or
    BeautifulSoup, and providing your own list of NESTABLE_TAGS."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    #Everything BeautifulSoup treats as nestable, plus the tags above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)

class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Copy a lone string child of the tag being popped onto its
        parent as an attribute, then pop the tag as usual.

        An attribute of the same name that the parent already has is
        left alone. Returns the result of BeautifulStoneSoup.popTag so
        that callers which use popTag's return value (such as
        _popToTag) see the same value as with the base class."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            # Make sure parent.attrMap exists before it is consulted.
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableText) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        return BeautifulStoneSoup.popTag(self)

#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisitude,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Alias for BeautifulStoneSoup; adds no behavior of its own."""
class RobustHTMLParser(BeautifulSoup):
    """Alias for BeautifulSoup; adds no behavior of its own."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Alias for ICantBelieveItsBeautifulSoup; adds no behavior of its own."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Alias for BeautifulSOAP; adds no behavior of its own."""

###


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    # Read an HTML document from standard input and write the
    # prettified parse tree to standard output.
    import sys
    soup = BeautifulSoup(sys.stdin.read())
    # Parenthesised single-argument form prints the same text under
    # the Python 2 print statement.
    print(soup.prettify())