# Use of this source code is governed by the MIT license. __license__ = "MIT" from collections import defaultdict import itertools import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, Stylesheet, Script, TemplateString, nonwhitespace_re ) __all__ = [ 'HTMLTreeBuilder', 'SAXTreeBuilder', 'TreeBuilder', 'TreeBuilderRegistry', ] # Some useful features for a TreeBuilder to have. FAST = 'fast' PERMISSIVE = 'permissive' STRICT = 'strict' XML = 'xml' HTML = 'html' HTML_5 = 'html5' class TreeBuilderRegistry(object): """A way of looking up TreeBuilder subclasses by their name or by desired features. """ def __init__(self): self.builders_for_feature = defaultdict(list) self.builders = [] def register(self, treebuilder_class): """Register a treebuilder based on its advertised features. :param treebuilder_class: A subclass of Treebuilder. its .features attribute should list its features. """ for feature in treebuilder_class.features: self.builders_for_feature[feature].insert(0, treebuilder_class) self.builders.insert(0, treebuilder_class) def lookup(self, *features): """Look up a TreeBuilder subclass with the desired features. :param features: A list of features to look for. If none are provided, the most recently registered TreeBuilder subclass will be used. :return: A TreeBuilder subclass, or None if there's no registered subclass with all the requested features. """ if len(self.builders) == 0: # There are no builders at all. return None if len(features) == 0: # They didn't ask for any features. Give them the most # recently registered builder. return self.builders[0] # Go down the list of features in order, and eliminate any builders # that don't match every feature. features = list(features) features.reverse() candidates = None candidate_set = None while len(features) > 0: feature = features.pop() we_have_the_feature = self.builders_for_feature.get(feature, []) if len(we_have_the_feature) > 0: if candidates is None: candidates = we_have_the_feature candidate_set = set(candidates) else: # Eliminate any candidates that don't have this feature. candidate_set = candidate_set.intersection( set(we_have_the_feature)) # The only valid candidates are the ones in candidate_set. # Go through the original list of candidates and pick the first one # that's in candidate_set. if candidate_set is None: return None for candidate in candidates: if candidate in candidate_set: return candidate return None # The BeautifulSoup class will take feature lists from developers and use them # to look up builders in this registry. builder_registry = TreeBuilderRegistry() class TreeBuilder(object): """Turn a textual document into a Beautiful Soup object tree.""" NAME = "[Unknown tree builder]" ALTERNATE_NAMES = [] features = [] is_xml = False picklable = False empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. DEFAULT_CDATA_LIST_ATTRIBUTES = {} # Whitespace should be preserved inside these tags. DEFAULT_PRESERVE_WHITESPACE_TAGS = set() # The textual contents of tags with these names should be # instantiated with some class other than NavigableString. DEFAULT_STRING_CONTAINERS = {} USE_DEFAULT = object() # Most parsers don't keep track of line numbers. TRACKS_LINE_NUMBERS = False def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT, store_line_numbers=USE_DEFAULT, string_containers=USE_DEFAULT, ): """Constructor. :param multi_valued_attributes: If this is set to None, the TreeBuilder will not turn any values for attributes like 'class' into lists. Setting this to a dictionary will customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES for an example. Internally, these are called "CDATA list attributes", but that probably doesn't make sense to an end-user, so the argument name is `multi_valued_attributes`. :param preserve_whitespace_tags: A list of tags to treat the way
 tags are treated in HTML. Tags in this list
         are immune from pretty-printing; their contents will always be
         output as-is.

        :param string_containers: A dictionary mapping tag names to
        the classes that should be instantiated to contain the textual
        contents of those tags. The default is to use NavigableString
        for every tag, no matter what the name. You can override the
        default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't 
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes
        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags
        if store_line_numbers == self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers 
        if string_containers == self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers
        
    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup
        
    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a 

tag to be an empty-element tag (it's not in HTMLBuilder.empty_element_tags). This means an empty

tag will be presented as "

", not "

" or "

". The default implementation has no opinion about which tags are empty-element tags, so a tag will be presented as an empty-element tag if and only if it has no children. "" will become "", and "bar" will be left alone. :param tag_name: The name of a markup tag. """ if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags def feed(self, markup): """Run some incoming markup through some parsing process, populating the `BeautifulSoup` object in self.soup. This method is not implemented in TreeBuilder; it must be implemented in subclasses. :return: None. """ raise NotImplementedError() def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): """Run any preliminary steps necessary to make incoming markup acceptable to the parser. :param markup: Some markup -- probably a bytestring. :param user_specified_encoding: The user asked to try this encoding. :param document_declared_encoding: The markup itself claims to be in this encoding. :param exclude_encodings: The user asked _not_ to try any of these encodings. :yield: A series of 4-tuples: (markup, encoding, declared encoding, has undergone character replacement) Each 4-tuple represents a strategy for converting the document to Unicode and parsing it. Each strategy will be tried in turn. By default, the only strategy is to parse the markup as-is. See `LXMLTreeBuilderForXML` and `HTMLParserTreeBuilder` for implementations that take into account the quirks of particular parsers. """ yield markup, None, None, False def test_fragment_to_document(self, fragment): """Wrap an HTML fragment to make it look like a document. Different parsers do this differently. For instance, lxml introduces an empty tag, and html5lib doesn't. Abstracting this away lets us write simple tests which run HTML fragments through the parser and compare the results against other HTML fragments. This method should not be used outside of tests. :param fragment: A string -- fragment of HTML. :return: A string -- a full HTML document. """ return fragment def set_up_substitutions(self, tag): """Set up any substitutions that will need to be performed on a `Tag` when it's output as a string. By default, this does nothing. See `HTMLTreeBuilder` for a case where this is used. :param tag: A `Tag` :return: Whether or not a substitution was performed. """ return False def _replace_cdata_list_attribute_values(self, tag_name, attrs): """When an attribute value is associated with a tag that can have multiple values for that attribute, convert the string value to a list of strings. Basically, replaces class="foo bar" with class=["foo", "bar"] NOTE: This method modifies its input in place. :param tag_name: The name of a tag. :param attrs: A dictionary containing the tag's attributes. Any appropriate attribute values will be modified in place. """ if not attrs: return attrs if self.cdata_list_attributes: universal = self.cdata_list_attributes.get('*', []) tag_specific = self.cdata_list_attributes.get( tag_name.lower(), None) for attr in list(attrs.keys()): if attr in universal or (tag_specific and attr in tag_specific): # We have a "class"-type attribute whose string # value is a whitespace-separated list of # values. Split it into a list. value = attrs[attr] if isinstance(value, str): values = nonwhitespace_re.findall(value) else: # html5lib sometimes calls setAttributes twice # for the same tag when rearranging the parse # tree. On the second call the attribute value # here is already a list. If this happens, # leave the value alone rather than trying to # split it again. values = value attrs[attr] = values return attrs class SAXTreeBuilder(TreeBuilder): """A Beautiful Soup treebuilder that listens for SAX events. This is not currently used for anything, but it demonstrates how a simple TreeBuilder would work. """ def feed(self, markup): raise NotImplementedError() def close(self): pass def startElement(self, name, attrs): attrs = dict((key[1], value) for key, value in list(attrs.items())) #print "Start %s, %r" % (name, attrs) self.soup.handle_starttag(name, attrs) def endElement(self, name): #print "End %s" % name self.soup.handle_endtag(name) def startElementNS(self, nsTuple, nodeName, attrs): # Throw away (ns, nodeName) for now. self.startElement(nodeName, attrs) def endElementNS(self, nsTuple, nodeName): # Throw away (ns, nodeName) for now. self.endElement(nodeName) #handler.endElementNS((ns, node.nodeName), node.nodeName) def startPrefixMapping(self, prefix, nodeValue): # Ignore the prefix for now. pass def endPrefixMapping(self, prefix): # Ignore the prefix for now. # handler.endPrefixMapping(prefix) pass def characters(self, content): self.soup.handle_data(content) def startDocument(self): pass def endDocument(self): pass class HTMLTreeBuilder(TreeBuilder): """This TreeBuilder knows facts about HTML. Such as which tags are empty-element tags. """ empty_element_tags = set([ # These are from HTML5. 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', # These are from earlier versions of HTML and are removed in HTML5. 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' ]) # The HTML standard defines these as block-level elements. Beautiful # Soup does not treat these elements differently from other elements, # but it may do so eventually, and this information is available if # you need to use it. block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) # The HTML standard defines an unusual content model for these tags. # We represent this by using a string class other than NavigableString # inside these tags. # # I made this list by going through the HTML spec # (https://html.spec.whatwg.org/#metadata-content) and looking for # "metadata content" elements that can contain strings. # # TODO: Arguably