# -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. The tree traversal methods are the main advantage of using Beautiful Soup over just using a parser. Different parsers will build different Beautiful Soup trees given the same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ from pdb import set_trace import copy import pickle import re import warnings from bs4 import BeautifulSoup from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) from bs4.element import ( PY3K, CData, Comment, Declaration, Doctype, Formatter, NavigableString, Script, SoupStrainer, Stylesheet, Tag, TemplateString, ) from bs4.testing import ( SoupTest, skipIf, ) XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None) class TreeTest(SoupTest): def assertSelects(self, tags, should_match): """Make sure that the given tags have the correct text. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag.string for tag in tags], should_match) def assertSelectsIDs(self, tags, should_match): """Make sure that the given tags have the correct IDs. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag['id'] for tag in tags], should_match) class TestFind(TreeTest): """Basic tests of the find() method. find() just calls find_all() with limit=1, so it's not tested all that thouroughly here. """ def test_find_tag(self): soup = self.soup("1234") self.assertEqual(soup.find("b").string, "2") def test_unicode_text_find(self): soup = self.soup('
tag are empty-element, just because
# they have no contents.
self.assertEqual(b"
", xml_br.encode())
self.assertEqual(b"
Don't leave me here.
Don\'t leave!
""" soup = self.soup(doc) second_para = soup.find(id='2') bold = soup.b # Move the tag to the end of the second paragraph. soup.find(id='2').append(soup.b) # The tag is now a child of the second paragraph. self.assertEqual(bold.parent, second_para) self.assertEqual( soup.decode(), self.document_for( 'Don\'t leave me .
\n' 'Don\'t leave!here
')) def test_replace_with_returns_thing_that_was_replaced(self): text = "And now, a word:
And we're back.
") text = "p2
p3
" to_insert = self.soup(text) soup.insert(1, to_insert) for i in soup.descendants: assert not isinstance(i, BeautifulSoup) p1, p2, p3, p4 = list(soup.children) self.assertEqual("And now, a word:", p1.string) self.assertEqual("p2", p2.string) self.assertEqual("p3", p3.string) self.assertEqual("And we're back.", p4.string) def test_replace_with_maintains_next_element_throughout(self): soup = self.soup('onethree
') a = soup.a b = a.contents[0] # Make it so the tag has two text children. a.insert(1, "two") # Now replace each one with the empty string. left, right = a.contents left.replaceWith('') right.replaceWith('') # The tag is still connected to the tree. self.assertEqual("three", soup.b.string) def test_replace_final_node(self): soup = self.soup("Argh!") soup.find(text="Argh!").replace_with("Hooray!") new_text = soup.find(text="Hooray!") b = soup.b self.assertEqual(new_text.previous_element, b) self.assertEqual(new_text.parent, b) self.assertEqual(new_text.previous_element.next_element, new_text) self.assertEqual(new_text.next_element, None) def test_consecutive_text_nodes(self): # A builder should never create two consecutive text nodes, # but if you insert one next to another, Beautiful Soup will # handle it correctly. soup = self.soup("Argh!There's no business like show business
") no, show = soup.find_all('b') show.replace_with(no) self.assertEqual( soup.decode(), self.document_for( "There's business like no business
")) self.assertEqual(show.parent, None) self.assertEqual(no.parent, soup.p) self.assertEqual(no.next_element, "no") self.assertEqual(no.next_sibling, " business") def test_replace_first_child(self): data = "Unneeded formatting is unneeded
""") tree.em.unwrap() self.assertEqual(tree.em, None) self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") def test_wrap(self): soup = self.soup("I wish I was bold.") value = soup.string.wrap(soup.new_tag("b")) self.assertEqual(value.decode(), "I wish I was bold.") self.assertEqual( soup.decode(), self.document_for("I wish I was bold.")) def test_wrap_extracts_tag_from_elsewhere(self): soup = self.soup("I wish I was bold.") soup.b.next_sibling.wrap(soup.b) self.assertEqual( soup.decode(), self.document_for("I wish I was bold.")) def test_wrap_puts_new_contents_at_the_end(self): soup = self.soup("I like being bold.I wish I was bold.") soup.b.next_sibling.wrap(soup.b) self.assertEqual(2, len(soup.b.contents)) self.assertEqual( soup.decode(), self.document_for( "I like being bold.I wish I was bold.")) def test_extract(self): soup = self.soup( 'Some content. More content.') self.assertEqual(len(soup.body.contents), 3) extracted = soup.find(id="nav").extract() self.assertEqual( soup.decode(), "Some content. More content.") self.assertEqual(extracted.decode(), ' ') # The extracted tag is now an orphan. self.assertEqual(len(soup.body.contents), 2) self.assertEqual(extracted.parent, None) self.assertEqual(extracted.previous_element, None) self.assertEqual(extracted.next_element.next_element, None) # The gap where the extracted tag used to be has been mended. content_1 = soup.find(text="Some content. ") content_2 = soup.find(text=" More content.") self.assertEqual(content_1.next_element, content_2) self.assertEqual(content_1.next_sibling, content_2) self.assertEqual(content_2.previous_element, content_1) self.assertEqual(content_2.previous_sibling, content_1) def test_extract_distinguishes_between_identical_strings(self): soup = self.soup("foobar") foo_1 = soup.a.string bar_1 = soup.b.string foo_2 = soup.new_string("foo") bar_2 = soup.new_string("bar") soup.a.append(foo_2) soup.b.append(bar_2) # Now there are two identical strings in the tag, and two # in the tag. Let's remove the first "foo" and the second # "bar". foo_1.extract() bar_2.extract() self.assertEqual(foo_2, soup.a.string) self.assertEqual(bar_2, soup.b.string) def test_extract_multiples_of_same_tag(self): soup = self.soup(""" """) [soup.script.extract() for i in soup.find_all("script")] self.assertEqual("\n\n\n", str(soup.body)) def test_extract_works_when_element_is_surrounded_by_identical_strings(self): soup = self.soup( '\n' 'hi\n' '') soup.find('body').extract() self.assertEqual(None, soup.find('body')) def test_clear(self): """Tag.clear()""" soup = self.soup("String Italicized and another
") # clear using extract() a = soup.a soup.p.clear() self.assertEqual(len(soup.p.contents), 0) self.assertTrue(hasattr(a, "contents")) # clear using decompose() em = a.em a.clear(decompose=True) self.assertEqual(0, len(em.contents)) def test_decompose(self): # Test PageElement.decompose() and PageElement.decomposed soup = self.soup("Another para
") p1, p2 = soup.find_all('p') a = p1.a text = p1.em.string for i in [p1, p2, a, text]: self.assertEqual(False, i.decomposed) # This sets p1 and everything beneath it to decomposed. p1.decompose() for i in [p1, a, text]: self.assertEqual(True, i.decomposed) # p2 is unaffected. self.assertEqual(False, p2.decomposed) def test_string_set(self): """Tag.string = 'string'""" soup = self.soup("