Diffstat (limited to 'chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3')
56 files changed, 31112 insertions, 0 deletions
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/COPYING.txt b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/COPYING.txt new file mode 100644 index 00000000000..fb6ae69cdf5 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/COPYING.txt @@ -0,0 +1,27 @@ +Beautiful Soup is made available under the MIT license: + + Copyright (c) 2004-2017 Leonard Richardson + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + +Beautiful Soup incorporates code from the html5lib library, which is +also made available under the MIT license. Copyright (c) 2006-2013 +James Graham and other contributors diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/LICENSE b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/LICENSE new file mode 100644 index 00000000000..4c068bab272 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/LICENSE @@ -0,0 +1,30 @@ +Beautiful Soup is made available under the MIT license: + + Copyright (c) 2004-2019 Leonard Richardson + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + +Beautiful Soup incorporates code from the html5lib library, which is +also made available under the MIT license. Copyright (c) 2006-2013 +James Graham and other contributors + +Beautiful Soup depends on the soupsieve library, which is also made +available under the MIT license. 
Copyright (c) 2018 Isaac Muse diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/MANIFEST.in b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/MANIFEST.in new file mode 100644 index 00000000000..33821b975af --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/MANIFEST.in @@ -0,0 +1,10 @@ +include test-all-versions +include convert-py3k +include LICENSE +include *.txt +include doc*/Makefile +include doc*/source/*.py +include doc*/source/*.rst +include doc*/source/*.jpg +include scripts/*.py +include scripts/*.txt diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/NEWS.txt b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/NEWS.txt new file mode 100644 index 00000000000..625bb34c42b --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/NEWS.txt @@ -0,0 +1,1547 @@ += 4.9.3 (20201003) + +* Implemented a significant performance optimization to the process of + searching the parse tree. Patch by Morotti. [bug=1898212] + += 4.9.2 (20200926) + +* Fixed a bug that caused too many tags to be popped from the tag + stack during tree building, when encountering a closing tag that had + no matching opening tag. [bug=1880420] + +* Fixed a bug that inconsistently moved elements over when passing + a Tag, rather than a list, into Tag.extend(). [bug=1885710] + +* Specify the soupsieve dependency in a way that complies with + PEP 508. Patch by Mike Nerone. [bug=1893696] + +* Change the signatures for BeautifulSoup.insert_before and insert_after + (which are not implemented) to match PageElement.insert_before and + insert_after, quieting warnings in some IDEs. [bug=1897120] + += 4.9.1 (20200517) + +* Added a keyword argument 'on_duplicate_attribute' to the + BeautifulSoupHTMLParser constructor (used by the html.parser tree + builder) which lets you customize the handling of markup that + contains the same attribute more than once, as in: + <a href="url1" href="url2"> [bug=1878209] + +* Added a distinct subclass, GuessedAtParserWarning, for the warning + issued when BeautifulSoup is instantiated without a parser being + specified. [bug=1873787] + +* Added a distinct subclass, MarkupResemblesLocatorWarning, for the + warning issued when BeautifulSoup is instantiated with 'markup' that + actually seems to be a URL or the path to a file on + disk. [bug=1873787] + +* The new NavigableString subclasses (Stylesheet, Script, and + TemplateString) can now be imported directly from the bs4 package. + +* If you encode a document with a Python-specific encoding like + 'unicode_escape', that encoding is no longer mentioned in the final + XML or HTML document. Instead, encoding information is omitted or + left blank. [bug=1874955] + +* Fixed test failures when run against soupsieve 2.0. Patch by Tomáš + Chvátal. [bug=1872279] + += 4.9.0 (20200405) + +* Added PageElement.decomposed, a new property which lets you + check whether you've already called decompose() on a Tag or + NavigableString. + +* Embedded CSS and Javascript are now stored in distinct Stylesheet and + Script tags, which are ignored by methods like get_text() since most + people don't consider this sort of content to be 'text'. This + feature is not supported by the html5lib treebuilder. [bug=1868861] + (See the sketch below.) + +* Added a Russian translation by 'authoress' to the repository.
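A quick illustration of the Stylesheet/Script change described above, as a hedged sketch (assumes bs4 4.9.x with the html.parser builder; the markup is invented for the example):

    from bs4 import BeautifulSoup
    from bs4.element import Stylesheet

    markup = "<head><style>p { color: red; }</style></head><body><p>Hello</p></body>"
    soup = BeautifulSoup(markup, "html.parser")

    # The <style> content is parsed as a Stylesheet, a NavigableString
    # subclass that get_text() skips over.
    print(isinstance(soup.style.string, Stylesheet))  # True
    print(soup.get_text())                            # "Hello" -- no CSS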
+ +* Fixed an unhandled exception when formatting a Tag that had been + decomposed. [bug=1857767] + +* Fixed a bug that happened when passing a Unicode filename containing + non-ASCII characters as markup into Beautiful Soup, on a system that + allows Unicode filenames. [bug=1866717] + +* Added a performance optimization to PageElement.extract(). Patch by + Arthur Darcet. + += 4.8.2 (20191224) + +* Added Python docstrings to all public methods of the most commonly + used classes. + +* Added a Chinese translation by Deron Wang and a Brazilian Portuguese + translation by Cezar Peixeiro to the repository. + +* Fixed two deprecation warnings. Patches by Colin + Watson and Nicholas Neumann. [bug=1847592] [bug=1855301] + +* The html.parser tree builder now correctly handles DOCTYPEs that are + not uppercase. [bug=1848401] + +* PageElement.select() now returns a ResultSet rather than a regular + list, making it consistent with methods like find_all(). + += 4.8.1 (20191006) + +* When the html.parser or html5lib parsers are in use, Beautiful Soup + will, by default, record the position in the original document where + each tag was encountered. This includes line number (Tag.sourceline) + and position within a line (Tag.sourcepos). Based on code by Chris + Mayo. [bug=1742921] + +* When instantiating a BeautifulSoup object, it's now possible to + provide a dictionary ('element_classes') of the classes you'd like to be + instantiated instead of Tag, NavigableString, etc. + +* Fixed the definition of the default XML namespace when using + lxml 4.4. Patch by Isaac Muse. [bug=1840141] + +* Fixed a crash when pretty-printing tags that were not created + during initial parsing. [bug=1838903] + +* Copying a Tag preserves information that was originally obtained from + the TreeBuilder used to build the original Tag. [bug=1838903] + +* Raise an explanatory exception when the underlying parser + completely rejects the incoming markup. [bug=1838877] + +* Avoid a crash when trying to detect the declared encoding of a + Unicode document. [bug=1838877] + +* Avoid a crash when unpickling certain parse trees generated + using html5lib on Python 3. [bug=1843545] + += 4.8.0 (20190720, "One Small Soup") + +This release focuses on making it easier to customize Beautiful Soup's +input mechanism (the TreeBuilder) and output mechanism (the Formatter). + +* You can customize the TreeBuilder object by passing keyword + arguments into the BeautifulSoup constructor. Those keyword + arguments will be passed along into the TreeBuilder constructor. + + The main reason to do this right now is to change which + attributes are treated as multi-valued attributes (the way 'class' + is treated by default). You can do this with the + 'multi_valued_attributes' argument. [bug=1832978] + +* The role of Formatter objects has been greatly expanded. The Formatter + class now controls the following: + + - The function to call to perform entity substitution. (This was + previously Formatter's only job.) + - Which tags should be treated as containing CDATA and have their + contents exempt from entity substitution. + - The order in which a tag's attributes are output. [bug=1812422] + - Whether or not to put a '/' inside a void element, e.g. '<br/>' vs '<br>' + + All preexisting code should work as before. + +* Added a new method to the API, Tag.smooth(), which consolidates + multiple adjacent NavigableString elements.
[bug=1697296] + +* &apos; (which is valid in XML, XHTML, and HTML 5, but not HTML 4) is always + recognized as a named entity and converted to a single quote. [bug=1818721] + += 4.7.1 (20190106) + +* Fixed a significant performance problem introduced in 4.7.0. [bug=1810617] + +* Fixed an incorrectly raised exception when inserting a tag before or + after an identical tag. [bug=1810692] + +* Beautiful Soup will no longer try to keep track of namespaces that + are not defined with a prefix; this can confuse soupsieve. [bug=1810680] + +* Tried even harder to avoid the deprecation warning originally fixed in + 4.6.1. [bug=1778909] + += 4.7.0 (20181231) + +* Beautiful Soup's CSS Selector implementation has been replaced by a + dependency on Isaac Muse's SoupSieve project (the soupsieve package + on PyPI). The good news is that SoupSieve has a much more robust and + complete implementation of CSS selectors, resolving a large number + of longstanding issues. The bad news is that from this point onward, + SoupSieve must be installed if you want to use the select() method. + + You don't have to change anything if you installed Beautiful Soup + through pip (SoupSieve will be automatically installed when you + upgrade Beautiful Soup) or if you don't use CSS selectors from + within Beautiful Soup. + + SoupSieve documentation: https://facelessuser.github.io/soupsieve/ + +* Added the PageElement.extend() method, which works like list.append(). + [bug=1514970] + +* PageElement.insert_before() and insert_after() now take a variable + number of arguments. [bug=1514970] + +* Fix a number of problems with the tree builder that caused + trees that were superficially okay, but which fell apart when bits + were extracted. Patch by Isaac Muse. [bug=1782928,1809910] + +* Fixed a problem with the tree builder in which elements that + contained no content (such as empty comments and all-whitespace + elements) were not being treated as part of the tree. Patch by Isaac + Muse. [bug=1798699] + +* Fixed a problem with multi-valued attributes where the value + contained whitespace. Thanks to Jens Svalgaard for the + fix. [bug=1787453] + +* Clarified ambiguous license statements in the source code. Beautiful + Soup is released under the MIT license, and has been since 4.4.0. + +* This file has been renamed from NEWS.txt to CHANGELOG. + += 4.6.3 (20180812) + +* Exactly the same as 4.6.2. Re-released to make the README file + render properly on PyPI. + += 4.6.2 (20180812) + +* Fix an exception when a custom formatter was asked to format a void + element. [bug=1784408] + += 4.6.1 (20180728) + +* Stop data loss when encountering an empty numeric entity, and + possibly in other cases. Thanks to tos.kamiya for the fix. [bug=1698503] + +* Preserve XML namespaces introduced inside an XML document, not just + the ones introduced at the top level. [bug=1718787] + +* Added a new formatter, "html5", which represents void elements + as "<element>" rather than "<element/>". [bug=1716272] + +* Fixed a problem where the html.parser tree builder interpreted + a string like "&foo " as the character entity "&foo;" [bug=1728706] + +* Correctly handle invalid HTML numeric character entities like &#147; + which reference code points that are not Unicode code points. Note + that this is only fixed when Beautiful Soup is used with the + html.parser parser -- html5lib already worked and I couldn't fix it + with lxml. [bug=1782933] + +* Improved the warning given when no parser is specified.
[bug=1780571] + +* When markup contains duplicate elements, a select() call that + includes multiple match clauses will match all relevant + elements. [bug=1770596] + +* Fixed code that was causing deprecation warnings in recent Python 3 + versions. Includes a patch from Ville Skyttä. [bug=1778909] [bug=1689496] + +* Fixed a Windows crash in diagnose() when checking whether a long + markup string is a filename. [bug=1737121] + +* Stopped HTMLParser from raising an exception in very rare cases of + bad markup. [bug=1708831] + +* Fixed a bug where find_all() was not working when asked to find a + tag with a namespaced name in an XML document that was parsed as + HTML. [bug=1723783] + +* You can get finer control over formatting by subclassing + bs4.element.Formatter and passing a Formatter instance into (e.g.) + encode(). [bug=1716272] + +* You can pass a dictionary of `attrs` into + BeautifulSoup.new_tag. This makes it possible to create a tag with + an attribute like 'name' that would otherwise be masked by another + argument of new_tag. [bug=1779276] + +* Clarified the deprecation warning when accessing tag.fooTag, to cover + the possibility that you might really have been looking for a tag + called 'fooTag'. + += 4.6.0 (20170507) = + +* Added the `Tag.get_attribute_list` method, which acts like `Tag.get` for + getting the value of an attribute, but which always returns a list, + whether or not the attribute is a multi-value attribute. [bug=1678589] + +* It's now possible to use a tag's namespace prefix when searching, + e.g. soup.find('namespace:tag') [bug=1655332] + +* Improved the handling of empty-element tags like <br> when using the + html.parser parser. [bug=1676935] + +* HTML parsers treat all HTML4 and HTML5 empty element tags (aka void + element tags) correctly. [bug=1656909] + +* Namespace prefix is preserved when an XML tag is copied. Thanks + to Vikas for a patch and test. [bug=1685172] + += 4.5.3 (20170102) = + +* Fixed foster parenting when html5lib is the tree builder. Thanks to + Geoffrey Sneddon for a patch and test. + +* Fixed yet another problem that caused the html5lib tree builder to + create a disconnected parse tree. [bug=1629825] + += 4.5.2 (20170102) = + +* Apart from the version number, this release is identical to + 4.5.3. Due to user error, it could not be completely uploaded to + PyPI. Use 4.5.3 instead. + += 4.5.1 (20160802) = + +* Fixed a crash when passing Unicode markup that contained a + processing instruction into the lxml HTML parser on Python + 3. [bug=1608048] + += 4.5.0 (20160719) = + +* Beautiful Soup is no longer compatible with Python 2.6. This + actually happened a few releases ago, but it's now official. + +* Beautiful Soup will now work with versions of html5lib greater than + 0.99999999. [bug=1603299] + +* If a search against each individual value of a multi-valued + attribute fails, the search will be run one final time against the + complete attribute value considered as a single string. That is, if + a tag has class="foo bar" and neither "foo" nor "bar" matches, but + "foo bar" does, the tag is now considered a match. + + This happened in previous versions, but only when the value being + searched for was a string. Now it also works when that value is + a regular expression, a list of strings, etc. [bug=1476868] + +* Fixed a bug that deranged the tree when a whitespace element was + reparented into a tag that contained an identical whitespace + element. 
[bug=1505351] + +* Added support for CSS selector values that contain quoted spaces, + such as tag[style="display: foo"]. [bug=1540588] + +* Corrected handling of XML processing instructions. [bug=1504393] + +* Corrected an encoding error that happened when a BeautifulSoup + object was copied. [bug=1554439] + +* The contents of <textarea> tags will no longer be modified when the + tree is prettified. [bug=1555829] + +* When a BeautifulSoup object is pickled but its tree builder cannot + be pickled, its .builder attribute is set to None instead of being + destroyed. This avoids a performance problem once the object is + unpickled. [bug=1523629] + +* Specify the file and line number when warning about a + BeautifulSoup object being instantiated without a parser being + specified. [bug=1574647] + +* The `limit` argument to `select()` now works correctly, though it's + not implemented very efficiently. [bug=1520530] + +* Fixed a Python 3 ByteWarning when a URL was passed in as though it + were markup. Thanks to James Salter for a patch and + test. [bug=1533762] + +* We don't run the check for a filename passed in as markup if the + 'filename' contains a less-than character; the less-than character + indicates it's most likely a very small document. [bug=1577864] + += 4.4.1 (20150928) = + +* Fixed a bug that deranged the tree when part of it was + removed. Thanks to Eric Weiser for the patch and John Wiseman for a + test. [bug=1481520] + +* Fixed a parse bug with the html5lib tree-builder. Thanks to Roel + Kramer for the patch. [bug=1483781] + +* Improved the implementation of CSS selector grouping. Thanks to + Orangain for the patch. [bug=1484543] + +* Fixed the test_detect_utf8 test so that it works when chardet is + installed. [bug=1471359] + +* Corrected the output of Declaration objects. [bug=1477847] + + += 4.4.0 (20150703) = + +Especially important changes: + +* Added a warning when you instantiate a BeautifulSoup object without + explicitly naming a parser. [bug=1398866] + +* __repr__ now returns an ASCII bytestring in Python 2, and a Unicode + string in Python 3, instead of a UTF8-encoded bytestring in both + versions. In Python 3, __str__ now returns a Unicode string instead + of a bytestring. [bug=1420131] + +* The `text` argument to the find_* methods is now called `string`, + which is more accurate. `text` still works, but `string` is the + argument described in the documentation. `text` may eventually + change its meaning, but not for a very long time. [bug=1366856] + +* Changed the way soup objects work under copy.copy(). Copying a + NavigableString or a Tag will give you a new NavigableString that's + equal to the old one but not connected to the parse tree. Patch by + Martijn Peters. [bug=1307490] + +* Started using a standard MIT license. [bug=1294662] + +* Added a Chinese translation of the documentation by Delong .w. + +New features: + +* Introduced the select_one() method, which uses a CSS selector but + only returns the first match, instead of a list of + matches. [bug=1349367] + +* You can now create a Tag object without specifying a + TreeBuilder. Patch by Martijn Pieters. [bug=1307471] + +* You can now create a NavigableString or a subclass just by invoking + the constructor. [bug=1294315] + +* Added an `exclude_encodings` argument to UnicodeDammit and to the + Beautiful Soup constructor, which lets you prohibit the detection of + an encoding that you know is wrong. [bug=1469408] + +* The select() method now supports selector grouping. 
Patch by + Francisco Canas [bug=1191917] + +Bug fixes: + +* Fixed yet another problem that caused the html5lib tree builder to + create a disconnected parse tree. [bug=1237763] + +* Force object_was_parsed() to keep the tree intact even when an element + from later in the document is moved into place. [bug=1430633] + +* Fixed yet another bug that caused a disconnected tree when html5lib + copied an element from one part of the tree to another. [bug=1270611] + +* Fixed a bug where Element.extract() could create an infinite loop in + the remaining tree. + +* The select() method can now find tags whose names contain + dashes. Patch by Francisco Canas. [bug=1276211] + +* The select() method can now find tags with attributes whose names + contain dashes. Patch by Marek Kapolka. [bug=1304007] + +* Improved the lxml tree builder's handling of processing + instructions. [bug=1294645] + +* Restored the helpful syntax error that happens when you try to + import the Python 2 edition of Beautiful Soup under Python + 3. [bug=1213387] + +* In Python 3.4 and above, set the new convert_charrefs argument to + the html.parser constructor to avoid a warning and future + failures. Patch by Stefano Revera. [bug=1375721] + +* The warning when you pass in a filename or URL as markup will now be + displayed correctly even if the filename or URL is a Unicode + string. [bug=1268888] + +* If the initial <html> tag contains a CDATA list attribute such as + 'class', the html5lib tree builder will now turn its value into a + list, as it would with any other tag. [bug=1296481] + +* Fixed an import error in Python 3.5 caused by the removal of the + HTMLParseError class. [bug=1420063] + +* Improved docstring for encode_contents() and + decode_contents(). [bug=1441543] + +* Fixed a crash in Unicode, Dammit's encoding detector when the name + of the encoding itself contained invalid bytes. [bug=1360913] + +* Improved the exception raised when you call .unwrap() or + .replace_with() on an element that's not attached to a tree. + +* Raise a NotImplementedError whenever an unsupported CSS pseudoclass + is used in select(). Previously some cases did not result in a + NotImplementedError. + +* It's now possible to pickle a BeautifulSoup object no matter which + tree builder was used to create it. However, the only tree builder + that survives the pickling process is the HTMLParserTreeBuilder + ('html.parser'). If you unpickle a BeautifulSoup object created with + some other tree builder, soup.builder will be None. [bug=1231545] + += 4.3.2 (20131002) = + +* Fixed a bug in which short Unicode input was improperly encoded to + ASCII when checking whether or not it was the name of a file on + disk. [bug=1227016] + +* Fixed a crash when a short input contains data not valid in + filenames. [bug=1232604] + +* Fixed a bug that caused Unicode data put into UnicodeDammit to + return None instead of the original data. [bug=1214983] + +* Combined two tests to stop a spurious test failure when tests are + run by nosetests. [bug=1212445] + += 4.3.1 (20130815) = + +* Fixed yet another problem with the html5lib tree builder, caused by + html5lib's tendency to rearrange the tree during + parsing. [bug=1189267] + +* Fixed a bug that caused the optimized version of find_all() to + return nothing. 
[bug=1212655] + += 4.3.0 (20130812) = + +* Instead of converting incoming data to Unicode and feeding it to the + lxml tree builder in chunks, Beautiful Soup now makes successive + guesses at the encoding of the incoming data, and tells lxml to + parse the data as that encoding. Giving lxml more control over the + parsing process improves performance and avoids a number of bugs and + issues with the lxml parser which had previously required elaborate + workarounds: + + - An issue in which lxml refuses to parse Unicode strings on some + systems. [bug=1180527] + + - A recurring bug that truncated documents longer than a (very + small) size. [bug=963880] + + - A recurring bug in which extra spaces were added to a document if + the document defined a charset other than UTF-8. [bug=972466] + + This required a major overhaul of the tree builder architecture. If + you wrote your own tree builder and didn't tell me, you'll need to + modify your prepare_markup() method. + +* The UnicodeDammit code that makes guesses at encodings has been + split into its own class, EncodingDetector. A lot of apparently + redundant code has been removed from Unicode, Dammit, and some + undocumented features have also been removed. + +* Beautiful Soup will issue a warning if instead of markup you pass it + a URL or the name of a file on disk (a common beginner's mistake). + +* A number of optimizations improve the performance of the lxml tree + builder by about 33%, the html.parser tree builder by about 20%, and + the html5lib tree builder by about 15%. + +* All find_all calls should now return a ResultSet object. Patch by + Aaron DeVore. [bug=1194034] + += 4.2.1 (20130531) = + +* The default XML formatter will now replace ampersands even if they + appear to be part of entities. That is, "&lt;" will become + "&amp;lt;". The old code was left over from Beautiful Soup 3, which + didn't always turn entities into Unicode characters. + + If you really want the old behavior (maybe because you add new + strings to the tree, those strings include entities, and you want + the formatter to leave them alone on output), it can be found in + EntitySubstitution.substitute_xml_containing_entities(). [bug=1182183] + +* Gave new_string() the ability to create subclasses of + NavigableString. [bug=1181986] + +* Fixed another bug by which the html5lib tree builder could create a + disconnected tree. [bug=1182089] + +* The .previous_element of a BeautifulSoup object is now always None, + not the last element to be parsed. [bug=1182089] + +* Fixed test failures when lxml is not installed. [bug=1181589] + +* html5lib now supports Python 3. Fixed some Python 2-specific + code in the html5lib test suite. [bug=1181624] + +* The html.parser treebuilder can now handle numeric attributes in + text when the hexadecimal name of the attribute starts with a + capital X. Patch by Tim Shirley. [bug=1186242] + += 4.2.0 (20130514) = + +* The Tag.select() method now supports a much wider variety of CSS + selectors. + + - Added support for the adjacent sibling combinator (+) and the + general sibling combinator (~). Tests by "liquider". [bug=1082144] + + - The combinators (>, +, and ~) can now combine with any supported + selector, not just one that selects based on tag name. + + - Added limited support for the "nth-of-type" pseudo-class. Code + by Sven Slootweg.
[bug=1109952] + +* The BeautifulSoup class is now aliased to "_s" and "_soup", making + it quicker to type the import statement in an interactive session: + + from bs4 import _s + or + from bs4 import _soup + + The alias may change in the future, so don't use this in code you're + going to run more than once. + +* Added the 'diagnose' submodule, which includes several useful + functions for reporting problems and doing tech support. + + - diagnose(data) tries the given markup on every installed parser, + reporting exceptions and displaying successes. If a parser is not + installed, diagnose() mentions this fact. + + - lxml_trace(data, html=True) runs the given markup through lxml's + XML parser or HTML parser, and prints out the parser events as + they happen. This helps you quickly determine whether a given + problem occurs in lxml code or Beautiful Soup code. + + - htmlparser_trace(data) is the same thing, but for Python's + built-in HTMLParser class. + +* In an HTML document, the contents of a <script> or <style> tag will + no longer undergo entity substitution by default. XML documents work + the same way they did before. [bug=1085953] + +* Methods like get_text() and properties like .strings now only give + you strings that are visible in the document--no comments or + processing commands. [bug=1050164] + +* The prettify() method now leaves the contents of <pre> tags + alone. [bug=1095654] + +* Fix a bug in the html5lib treebuilder which sometimes created + disconnected trees. [bug=1039527] + +* Fix a bug in the lxml treebuilder which crashed when a tag included + an attribute from the predefined "xml:" namespace. [bug=1065617] + +* Fix a bug by which keyword arguments to find_parent() were not + being passed on. [bug=1126734] + +* Stop a crash when unwisely messing with a tag that's been + decomposed. [bug=1097699] + +* Now that lxml's segfault on invalid doctype has been fixed, fixed a + corresponding problem on the Beautiful Soup end that was previously + invisible. [bug=984936] + +* Fixed an exception when an overspecified CSS selector didn't match + anything. Code by Stefaan Lippens. [bug=1168167] + += 4.1.3 (20120820) = + +* Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious + test failure caused by the lousy HTMLParser in those + versions. [bug=1038503] + +* Raise a more specific error (FeatureNotFound) when a requested + parser or parser feature is not installed. Raise NotImplementedError + instead of ValueError when the user calls insert_before() or + insert_after() on the BeautifulSoup object itself. Patch by Aaron + DeVore. [bug=1038301] + += 4.1.2 (20120817) = + +* As per PEP-8, allow searching by CSS class using the 'class_' + keyword argument. [bug=1037624] + +* Display namespace prefixes for namespaced attribute names, instead of + the fully-qualified names given by the lxml parser. [bug=1037597] + +* Fixed a crash on encoding when an attribute name contained + non-ASCII characters. + +* When sniffing encodings, if the cchardet library is installed, + Beautiful Soup uses it instead of chardet. cchardet is much + faster. [bug=1020748] + +* Use logging.warning() instead of warnings.warn() to notify the user + that characters were replaced with REPLACEMENT + CHARACTER. [bug=1013862] + += 4.1.1 (20120703) = + +* Fixed an html5lib tree builder crash which happened when html5lib + moved a tag with a multivalued attribute from one part of the tree + to another. [bug=1019603] + +* Correctly display closing tags with an XML namespace declared.
Patch + by Andreas Kostyrka. [bug=1019635] + +* Fixed a typo that made parsing significantly slower than it should + have been, and also waited too long to close tags with XML + namespaces. [bug=1020268] + +* get_text() now returns an empty Unicode string if there is no text, + rather than an empty bytestring. [bug=1020387] + += 4.1.0 (20120529) = + +* Added experimental support for fixing Windows-1252 characters + embedded in UTF-8 documents. (UnicodeDammit.detwingle()) + +* Fixed the handling of &quot; with the built-in parser. [bug=993871] + +* Comments, processing instructions, document type declarations, and + markup declarations are now treated as preformatted strings, the way + CData blocks are. [bug=1001025] + +* Fixed a bug with the lxml treebuilder that prevented the user from + adding attributes to a tag that didn't originally have + attributes. [bug=1002378] Thanks to Oliver Beattie for the patch. + +* Fixed some edge-case bugs having to do with inserting an element + into a tag it's already inside, and replacing one of a tag's + children with another. [bug=997529] + +* Added the ability to search for attribute values specified in UTF-8. [bug=1003974] + + This caused a major refactoring of the search code. All the tests + pass, but it's possible that some searches will behave differently. + += 4.0.5 (20120427) = + +* Added a new method, wrap(), which wraps an element in a tag. + +* Renamed replace_with_children() to unwrap(), which is easier to + understand and also the jQuery name of the function. + +* Made encoding substitution in <meta> tags completely transparent (no + more %SOUP-ENCODING%). + +* Fixed a bug in decoding data that contained a byte-order mark, such + as data encoded in UTF-16LE. [bug=988980] + +* Fixed a bug that made the HTMLParser treebuilder generate XML + definitions ending with two question marks instead of + one. [bug=984258] + +* Upon document generation, CData objects are no longer run through + the formatter. [bug=988905] + +* The test suite now passes when lxml is not installed, whether or not + html5lib is installed. [bug=987004] + +* Print a warning on HTMLParseErrors to let people know they should + install a better parser library. + += 4.0.4 (20120416) = + +* Fixed a bug that sometimes created disconnected trees. + +* Fixed a bug with the string setter that moved a string around the + tree instead of copying it. [bug=983050] + +* Attribute values are now run through the provided output formatter. + Previously they were always run through the 'minimal' formatter. In + the future I may make it possible to specify different formatters + for attribute values and strings, but for now, consistent behavior + is better than inconsistent behavior. [bug=980237] + +* Added the missing renderContents method from Beautiful Soup 3. Also + added an encode_contents() method to go along with decode_contents(). + +* Give a more useful error when the user tries to run the Python 2 + version of BS under Python 3. + +* UnicodeDammit can now convert Microsoft smart quotes to ASCII with + UnicodeDammit(markup, smart_quotes_to="ascii"). + += 4.0.3 (20120403) = + +* Fixed a typo that caused some versions of Python 3 to convert the + Beautiful Soup codebase incorrectly. + +* Got rid of the 4.0.2 workaround for HTML documents--it was + unnecessary and the workaround was triggering a (possibly different, + but related) bug in lxml. [bug=972466] + += 4.0.2 (20120326) = + +* Worked around a possible bug in lxml that prevents non-tiny XML + documents from being parsed.
[bug=963880, bug=963936] + +* Fixed a bug where specifying `text` while also searching for a tag + only worked if `text` wanted an exact string match. [bug=955942] + += 4.0.1 (20120314) = + +* This is the first official release of Beautiful Soup 4. There is no + 4.0.0 release, to eliminate any possibility that packaging software + might treat "4.0.0" as being an earlier version than "4.0.0b10". + +* Brought BS up to date with the latest release of soupselect, adding + CSS selector support for direct descendant matches and multiple CSS + class matches. + += 4.0.0b10 (20120302) = + +* Added support for simple CSS selectors, taken from the soupselect project. + +* Fixed a crash when using html5lib. [bug=943246] + +* In HTML5-style <meta charset="foo"> tags, the value of the "charset" + attribute is now replaced with the appropriate encoding on + output. [bug=942714] + +* Fixed a bug that caused calling a tag to sometimes call find_all() + with the wrong arguments. [bug=944426] + +* For backwards compatibility, brought back the BeautifulStoneSoup + class as a deprecated wrapper around BeautifulSoup. + += 4.0.0b9 (20120228) = + +* Fixed the string representation of DOCTYPEs that have both a public + ID and a system ID. + +* Fixed the generated XML declaration. + +* Renamed Tag.nsprefix to Tag.prefix, for consistency with + NamespacedAttribute. + +* Fixed a test failure that occurred on Python 3.x when chardet was + installed. + +* Made prettify() return Unicode by default, so it will look nice on + Python 3 when passed into print(). + += 4.0.0b8 (20120224) = + +* All tree builders now preserve namespace information in the + documents they parse. If you use the html5lib parser or lxml's XML + parser, you can access the namespace URL for a tag as tag.namespace. + + However, there is no special support for namespace-oriented + searching or tree manipulation. When you search the tree, you need + to use namespace prefixes exactly as they're used in the original + document. + +* The string representation of a DOCTYPE always ends in a newline. + +* Issue a warning if the user tries to use a SoupStrainer in + conjunction with the html5lib tree builder, which doesn't support + them. + += 4.0.0b7 (20120223) = + +* Upon decoding to string, any characters that can't be represented in + your chosen encoding will be converted into numeric XML entity + references. + +* Issue a warning if characters were replaced with REPLACEMENT + CHARACTER during Unicode conversion. + +* Restored compatibility with Python 2.6. + +* The install process no longer installs docs or auxiliary text files. + +* It's now possible to deepcopy a BeautifulSoup object created with + Python's built-in HTML parser. + +* About 100 unit tests that "test" the behavior of various parsers on + invalid markup have been removed. Legitimate changes to those + parsers caused these tests to fail, indicating that perhaps + Beautiful Soup should not test the behavior of foreign + libraries. + + The problematic unit tests have been reformulated as informational + comparisons generated by the script + scripts/demonstrate_parser_differences.py. + + This makes Beautiful Soup compatible with html5lib version 0.95 and + future versions of HTMLParser. + += 4.0.0b6 (20120216) = + +* Multi-valued attributes like "class" always have a list of values, + even if there's only one value in the list. + +* Added a number of multi-valued attributes defined in HTML5. + +* Stopped generating a space before the slash that closes an + empty-element tag. 
This may come back if I add a special XHTML mode + (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty + useless. + +* Passing text along with tag-specific arguments to a find* method: + + find("a", text="Click here") + + will find tags that contain the given text as their + .string. Previously, the tag-specific arguments were ignored and + only strings were searched. + +* Fixed a bug that caused the html5lib tree builder to build a + partially disconnected tree. Generally cleaned up the html5lib tree + builder. + +* If you restrict a multi-valued attribute like "class" to a string + that contains spaces, Beautiful Soup will only consider it a match + if the values correspond to that specific string. + += 4.0.0b5 (20120209) = + +* Rationalized Beautiful Soup's treatment of CSS class. A tag + belonging to multiple CSS classes is treated as having a list of + values for the 'class' attribute. Searching for a CSS class will + match *any* of the CSS classes. + + This actually affects all attributes that the HTML standard defines + as taking multiple values (class, rel, rev, archive, accept-charset, + and headers), but 'class' is by far the most common. [bug=41034] + +* If you pass anything other than a dictionary as the second argument + to one of the find* methods, it'll assume you want to use that + object to search against a tag's CSS classes. Previously this only + worked if you passed in a string. + +* Fixed a bug that caused a crash when you passed a dictionary as an + attribute value (possibly because you mistyped "attrs"). [bug=842419] + +* Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags + like <meta charset="utf-8" />. [bug=837268] + +* If Unicode, Dammit can't figure out a consistent encoding for a + page, it will try each of its guesses again, with errors="replace" + instead of errors="strict". This may mean that some data gets + replaced with REPLACEMENT CHARACTER, but at least most of it will + get turned into Unicode. [bug=754903] + +* Patched over a bug in html5lib (?) that was crashing Beautiful Soup + on certain kinds of markup. [bug=838800] + +* Fixed a bug that wrecked the tree if you replaced an element with an + empty string. [bug=728697] + +* Improved Unicode, Dammit's behavior when you give it Unicode to + begin with. + += 4.0.0b4 (20120208) = + +* Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag() + +* BeautifulSoup.new_tag() will follow the rules of whatever + tree-builder was used to create the original BeautifulSoup object. A + new <p> tag will look like "<p />" if the soup object was created to + parse XML, but it will look like "<p></p>" if the soup object was + created to parse HTML. + +* We pass in strict=False to html.parser on Python 3, greatly + improving html.parser's ability to handle bad HTML. + +* We also monkeypatch a serious bug in html.parser that made + strict=False disastrous on Python 3.2.2. + +* Replaced the "substitute_html_entities" argument with the + more general "formatter" argument. + +* Bare ampersands and angle brackets are always converted to XML + entities unless the user prevents it. + +* Added PageElement.insert_before() and PageElement.insert_after(), + which let you put an element into the parse tree with respect to + some other element. + +* Raise an exception when the user tries to do something nonsensical + like insert a tag into itself. 
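To make the last two entries concrete, here is a minimal, hedged sketch of insert_before() and of the exception raised for nonsensical insertions (assumes the modern bs4 API; the tag names are invented):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p><b>one</b><b>three</b></p>", "html.parser")
    two = soup.new_tag("i")
    two.string = "two"

    # Position the new element relative to an existing one.
    soup.find_all("b")[1].insert_before(two)
    print(soup.p)  # <p><b>one</b><i>two</i><b>three</b></p>

    # Inserting an element relative to itself is nonsensical and raises.
    tag = soup.find("i")
    try:
        tag.insert_after(tag)
    except ValueError as e:
        print(e)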
+ + += 4.0.0b3 (20120203) = + +Beautiful Soup 4 is a nearly-complete rewrite that removes Beautiful +Soup's custom HTML parser in favor of a system that lets you write a +little glue code and plug in any HTML or XML parser you want. + +Beautiful Soup 4.0 comes with glue code for four parsers: + + * Python's standard HTMLParser (html.parser in Python 3) + * lxml's HTML and XML parsers + * html5lib's HTML parser + +HTMLParser is the default, but I recommend you install lxml if you +can. + +For complete documentation, see the Sphinx documentation in +bs4/doc/source/. What follows is a summary of the changes from +Beautiful Soup 3. + +=== The module name has changed === + +Previously you imported the BeautifulSoup class from a module also +called BeautifulSoup. To save keystrokes and make it clear which +version of the API is in use, the module is now called 'bs4': + + >>> from bs4 import BeautifulSoup + +=== It works with Python 3 === + +Beautiful Soup 3.1.0 worked with Python 3, but the parser it used was +so bad that it barely worked at all. Beautiful Soup 4 works with +Python 3, and since its parser is pluggable, you don't sacrifice +quality. + +Special thanks to Thomas Kluyver and Ezio Melotti for getting Python 3 +support to the finish line. Ezio Melotti is also to thank for greatly +improving the HTML parser that comes with Python 3.2. + +=== CDATA sections are normal text, if they're understood at all. === + +Currently, the lxml and html5lib HTML parsers ignore CDATA sections in +markup: + + <p><![CDATA[foo]]></p> => <p></p> + +A future version of html5lib will turn CDATA sections into text nodes, +but only within tags like <svg> and <math>: + + <svg><![CDATA[foo]]></svg> => <svg>foo</svg> + +The default XML parser (which uses lxml behind the scenes) turns CDATA +sections into ordinary text elements: + + <p><![CDATA[foo]]></p> => <p>foo</p> + +In theory it's possible to preserve the CDATA sections when using the +XML parser, but I don't see how to get it to work in practice. + +=== Miscellaneous other stuff === + +If the BeautifulSoup instance has .is_xml set to True, an appropriate +XML declaration will be emitted when the tree is transformed into a +string: + + <?xml version="1.0" encoding="utf-8"?> + <markup> + ... + </markup> + +The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree +builders set it to False. If you want to parse XHTML with an HTML +parser, you can set it manually. + + += 3.2.0 = + +The 3.1 series wasn't very useful, so I renamed the 3.0 series to 3.2 +to make it obvious which one you should use. + += 3.1.0 = + +A hybrid version that supports 2.4 and can be automatically converted +to run under Python 3.0. There are three backwards-incompatible +changes you should be aware of, but no new features or deliberate +behavior changes. + +1. str() may no longer do what you want. This is because the meaning +of str() inverts between Python 2 and 3; in Python 2 it gives you a +byte string, in Python 3 it gives you a Unicode string. + +The effect of this is that you can't pass an encoding to .__str__ +anymore. Use encode() to get a string and decode() to get Unicode, and +you'll be ready (well, readier) for Python 3. + +2. Beautiful Soup is now based on HTMLParser rather than SGMLParser, +which is gone in Python 3.
There's some bad HTML that SGMLParser +handled but HTMLParser doesn't, usually to do with attribute values +that aren't closed or have brackets inside them: + + <a href="foo</a>, </a><a href="bar">baz</a> + <a b="<a>">', '<a b="<a>"></a><a>"></a> + +A later version of Beautiful Soup will allow you to plug in different +parsers to make tradeoffs between speed and the ability to handle bad +HTML. + +3. In Python 3 (but not Python 2), HTMLParser converts entities within +attributes to the corresponding Unicode characters. In Python 2 it's +possible to parse this string and leave the &eacute; intact. + + <a href="http://crummy.com?sacr&eacute;&bleu"> + +In Python 3, the &eacute; is always converted to \xe9 during +parsing. + + += 3.0.7a = + +Added an import that makes BS work in Python 2.3. + + += 3.0.7 = + +Fixed a UnicodeDecodeError when unpickling documents that contain +non-ASCII characters. + +Fixed a TypeError that occurred in some circumstances when a tag +contained no text. + +Jump through hoops to avoid the use of chardet, which can be extremely +slow in some circumstances. UTF-8 documents should never trigger the +use of chardet. + +Whitespace is preserved inside <pre> and <textarea> tags that contain +nothing but whitespace. + +Beautiful Soup can now parse a doctype that's scoped to an XML namespace. + + += 3.0.6 = + +Got rid of a very old debug line that prevented chardet from working. + +Added a Tag.decompose() method that completely disconnects a tree or a +subset of a tree, breaking it up into bite-sized pieces that are +easy for the garbage collector to collect. + +Tag.extract() now returns the tag that was extracted. + +Tag.findNext() now does something with the keyword arguments you pass +it instead of dropping them on the floor. + +Fixed a Unicode conversion bug. + +Fixed a bug that garbled some <meta> tags when rewriting them. + + += 3.0.5 = + +Soup objects can now be pickled, and copied with copy.deepcopy. + +Tag.append now works properly on existing BS objects. (It wasn't +originally intended for outside use, but it can be now.) (Giles +Radford) + +Passing in a nonexistent encoding will no longer crash the parser on +Python 2.4 (John Nagle). + +Fixed an underlying bug in SGMLParser that thinks ASCII has 255 +characters instead of 127 (John Nagle). + +Entities are converted more consistently to Unicode characters. + +Entity references in attribute values are now converted to Unicode +characters when appropriate. Numeric entities are always converted, +because SGMLParser always converts them outside of attribute values. + +ALL_ENTITIES happens to just be the XHTML entities, so I renamed it to +XHTML_ENTITIES. + +The regular expression for bare ampersands was too loose. In some +cases ampersands were not being escaped. (Sam Ruby?) + +Non-breaking spaces and other special Unicode space characters are no +longer folded to ASCII spaces. (Robert Leftwich) + +Information inside a TEXTAREA tag is now parsed literally, not as HTML +tags. TEXTAREA now works exactly the same way as SCRIPT. (Zephyr Fang) + += 3.0.4 = + +Fixed a bug that crashed Unicode conversion in some cases. + +Fixed a bug that prevented UnicodeDammit from being used as a +general-purpose data scrubber. + +Fixed some unit test failures when running against Python 2.5. + +When considering whether to convert smart quotes, UnicodeDammit now +looks at the original encoding in a case-insensitive way.
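Since several of the entries above concern UnicodeDammit and smart quotes, here is a brief hedged sketch of how this looks in today's bs4 API (the byte values are chosen for illustration):

    from bs4 import UnicodeDammit

    data = b"\x93Soup\x94"  # Windows-1252 "smart quotes"
    dammit = UnicodeDammit(data, ["windows-1252"])
    print(dammit.original_encoding)  # windows-1252
    print(dammit.unicode_markup)     # “Soup” (curly quotes)

    # The modern API can also fold smart quotes to plain ASCII:
    ascii_out = UnicodeDammit(data, ["windows-1252"], smart_quotes_to="ascii")
    print(ascii_out.unicode_markup)  # "Soup"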
+ += 3.0.3 (20060606) = + +Beautiful Soup is now usable as a way to clean up invalid XML/HTML (be +sure to pass in an appropriate value for convertEntities, or XML/HTML +entities might stick around that aren't valid in HTML/XML). The result +may not validate, but it should be good enough to not choke a +real-world XML parser. Specifically, the output of a properly +constructed soup object should always be valid as part of an XML +document, but parts may be missing if they were missing in the +original. As always, if the input is valid XML, the output will also +be valid. + += 3.0.2 (20060602) = + +Previously, Beautiful Soup correctly handled attribute values that +contained embedded quotes (sometimes by escaping), but not other kinds +of XML character. Now, it correctly handles or escapes all special XML +characters in attribute values. + +I aliased methods to the 2.x names (fetch, find, findText, etc.) for +backwards compatibility purposes. Those names are deprecated and if I +ever do a 4.0 I will remove them. I will, I tell you! + +Fixed a bug where the findAll method wasn't passing along any keyword +arguments. + +When run from the command line, Beautiful Soup now acts as an HTML +pretty-printer, not an XML pretty-printer. + += 3.0.1 (20060530) = + +Reintroduced the "fetch by CSS class" shortcut. I thought keyword +arguments would replace it, but they don't. You can't call soup('a', +class='foo') because class is a Python keyword. + +If Beautiful Soup encounters a meta tag that declares the encoding, +but a SoupStrainer tells it not to parse that tag, Beautiful Soup will +no longer try to rewrite the meta tag to mention the new +encoding. Basically, this makes SoupStrainers work in real-world +applications instead of crashing the parser. + += 3.0.0 "Who would not give all else for two p" (20060528) = + +This release is not backward-compatible with previous releases. If +you've got code written with a previous version of the library, go +ahead and keep using it, unless one of the features mentioned here +really makes your life easier. Since the library is self-contained, +you can include an old copy of the library in your old applications, +and use the new version for everything else. + +The documentation has been rewritten and greatly expanded with many +more examples. + +Beautiful Soup autodetects the encoding of a document (or uses the one +you specify), and converts it from its native encoding to +Unicode. Internally, it only deals with Unicode strings. When you +print out the document, it converts to UTF-8 (or another encoding you +specify). [Doc reference] + +It's now easy to make large-scale changes to the parse tree without +screwing up the navigation members. The methods are extract, +replaceWith, and insert. [Doc reference. See also Improving Memory +Usage with extract] + +Passing True in as an attribute value gives you tags that have any +value for that attribute. You don't have to create a regular +expression. Passing None for an attribute value gives you tags that +don't have that attribute at all. + +Tag objects now know whether or not they're self-closing. This avoids +the problem where Beautiful Soup thought that tags like <BR /> were +self-closing even in XML documents. You can customize the self-closing +tags for a parser object by passing them in as a list of +selfClosingTags: you don't have to subclass anymore. + +There's a new built-in parser, MinimalSoup, which has most of +BeautifulSoup's HTML-specific rules, but no tag nesting rules. 
[Doc +reference] + +You can use a SoupStrainer to tell Beautiful Soup to parse only part +of a document. This saves time and memory, often making Beautiful Soup +about as fast as a custom-built SGMLParser subclass. [Doc reference, +SoupStrainer reference] + +You can (usually) use keyword arguments instead of passing a +dictionary of attributes to a search method. That is, you can replace +soup(args={"id" : "5"}) with soup(id="5"). You can still use args if +(for instance) you need to find an attribute whose name clashes with +the name of an argument to findAll. [Doc reference: **kwargs attrs] + +The method names have changed to the better method names used in +Rubyful Soup. Instead of find methods and fetch methods, there are +only find methods. Instead of a scheme where you can't remember which +method finds one element and which one finds them all, we have find +and findAll. In general, if the method name mentions All or a plural +noun (eg. findNextSiblings), then it finds many +elements. Otherwise, it only finds one element. [Doc reference] + +Some of the argument names have been renamed for clarity. For instance +avoidParserProblems is now parserMassage. + +Beautiful Soup no longer implements a feed method. You need to pass a +string or a filehandle into the soup constructor, not with feed after +the soup has been created. There is still a feed method, but it's the +feed method implemented by SGMLParser and calling it will bypass +Beautiful Soup and cause problems. + +The NavigableText class has been renamed to NavigableString. There is +no NavigableUnicodeString anymore, because every string inside a +Beautiful Soup parse tree is a Unicode string. + +findText and fetchText are gone. Just pass a text argument into find +or findAll. + +Null was more trouble than it was worth, so I got rid of it. Anything +that used to return Null now returns None. + +Special XML constructs like comments and CDATA now have their own +NavigableString subclasses, instead of being treated as oddly-formed +data. If you parse a document that contains CDATA and write it back +out, the CDATA will still be there. + +When you're parsing a document, you can get Beautiful Soup to convert +XML or HTML entities into the corresponding Unicode characters. [Doc +reference] + += 2.1.1 (20050918) = + +Fixed a serious performance bug in BeautifulStoneSoup which was +causing parsing to be incredibly slow. + +Corrected several entities that were previously being incorrectly +translated from Microsoft smart-quote-like characters. + +Fixed a bug that was breaking text fetch. + +Fixed a bug that crashed the parser when text chunks that look like +HTML tag names showed up within a SCRIPT tag. + +THEAD, TBODY, and TFOOT tags are now nestable within TABLE +tags. Nested tables should parse more sensibly now. + +BASE is now considered a self-closing tag. + += 2.1.0 "Game, or any other dish?" (20050504) = + +Added a wide variety of new search methods which, given a starting +point inside the tree, follow a particular navigation member (like +nextSibling) over and over again, looking for Tag and NavigableText +objects that match certain criteria. The new methods are findNext, +fetchNext, findPrevious, fetchPrevious, findNextSibling, +fetchNextSiblings, findPreviousSibling, fetchPreviousSiblings, +findParent, and fetchParents. All of these use the same basic code +used by first and fetch, so you can pass your weird ways of matching +things into these methods. + +The fetch method and its derivatives now accept a limit argument.
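The limit argument described above survives in the modern API as find_all(limit=...); a tiny hedged sketch (fetch() is the 2.x name, and the markup here is invented):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a>1</a><a>2</a><a>3</a>", "html.parser")
    # Stop after the first two matches.
    print(soup.find_all("a", limit=2))  # [<a>1</a>, <a>2</a>]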
+ +You can now pass keyword arguments when calling a Tag object as though +it were a method. + +Fixed a bug that caused all hand-created tags to share a single set of +attributes. + += 2.0.3 (20050501) = + +Fixed Python 2.2 support for iterators. + +Fixed a bug that gave the wrong representation to tags within quote +tags like <script>. + +Took some code from Mark Pilgrim that treats CDATA declarations as +data instead of ignoring them. + +Beautiful Soup's setup.py will now do an install even if the unit +tests fail. It won't build a source distribution if the unit tests +fail, so I can't release a new version unless they pass. + += 2.0.2 (20050416) = + +Added the unit tests in a separate module, and packaged it with +distutils. + +Fixed a bug that sometimes caused renderContents() to return a Unicode +string even if there was no Unicode in the original string. + +Added the done() method, which closes all of the parser's open +tags. It gets called automatically when you pass in some text to the +constructor of a parser class; otherwise you must call it yourself. + +Reinstated some backwards compatibility with 1.x versions: referencing +the string member of a NavigableText object returns the NavigableText +object instead of throwing an error. + += 2.0.1 (20050412) = + +Fixed a bug that caused bad results when you tried to reference a tag +name shorter than 3 characters as a member of a Tag, eg. tag.table.td. + +Made sure all Tags have the 'hidden' attribute so that an attempt to +access tag.hidden doesn't spawn an attempt to find a tag named +'hidden'. + +Fixed a bug in the comparison operator. + += 2.0.0 "Who cares for fish?" (20050410) + +Beautiful Soup version 1 was very useful but also pretty stupid. I +originally wrote it without noticing any of the problems inherent in +trying to build a parse tree out of ambiguous HTML tags. This version +solves all of those problems to my satisfaction. It also adds many new +clever things to make up for the removal of the stupid things. + +== Parsing == + +The parser logic has been greatly improved, and the BeautifulSoup +class should much more reliably yield a parse tree that looks like +what the page author intended. For a particular class of odd edge +cases that now causes problems, there is a new class, +ICantBelieveItsBeautifulSoup. + +By default, Beautiful Soup now performs some cleanup operations on +text before parsing it. This is to avoid common problems with bad +definitions and self-closing tags that crash SGMLParser. You can +provide your own set of cleanup operations, or turn it off +altogether. The cleanup operations include fixing self-closing tags +that don't close, and replacing Microsoft smart quotes and similar +characters with their HTML entity equivalents. + +You can now get a pretty-print version of parsed HTML to get a visual +picture of how Beautiful Soup parses it, with the Tag.prettify() +method. + +== Strings and Unicode == + +There are separate NavigableText subclasses for ASCII and Unicode +strings. These classes directly subclass the corresponding base data +types. This means you can treat NavigableText objects as strings +instead of having to call methods on them to get the strings. + +str() on a Tag always returns a string, and unicode() always returns +Unicode. Previously it was inconsistent. 
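That design survives in bs4: NavigableString directly subclasses str, so strings pulled from the tree can be treated as plain strings. A hedged sketch against the modern API:

    from bs4 import BeautifulSoup
    from bs4.element import NavigableString

    soup = BeautifulSoup("<b>bold</b>", "html.parser")
    s = soup.b.string
    print(isinstance(s, NavigableString), isinstance(s, str))  # True True
    print(s.upper())  # ordinary string methods work directly: BOLD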
+
+== Tree traversal ==
+
+In a first() or fetch() call, the tag name or the desired value of an
+attribute can now be any of the following:
+
+ * A string (matches that specific tag or that specific attribute value)
+ * A list of strings (matches any tag or attribute value in the list)
+ * A compiled regular expression object (matches any tag or attribute
+   value that matches the regular expression)
+ * A callable object that takes the Tag object or attribute value as a
+   string. It returns None/false/empty string if the given string
+   doesn't match, and any other value if it does.
+
+This is much easier to use than SQL-style wildcards (see, regular
+expressions are good for something). Because of this, I took out
+SQL-style wildcards. I'll put them back if someone complains, but
+their removal simplifies the code a lot. (A short example of these
+matchers appears at the end of this section.)
+
+You can use fetch() and first() to search for text in the parse tree,
+not just tags. There are new alias methods fetchText() and firstText()
+designed for this purpose. As with searching for tags, you can pass in
+a string, a regular expression object, or a method to match your text.
+
+If you pass in something besides a map to the attrs argument of
+fetch() or first(), Beautiful Soup will assume you want to match that
+thing against the "class" attribute. When you're scraping
+well-structured HTML, this makes your code a lot cleaner.
+
+1.x and 2.x both let you call a Tag object as a shorthand for
+fetch(). For instance, foo("bar") is a shorthand for
+foo.fetch("bar"). In 2.x, you can also access a specially-named member
+of a Tag object as a shorthand for first(). For instance, foo.barTag
+is a shorthand for foo.first("bar"). By chaining these shortcuts you
+traverse a tree in very little code: for header in
+soup.bodyTag.pTag.tableTag('th'):
+
+If an element relationship (like parent or next) doesn't apply to a
+tag, it'll now show up as Null instead of None. first() will also
+return Null if you ask it for a nonexistent tag. Null is an object
+that's just like None, except you can do whatever you want to it and
+it'll give you Null instead of throwing an error.
+
+This lets you do tree traversals like soup.htmlTag.headTag.titleTag
+without having to worry if the intermediate stages are actually
+there. Previously, if there was no 'head' tag in the document, headTag
+in that instance would have been None, and accessing its 'titleTag'
+member would have thrown an AttributeError. Now, you can get what you
+want when it exists, and get Null when it doesn't, without having to
+do a lot of conditional checks to see if every stage is None.
+
+There are two new relations between page elements: previousSibling and
+nextSibling. They reference the previous and next element at the same
+level of the parse tree. For instance, if you have HTML like this:
+
+ <p><ul><li>Foo<br /><li>Bar</ul>
+
+The first 'li' tag has a previousSibling of Null and its nextSibling
+is the second 'li' tag. The second 'li' tag has a nextSibling of Null
+and its previousSibling is the first 'li' tag. The previousSibling of
+the 'ul' tag is the first 'p' tag. The nextSibling of 'Foo' is the
+'br' tag.
+
+I took out the ability to use fetch() to find tags that have a
+specific list of contents. See, I can't even explain it well. It was
+really difficult to use, I never used it, and I don't think anyone
+else ever used it. To the extent anyone did, they can probably use
+fetchText() instead. If it turns out someone needs it I'll think of
+another solution.
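+
+To make the matchers described above concrete, here is a hypothetical
+snippet (not part of the original release notes):
+
+  import re
+  soup.fetch(['h1', 'h2'])           # list: matches either tag name
+  soup.fetch(re.compile('^t[dh]$'))  # regex: matches 'td' and 'th'
+  # A callable receives the attribute value and returns a false value
+  # on no match; this keeps only 'a' tags with a non-empty href.
+  soup.first('a', {'href': lambda value: value})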
+
+== Tree manipulation ==
+
+You can add new attributes to a tag, and delete attributes from a
+tag. In 1.x you could only change a tag's existing attributes.
+
+== Porting Considerations ==
+
+There are three changes in 2.0 that break old code:
+
+In the post-1.2 release you could pass a function into fetch(). The
+function took a string, the tag name. In 2.0, the function takes the
+actual Tag object.
+
+It's no longer possible to pass SQL-style wildcards to fetch(). Use a
+regular expression instead.
+
+The different parsing algorithm means the parse tree may not be shaped
+like you expect. This will only affect you if your code depends on one
+of the affected parts. I haven't run into this problem yet while
+porting my code.
+
+= Between 1.2 and 2.0 =
+
+This is the release to get if you want Python 1.5 compatibility.
+
+The desired value of an attribute can now be any of the following:
+
+ * A string
+ * A string with SQL-style wildcards
+ * A compiled RE object
+ * A callable that returns None/false/empty string if the given value
+   doesn't match, and any other value otherwise.
+
+This is much easier to use than SQL-style wildcards (see, regular
+expressions are good for something). Because of this, I no longer
+recommend you use SQL-style wildcards. They may go away in a future
+release to clean up the code.
+
+Made Beautiful Soup handle processing instructions as text instead of
+ignoring them.
+
+Applied patch from Richie Hindle (richie at entrian dot com) that
+makes tag.string a shorthand for tag.contents[0].string when the tag
+has only one string-owning child.
+
+Added still more nestable tags. The nestable tags thing won't work in
+a lot of cases and needs to be rethought.
+
+Fixed an edge case where searching for "%foo" would match any string
+shorter than "foo".
+
+= 1.2 "Who for such dainties would not stoop?" (20040708) =
+
+Applied patch from Ben Last (ben at benlast dot com) that made
+Tag.renderContents() correctly handle Unicode.
+
+Made BeautifulStoneSoup even dumber by making it not implicitly close
+a tag when another tag of the same type is encountered; only when an
+actual closing tag is encountered. This change courtesy of Fuzzy (mike
+at pcblokes dot com). BeautifulSoup still works as before.
+
+= 1.1 "Swimming in a hot tureen" =
+
+Added more 'nestable' tags. Changed popping semantics so that when a
+nestable tag is encountered, tags are popped up to the previously
+encountered nestable tag (of whatever kind). I will revert this if
+enough people complain, but it should make more people's lives easier
+than harder. This enhancement was suggested by Anthony Baxter (anthony
+at interlink dot com dot au).
+
+= 1.0 "So rich and green" (20040420) =
+
+Initial release.
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/PKG-INFO b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/PKG-INFO
new file mode 100644
index 00000000000..c5685222994
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/PKG-INFO
@@ -0,0 +1,126 @@
+Metadata-Version: 2.1
+Name: beautifulsoup4
+Version: 4.9.3
+Summary: Screen-scraping library
+Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/
+Author: Leonard Richardson
+Author-email: leonardr@segfault.org
+License: MIT
+Download-URL: http://www.crummy.com/software/BeautifulSoup/bs4/download/
+Description: Beautiful Soup is a library that makes it easy to scrape information
+        from web pages. 
It sits atop an HTML or XML parser, providing Pythonic + idioms for iterating, searching, and modifying the parse tree. + + # Quick start + + ``` + >>> from bs4 import BeautifulSoup + >>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML") + >>> print(soup.prettify()) + <html> + <body> + <p> + Some + <b> + bad + <i> + HTML + </i> + </b> + </p> + </body> + </html> + >>> soup.find(text="bad") + 'bad' + >>> soup.i + <i>HTML</i> + # + >>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml") + # + >>> print(soup.prettify()) + <?xml version="1.0" encoding="utf-8"?> + <tag1> + Some + <tag2/> + bad + <tag3> + XML + </tag3> + </tag1> + ``` + + To go beyond the basics, [comprehensive documentation is available](http://www.crummy.com/software/BeautifulSoup/bs4/doc/). + + # Links + + * [Homepage](http://www.crummy.com/software/BeautifulSoup/bs4/) + * [Documentation](http://www.crummy.com/software/BeautifulSoup/bs4/doc/) + * [Discussion group](http://groups.google.com/group/beautifulsoup/) + * [Development](https://code.launchpad.net/beautifulsoup/) + * [Bug tracker](https://bugs.launchpad.net/beautifulsoup/) + * [Complete changelog](https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/view/head:/CHANGELOG) + + # Note on Python 2 sunsetting + + Since 2012, Beautiful Soup has been developed as a Python 2 library + which is automatically converted to Python 3 code as necessary. This + makes it impossible to take advantage of some features of Python + 3. + + For this reason, I plan to discontinue Beautiful Soup's Python 2 + support at some point after December 31, 2020: one year after the + sunset date for Python 2 itself. Beyond that point, new Beautiful Soup + development will exclusively target Python 3. Of course, older + releases of Beautiful Soup, which support both versions, will continue + to be available. + + # Supporting the project + + If you use Beautiful Soup as part of your professional work, please consider a + [Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=readme). + This will support many of the free software projects your organization + depends on, not just Beautiful Soup. + + If you use Beautiful Soup for personal projects, the best way to say + thank you is to read + [Tool Safety](https://www.crummy.com/software/BeautifulSoup/zine/), a zine I + wrote about what Beautiful Soup has taught me about software + development. + + # Building the documentation + + The bs4/doc/ directory contains full documentation in Sphinx + format. Run `make html` in that directory to create HTML + documentation. + + # Running the unit tests + + Beautiful Soup supports unit test discovery from the project root directory: + + ``` + $ nosetests + ``` + + ``` + $ python -m unittest discover -s bs4 + ``` + + If you checked out the source tree, you should see a script in the + home directory called test-all-versions. This script will run the unit + tests under Python 2, then create a temporary Python 3 conversion of + the source and run the unit tests again under Python 3. 
+ +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Topic :: Text Processing :: Markup :: HTML +Classifier: Topic :: Text Processing :: Markup :: XML +Classifier: Topic :: Text Processing :: Markup :: SGML +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Description-Content-Type: text/markdown +Provides-Extra: html5lib +Provides-Extra: lxml diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/README.md b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/README.md new file mode 100644 index 00000000000..92dd3394237 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/README.md @@ -0,0 +1,102 @@ +Beautiful Soup is a library that makes it easy to scrape information +from web pages. It sits atop an HTML or XML parser, providing Pythonic +idioms for iterating, searching, and modifying the parse tree. + +# Quick start + +``` +>>> from bs4 import BeautifulSoup +>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML") +>>> print(soup.prettify()) +<html> + <body> + <p> + Some + <b> + bad + <i> + HTML + </i> + </b> + </p> + </body> +</html> +>>> soup.find(text="bad") +'bad' +>>> soup.i +<i>HTML</i> +# +>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml") +# +>>> print(soup.prettify()) +<?xml version="1.0" encoding="utf-8"?> +<tag1> + Some + <tag2/> + bad + <tag3> + XML + </tag3> +</tag1> +``` + +To go beyond the basics, [comprehensive documentation is available](http://www.crummy.com/software/BeautifulSoup/bs4/doc/). + +# Links + +* [Homepage](http://www.crummy.com/software/BeautifulSoup/bs4/) +* [Documentation](http://www.crummy.com/software/BeautifulSoup/bs4/doc/) +* [Discussion group](http://groups.google.com/group/beautifulsoup/) +* [Development](https://code.launchpad.net/beautifulsoup/) +* [Bug tracker](https://bugs.launchpad.net/beautifulsoup/) +* [Complete changelog](https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/view/head:/CHANGELOG) + +# Note on Python 2 sunsetting + +Since 2012, Beautiful Soup has been developed as a Python 2 library +which is automatically converted to Python 3 code as necessary. This +makes it impossible to take advantage of some features of Python +3. + +For this reason, I plan to discontinue Beautiful Soup's Python 2 +support at some point after December 31, 2020: one year after the +sunset date for Python 2 itself. Beyond that point, new Beautiful Soup +development will exclusively target Python 3. Of course, older +releases of Beautiful Soup, which support both versions, will continue +to be available. + +# Supporting the project + +If you use Beautiful Soup as part of your professional work, please consider a +[Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=readme). +This will support many of the free software projects your organization +depends on, not just Beautiful Soup. + +If you use Beautiful Soup for personal projects, the best way to say +thank you is to read +[Tool Safety](https://www.crummy.com/software/BeautifulSoup/zine/), a zine I +wrote about what Beautiful Soup has taught me about software +development. 
+
+# Building the documentation
+
+The bs4/doc/ directory contains full documentation in Sphinx
+format. Run `make html` in that directory to create HTML
+documentation.
+
+# Running the unit tests
+
+Beautiful Soup supports unit test discovery from the project root directory:
+
+```
+$ nosetests
+```
+
+```
+$ python -m unittest discover -s bs4
+```
+
+If you checked out the source tree, you should see a script in the
+home directory called test-all-versions. This script will run the unit
+tests under Python 2, then create a temporary Python 3 conversion of
+the source and run the unit tests again under Python 3.
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/TODO.txt b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/TODO.txt
new file mode 100644
index 00000000000..e26d6264dac
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/TODO.txt
@@ -0,0 +1,31 @@
+Additions
+---------
+
+More of the jQuery API: nextUntil?
+
+Optimizations
+-------------
+
+The html5lib tree builder doesn't use the standard tree-building API,
+which worries me and has resulted in a number of bugs.
+
+markup_attr_map can be optimized since it's always a map now.
+
+Upon encountering UTF-16LE data or some other uncommon serialization
+of Unicode, UnicodeDammit will convert the data to Unicode, then
+encode it as UTF-8. This is wasteful because it will just get decoded
+back to Unicode.
+
+CDATA
+-----
+
+The elementtree XMLParser has a strip_cdata argument that, when set to
+False, should allow Beautiful Soup to preserve CDATA sections instead
+of treating them as text. Except it doesn't. (This argument is also
+present for HTMLParser, and also does nothing there.)
+
+Currently, html5lib converts CDATA sections into comments. An
+as-yet-unreleased version of html5lib changes the parser's handling of
+CDATA sections to allow CDATA sections in tags like <svg> and
+<math>. The HTML5TreeBuilder will need to be updated to create CData
+objects instead of Comment objects in this situation.
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/PKG-INFO b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/PKG-INFO
new file mode 100644
index 00000000000..c5685222994
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/PKG-INFO
@@ -0,0 +1,126 @@
+Metadata-Version: 2.1
+Name: beautifulsoup4
+Version: 4.9.3
+Summary: Screen-scraping library
+Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/
+Author: Leonard Richardson
+Author-email: leonardr@segfault.org
+License: MIT
+Download-URL: http://www.crummy.com/software/BeautifulSoup/bs4/download/
+Description: Beautiful Soup is a library that makes it easy to scrape information
+        from web pages. It sits atop an HTML or XML parser, providing Pythonic
+        idioms for iterating, searching, and modifying the parse tree.
+ + # Quick start + + ``` + >>> from bs4 import BeautifulSoup + >>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML") + >>> print(soup.prettify()) + <html> + <body> + <p> + Some + <b> + bad + <i> + HTML + </i> + </b> + </p> + </body> + </html> + >>> soup.find(text="bad") + 'bad' + >>> soup.i + <i>HTML</i> + # + >>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml") + # + >>> print(soup.prettify()) + <?xml version="1.0" encoding="utf-8"?> + <tag1> + Some + <tag2/> + bad + <tag3> + XML + </tag3> + </tag1> + ``` + + To go beyond the basics, [comprehensive documentation is available](http://www.crummy.com/software/BeautifulSoup/bs4/doc/). + + # Links + + * [Homepage](http://www.crummy.com/software/BeautifulSoup/bs4/) + * [Documentation](http://www.crummy.com/software/BeautifulSoup/bs4/doc/) + * [Discussion group](http://groups.google.com/group/beautifulsoup/) + * [Development](https://code.launchpad.net/beautifulsoup/) + * [Bug tracker](https://bugs.launchpad.net/beautifulsoup/) + * [Complete changelog](https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/view/head:/CHANGELOG) + + # Note on Python 2 sunsetting + + Since 2012, Beautiful Soup has been developed as a Python 2 library + which is automatically converted to Python 3 code as necessary. This + makes it impossible to take advantage of some features of Python + 3. + + For this reason, I plan to discontinue Beautiful Soup's Python 2 + support at some point after December 31, 2020: one year after the + sunset date for Python 2 itself. Beyond that point, new Beautiful Soup + development will exclusively target Python 3. Of course, older + releases of Beautiful Soup, which support both versions, will continue + to be available. + + # Supporting the project + + If you use Beautiful Soup as part of your professional work, please consider a + [Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=readme). + This will support many of the free software projects your organization + depends on, not just Beautiful Soup. + + If you use Beautiful Soup for personal projects, the best way to say + thank you is to read + [Tool Safety](https://www.crummy.com/software/BeautifulSoup/zine/), a zine I + wrote about what Beautiful Soup has taught me about software + development. + + # Building the documentation + + The bs4/doc/ directory contains full documentation in Sphinx + format. Run `make html` in that directory to create HTML + documentation. + + # Running the unit tests + + Beautiful Soup supports unit test discovery from the project root directory: + + ``` + $ nosetests + ``` + + ``` + $ python -m unittest discover -s bs4 + ``` + + If you checked out the source tree, you should see a script in the + home directory called test-all-versions. This script will run the unit + tests under Python 2, then create a temporary Python 3 conversion of + the source and run the unit tests again under Python 3. 
+ +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Topic :: Text Processing :: Markup :: HTML +Classifier: Topic :: Text Processing :: Markup :: XML +Classifier: Topic :: Text Processing :: Markup :: SGML +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Description-Content-Type: text/markdown +Provides-Extra: html5lib +Provides-Extra: lxml diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/SOURCES.txt b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/SOURCES.txt new file mode 100644 index 00000000000..9eea8f35050 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/SOURCES.txt @@ -0,0 +1,53 @@ +COPYING.txt +LICENSE +MANIFEST.in +NEWS.txt +README.md +TODO.txt +convert-py3k +setup.cfg +setup.py +test-all-versions +beautifulsoup4.egg-info/PKG-INFO +beautifulsoup4.egg-info/SOURCES.txt +beautifulsoup4.egg-info/dependency_links.txt +beautifulsoup4.egg-info/requires.txt +beautifulsoup4.egg-info/top_level.txt +bs4/__init__.py +bs4/dammit.py +bs4/diagnose.py +bs4/element.py +bs4/formatter.py +bs4/testing.py +bs4/builder/__init__.py +bs4/builder/_html5lib.py +bs4/builder/_htmlparser.py +bs4/builder/_lxml.py +bs4/tests/__init__.py +bs4/tests/test_builder_registry.py +bs4/tests/test_docs.py +bs4/tests/test_html5lib.py +bs4/tests/test_htmlparser.py +bs4/tests/test_lxml.py +bs4/tests/test_soup.py +bs4/tests/test_tree.py +doc/Makefile +doc.ptbr/Makefile +doc.ptbr/source/6.1.jpg +doc.ptbr/source/conf.py +doc.ptbr/source/index.rst +doc.ru/Makefile +doc.ru/source/6.1.jpg +doc.ru/source/bs4ru.rst +doc.ru/source/conf.py +doc.ru/source/index.rst +doc.zh/Makefile +doc.zh/source/6.1.jpg +doc.zh/source/conf.py +doc.zh/source/index.rst +doc/source/6.1.jpg +doc/source/check_doc.py +doc/source/conf.py +doc/source/index.rst +scripts/demonstrate_parser_differences.py +scripts/demonstration_markup.txt
\ No newline at end of file diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/dependency_links.txt b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/dependency_links.txt new file mode 100644 index 00000000000..8b137891791 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/requires.txt b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/requires.txt new file mode 100644 index 00000000000..32caeaa4d64 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/requires.txt @@ -0,0 +1,12 @@ + +[:python_version < "3.0"] +soupsieve<2.0,>1.2 + +[:python_version >= "3.0"] +soupsieve>1.2 + +[html5lib] +html5lib + +[lxml] +lxml diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/top_level.txt b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/top_level.txt new file mode 100644 index 00000000000..13154420d48 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/beautifulsoup4.egg-info/top_level.txt @@ -0,0 +1 @@ +bs4 diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/__init__.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/__init__.py new file mode 100644 index 00000000000..8f78809b313 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/__init__.py @@ -0,0 +1,791 @@ +"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". + +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup uses a pluggable XML or HTML parser to parse a +(possibly invalid) document into a tree representation. Beautiful Soup +provides methods and Pythonic idioms that make it easy to navigate, +search, and modify the parse tree. + +Beautiful Soup works with Python 2.7 and up. It works better if lxml +and/or html5lib is installed. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ +""" + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.9.3" +__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson" +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = ['BeautifulSoup'] + +from collections import Counter +import os +import re +import sys +import traceback +import warnings + +from .builder import builder_registry, ParserRejectedMarkup +from .dammit import UnicodeDammit +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + PYTHON_SPECIFIC_ENCODINGS, + ResultSet, + Script, + Stylesheet, + SoupStrainer, + Tag, + TemplateString, + ) + +# The very first thing we do is give a useful error if someone is +# running this code under Python 3 without converting it. +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' + +# Define some custom warnings. 
+class GuessedAtParserWarning(UserWarning): + """The warning issued when BeautifulSoup has to guess what parser to + use -- probably because no parser was specified in the constructor. + """ + +class MarkupResemblesLocatorWarning(UserWarning): + """The warning issued when BeautifulSoup is given 'markup' that + actually looks like a resource locator -- a URL or a path to a file + on disk. + """ + + +class BeautifulSoup(Tag): + """A data structure representing a parsed HTML or XML document. + + Most of the methods you'll call on a BeautifulSoup object are inherited from + PageElement or Tag. + + Internally, this class defines the basic interface called by the + tree builders when converting an HTML/XML document into a data + structure. The interface abstracts away the differences between + parsers. To write a new tree builder, you'll need to understand + these methods as a whole. + + These methods will be called by the BeautifulSoup constructor: + * reset() + * feed(markup) + + The tree builder may call these methods from its feed() implementation: + * handle_starttag(name, attrs) # See note about return value + * handle_endtag(name) + * handle_data(data) # Appends to the current data node + * endData(containerClass) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's <br> tag), call handle_starttag and then + handle_endtag. + """ + + # Since BeautifulSoup subclasses Tag, it's possible to treat it as + # a Tag with a .name. This name makes it clear the BeautifulSoup + # object isn't a real markup tag. + ROOT_TAG_NAME = u'[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html', 'fast'] + + # A string containing all ASCII whitespace characters, used in + # endData() to detect data chunks that seem 'empty'. + ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" + + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None, exclude_encodings=None, + element_classes=None, **kwargs): + """Constructor. + + :param markup: A string or a file-like object representing + markup to be parsed. + + :param features: Desirable features of the parser to be + used. This may be the name of a specific parser ("lxml", + "lxml-xml", "html.parser", or "html5lib") or it may be the + type of markup to be used ("html", "html5", "xml"). It's + recommended that you name a specific parser, so that + Beautiful Soup gives you the same results across platforms + and virtual environments. + + :param builder: A TreeBuilder subclass to instantiate (or + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. + + :param parse_only: A SoupStrainer. 
Only parts of the document + matching the SoupStrainer will be considered. This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. + + :param from_encoding: A string indicating the encoding of the + document to be parsed. Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. + + :param exclude_encodings: A list of strings indicating + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. + + :param element_classes: A dictionary mapping BeautifulSoup + classes like Tag and NavigableString, to other classes you'd + like to be instantiated instead as the parse tree is + built. This is useful for subclassing Tag or NavigableString + to modify default behavior. + + :param kwargs: For backwards compatibility purposes, the + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4; they will result in a warning and then be + ignored. + + Apart from this, any keyword arguments passed into the + BeautifulSoup constructor are propagated to the TreeBuilder + constructor. This makes it possible to configure a + TreeBuilder by passing in arguments, not just by saying which + one to use. + """ + if 'convertEntities' in kwargs: + del kwargs['convertEntities'] + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if from_encoding and isinstance(markup, unicode): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + + self.element_classes = element_classes or dict() + + # We need this information to track whether or not the builder + # was specified well enough that we can omit the 'you need to + # specify a parser' warning. 
+ original_builder = builder + original_features = features + + if isinstance(builder, type): + # A builder class was passed in; it needs to be instantiated. + builder_class = builder + builder = None + elif builder is None: + if isinstance(features, basestring): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise FeatureNotFound( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" + % ",".join(features)) + + # At this point either we have a TreeBuilder instance in + # builder, or we have a builder_class that we can instantiate + # with the remaining **kwargs. + if builder is None: + builder = builder_class(**kwargs) + if not original_builder and not ( + original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES + ) and markup: + # The user did not tell us which TreeBuilder to use, + # and we had to guess. Issue a warning. + if builder.is_xml: + markup_type = "XML" + else: + markup_type = "HTML" + + # This code adapted from warnings.py so that we get the same line + # of code as our warnings.warn() call gets, even if the answer is wrong + # (as it may be in a multithreading situation). + caller = None + try: + caller = sys._getframe(1) + except ValueError: + pass + if caller: + globals = caller.f_globals + line_number = caller.f_lineno + else: + globals = sys.__dict__ + line_number= 1 + filename = globals.get('__file__') + if filename: + fnl = filename.lower() + if fnl.endswith((".pyc", ".pyo")): + filename = filename[:-1] + if filename: + # If there is no filename at all, the user is most likely in a REPL, + # and the warning is not necessary. + values = dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type + ) + warnings.warn( + self.NO_PARSER_SPECIFIED_WARNING % values, + GuessedAtParserWarning, stacklevel=2 + ) + else: + if kwargs: + warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") + + self.builder = builder + self.is_xml = builder.is_xml + self.known_xml = self.is_xml + self._namespaces = dict() + self.parse_only = parse_only + + self.builder.initialize_soup(self) + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) + or (isinstance(markup, unicode) and not u'<' in markup) + ): + # Print out warnings for a couple beginner problems + # involving passing non-markup to Beautiful Soup. + # Beautiful Soup will still parse the input as markup, + # just in case that's what the user really wants. + if (isinstance(markup, unicode) + and not os.path.supports_unicode_filenames): + possible_filename = markup.encode("utf8") + else: + possible_filename = markup + is_file = False + try: + is_file = os.path.exists(possible_filename) + except Exception, e: + # This is almost certainly a problem involving + # characters not valid in filenames on this + # system. Just let it go. + pass + if is_file: + warnings.warn( + '"%s" looks like a filename, not markup. You should' + ' probably open this file and pass the filehandle into' + ' Beautiful Soup.' 
% self._decode_markup(markup), + MarkupResemblesLocatorWarning + ) + self._check_markup_is_url(markup) + + rejections = [] + success = False + for (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) in ( + self.builder.prepare_markup( + markup, from_encoding, exclude_encodings=exclude_encodings)): + self.reset() + try: + self._feed() + success = True + break + except ParserRejectedMarkup as e: + rejections.append(e) + pass + + if not success: + other_exceptions = [unicode(e) for e in rejections] + raise ParserRejectedMarkup( + u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) + ) + + # Clear out the markup and remove the builder's circular + # reference to this object. + self.markup = None + self.builder.soup = None + + def __copy__(self): + """Copy a BeautifulSoup object by converting the document to a string and parsing it again.""" + copy = type(self)( + self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' + ) + + # Although we encoded the tree to UTF-8, that may not have + # been the encoding of the original markup. Set the copy's + # .original_encoding to reflect the original object's + # .original_encoding. + copy.original_encoding = self.original_encoding + return copy + + def __getstate__(self): + # Frequently a tree builder can't be pickled. + d = dict(self.__dict__) + if 'builder' in d and not self.builder.picklable: + d['builder'] = None + return d + + @classmethod + def _decode_markup(cls, markup): + """Ensure `markup` is bytes so it's safe to send into warnings.warn. + + TODO: warnings.warn had this problem back in 2010 but it might not + anymore. + """ + if isinstance(markup, bytes): + decoded = markup.decode('utf-8', 'replace') + else: + decoded = markup + return decoded + + @classmethod + def _check_markup_is_url(cls, markup): + """Error-handling method to raise a warning if incoming markup looks + like a URL. + + :param markup: A string. + """ + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") + elif isinstance(markup, unicode): + space = u' ' + cant_start_with = (u"http:", u"https:") + else: + return + + if any(markup.startswith(prefix) for prefix in cant_start_with): + if not space in markup: + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an' + ' HTTP client. You should probably use an HTTP client like' + ' requests to get the document behind the URL, and feed' + ' that document to Beautiful Soup.' % cls._decode_markup( + markup + ), + MarkupResemblesLocatorWarning + ) + + def _feed(self): + """Internal method that parses previously set markup, creating a large + number of Tag and NavigableString objects. + """ + # Convert the document to Unicode. + self.builder.reset() + + self.builder.feed(self.markup) + # Close out any unfinished strings and close all the open tags. + self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + """Reset this object to a state as though it had never parsed any + markup. 
+        """
+        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
+        self.hidden = 1
+        self.builder.reset()
+        self.current_data = []
+        self.currentTag = None
+        self.tagStack = []
+        self.open_tag_counter = Counter()
+        self.preserve_whitespace_tag_stack = []
+        self.string_container_stack = []
+        self.pushTag(self)
+
+    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
+                sourceline=None, sourcepos=None, **kwattrs):
+        """Create a new Tag associated with this BeautifulSoup object.
+
+        :param name: The name of the new Tag.
+        :param namespace: The URI of the new Tag's XML namespace, if any.
+        :param nsprefix: The prefix for the new Tag's XML namespace, if any.
+        :param attrs: A dictionary of this Tag's attribute values; can
+            be used instead of `kwattrs` for attributes like 'class'
+            that are reserved words in Python.
+        :param sourceline: The line number where this tag was
+            (purportedly) found in its source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was (purportedly) found.
+        :param kwattrs: Keyword arguments for the new Tag's attribute values.
+
+        """
+        kwattrs.update(attrs)
+        return self.element_classes.get(Tag, Tag)(
+            None, self.builder, name, namespace, nsprefix, kwattrs,
+            sourceline=sourceline, sourcepos=sourcepos
+        )
+
+    def string_container(self, base_class=None):
+        container = base_class or NavigableString
+
+        # There may be a general override of NavigableString.
+        container = self.element_classes.get(
+            container, container
+        )
+
+        # On top of that, we may be inside a tag that needs a special
+        # container class.
+        if self.string_container_stack:
+            container = self.builder.string_containers.get(
+                self.string_container_stack[-1].name, container
+            )
+        return container
+
+    def new_string(self, s, subclass=None):
+        """Create a new NavigableString associated with this BeautifulSoup
+        object.
+        """
+        container = self.string_container(subclass)
+        return container(s)
+
+    def insert_before(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+        """
+        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
+
+    def insert_after(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+ """ + raise NotImplementedError("BeautifulSoup objects don't support insert_after().") + + def popTag(self): + """Internal method called by _popToTag when a tag is closed.""" + tag = self.tagStack.pop() + if tag.name in self.open_tag_counter: + self.open_tag_counter[tag.name] -= 1 + if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: + self.preserve_whitespace_tag_stack.pop() + if self.string_container_stack and tag == self.string_container_stack[-1]: + self.string_container_stack.pop() + #print("Pop", tag.name) + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + """Internal method called by handle_starttag when a tag is opened.""" + #print("Push", tag.name) + if self.currentTag is not None: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + if tag.name != self.ROOT_TAG_NAME: + self.open_tag_counter[tag.name] += 1 + if tag.name in self.builder.preserve_whitespace_tags: + self.preserve_whitespace_tag_stack.append(tag) + if tag.name in self.builder.string_containers: + self.string_container_stack.append(tag) + + def endData(self, containerClass=None): + """Method called by the TreeBuilder when the end of a data segment + occurs. + """ + containerClass = self.string_container(containerClass) + + if self.current_data: + current_data = u''.join(self.current_data) + # If whitespace is not preserved, and this string contains + # nothing but ASCII spaces, replace it with a single space + # or newline. + if not self.preserve_whitespace_tag_stack: + strippable = True + for i in current_data: + if i not in self.ASCII_SPACES: + strippable = False + break + if strippable: + if '\n' in current_data: + current_data = '\n' + else: + current_data = ' ' + + # Reset the data collector. + self.current_data = [] + + # Should we add this string to the tree at all? + if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(current_data)): + return + + o = containerClass(current_data) + self.object_was_parsed(o) + + def object_was_parsed(self, o, parent=None, most_recent_element=None): + """Method called by the TreeBuilder to integrate an object into the parse tree.""" + if parent is None: + parent = self.currentTag + if most_recent_element is not None: + previous_element = most_recent_element + else: + previous_element = self._most_recent_element + + next_element = previous_sibling = next_sibling = None + if isinstance(o, Tag): + next_element = o.next_element + next_sibling = o.next_sibling + previous_sibling = o.previous_sibling + if previous_element is None: + previous_element = o.previous_element + + fix = parent.next_element is not None + + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) + + self._most_recent_element = o + parent.contents.append(o) + + # Check if we are inserting into an already parsed node. + if fix: + self._linkage_fixer(parent) + + def _linkage_fixer(self, el): + """Make sure linkage of this fragment is sound.""" + + first = el.contents[0] + child = el.contents[-1] + descendant = child + + if child is first and el.parent is not None: + # Parent should be linked to first child + el.next_element = child + # We are no longer linked to whatever this element is + prev_el = child.previous_element + if prev_el is not None and prev_el is not el: + prev_el.next_element = None + # First child should be linked to the parent, and no previous siblings. 
+            child.previous_element = el
+            child.previous_sibling = None
+
+            # We have no sibling as we've been appended as the last.
+            child.next_sibling = None
+
+            # This index is a tag, dig deeper for a "last descendant"
+            if isinstance(child, Tag) and child.contents:
+                descendant = child._last_descendant(False)
+
+        # As the final step, link last descendant. It should be linked
+        # to the parent's next sibling (if found), else walk up the chain
+        # and find a parent with a sibling. It should have no next sibling.
+        descendant.next_element = None
+        descendant.next_sibling = None
+        target = el
+        while True:
+            if target is None:
+                break
+            elif target.next_sibling is not None:
+                descendant.next_element = target.next_sibling
+                target.next_sibling.previous_element = child
+                break
+            target = target.parent
+
+    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
+        """Pops the tag stack up to and including the most recent
+        instance of the given tag.
+
+        If there are no open tags with the given name, nothing will be
+        popped.
+
+        :param name: Pop up to the most recent tag with this name.
+        :param nsprefix: The namespace prefix that goes with `name`.
+        :param inclusivePop: If this is false, pops the tag stack up
+          to but *not* including the most recent instance of the
+          given tag.
+
+        """
+        #print("Popping to %s" % name)
+        if name == self.ROOT_TAG_NAME:
+            # The BeautifulSoup object itself can never be popped.
+            return
+
+        most_recently_popped = None
+
+        stack_size = len(self.tagStack)
+        for i in range(stack_size - 1, 0, -1):
+            if not self.open_tag_counter.get(name):
+                break
+            t = self.tagStack[i]
+            if (name == t.name and nsprefix == t.prefix):
+                if inclusivePop:
+                    most_recently_popped = self.popTag()
+                break
+            most_recently_popped = self.popTag()
+
+        return most_recently_popped
+
+    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
+                        sourcepos=None):
+        """Called by the tree builder when a new tag is encountered.
+
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+        :param attrs: A dictionary of attribute values.
+        :param sourceline: The line number where this tag was found in its
+            source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was found.
+
+        If this method returns None, the tag was rejected by an active
+        SoupStrainer. You should proceed as if the tag had not occurred
+        in the document. For instance, if this was a self-closing tag,
+        don't call handle_endtag.
+        """
+        # print("Start tag %s: %s" % (name, attrs))
+        self.endData()
+
+        if (self.parse_only and len(self.tagStack) <= 1
+            and (self.parse_only.text
+                 or not self.parse_only.search_tag(name, attrs))):
+            return None
+
+        tag = self.element_classes.get(Tag, Tag)(
+            self, self.builder, name, namespace, nsprefix, attrs,
+            self.currentTag, self._most_recent_element,
+            sourceline=sourceline, sourcepos=sourcepos
+        )
+        if tag is None:
+            return tag
+        if self._most_recent_element is not None:
+            self._most_recent_element.next_element = tag
+        self._most_recent_element = tag
+        self.pushTag(tag)
+        return tag
+
+    def handle_endtag(self, name, nsprefix=None):
+        """Called by the tree builder when an ending tag is encountered.
+
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+ """ + #print("End tag: " + name) + self.endData() + self._popToTag(name, nsprefix) + + def handle_data(self, data): + """Called by the tree builder when a chunk of textual data is encountered.""" + self.current_data.append(data) + + def decode(self, pretty_print=False, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a string or Unicode representation of the parse tree + as an HTML or XML document. + + :param pretty_print: If this is True, indentation will be used to + make the document more readable. + :param eventual_encoding: The encoding of the final document. + If this is None, the document will be a Unicode string. + """ + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS: + # This is a special Python encoding; it can't actually + # go into an XML document because it means nothing + # outside of Python. + eventual_encoding = None + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = u'<?xml version="1.0"%s?>\n' % encoding_part + else: + prefix = u'' + if not pretty_print: + indent_level = None + else: + indent_level = 0 + return prefix + super(BeautifulSoup, self).decode( + indent_level, eventual_encoding, formatter) + +# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup' +_s = BeautifulSoup +_soup = BeautifulSoup + +class BeautifulStoneSoup(BeautifulSoup): + """Deprecated interface to an XML parser.""" + + def __init__(self, *args, **kwargs): + kwargs['features'] = 'xml' + warnings.warn( + 'The BeautifulStoneSoup class is deprecated. Instead of using ' + 'it, pass features="xml" into the BeautifulSoup constructor.') + super(BeautifulStoneSoup, self).__init__(*args, **kwargs) + + +class StopParsing(Exception): + """Exception raised by a TreeBuilder if it's unable to continue parsing.""" + pass + +class FeatureNotFound(ValueError): + """Exception raised by the BeautifulSoup constructor if no parser with the + requested features is found. + """ + pass + + +#If this file is run as a script, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print(soup.prettify()) diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/__init__.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/__init__.py new file mode 100644 index 00000000000..03da4c6e40c --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/__init__.py @@ -0,0 +1,519 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +from collections import defaultdict +import itertools +import sys +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + Stylesheet, + Script, + TemplateString, + nonwhitespace_re +) + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + 'TreeBuilderRegistry', + ] + +# Some useful features for a TreeBuilder to have. +FAST = 'fast' +PERMISSIVE = 'permissive' +STRICT = 'strict' +XML = 'xml' +HTML = 'html' +HTML_5 = 'html5' + + +class TreeBuilderRegistry(object): + """A way of looking up TreeBuilder subclasses by their name or by desired + features. + """ + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features. + + :param treebuilder_class: A subclass of Treebuilder. 
its .features + attribute should list its features. + """ + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features): + """Look up a TreeBuilder subclass with the desired features. + + :param features: A list of features to look for. If none are + provided, the most recently registered TreeBuilder subclass + will be used. + :return: A TreeBuilder subclass, or None if there's no + registered subclass with all the requested features. + """ + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. + features = list(features) + features.reverse() + candidates = None + candidate_set = None + while len(features) > 0: + feature = features.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection( + set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. + if candidate_set is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + +# The BeautifulSoup class will take feature lists from developers and use them +# to look up builders in this registry. +builder_registry = TreeBuilderRegistry() + +class TreeBuilder(object): + """Turn a textual document into a Beautiful Soup object tree.""" + + NAME = "[Unknown tree builder]" + ALTERNATE_NAMES = [] + features = [] + + is_xml = False + picklable = False + empty_element_tags = None # A tag will be considered an empty-element + # tag when and only when it has no contents. + + # A value for these tag/attribute combinations is a space- or + # comma-separated list of CDATA, rather than a single CDATA. + DEFAULT_CDATA_LIST_ATTRIBUTES = {} + + # Whitespace should be preserved inside these tags. + DEFAULT_PRESERVE_WHITESPACE_TAGS = set() + + # The textual contents of tags with these names should be + # instantiated with some class other than NavigableString. + DEFAULT_STRING_CONTAINERS = {} + + USE_DEFAULT = object() + + # Most parsers don't keep track of line numbers. + TRACKS_LINE_NUMBERS = False + + def __init__(self, multi_valued_attributes=USE_DEFAULT, + preserve_whitespace_tags=USE_DEFAULT, + store_line_numbers=USE_DEFAULT, + string_containers=USE_DEFAULT, + ): + """Constructor. + + :param multi_valued_attributes: If this is set to None, the + TreeBuilder will not turn any values for attributes like + 'class' into lists. Setting this to a dictionary will + customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES + for an example. + + Internally, these are called "CDATA list attributes", but that + probably doesn't make sense to an end-user, so the argument name + is `multi_valued_attributes`. + + :param preserve_whitespace_tags: A list of tags to treat + the way <pre> tags are treated in HTML. Tags in this list + are immune from pretty-printing; their contents will always be + output as-is. 
+ + :param string_containers: A dictionary mapping tag names to + the classes that should be instantiated to contain the textual + contents of those tags. The default is to use NavigableString + for every tag, no matter what the name. You can override the + default by changing DEFAULT_STRING_CONTAINERS. + + :param store_line_numbers: If the parser keeps track of the + line numbers and positions of the original markup, that + information will, by default, be stored in each corresponding + `Tag` object. You can turn this off by passing + store_line_numbers=False. If the parser you're using doesn't + keep track of this information, then setting store_line_numbers=True + will do nothing. + """ + self.soup = None + if multi_valued_attributes is self.USE_DEFAULT: + multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES + self.cdata_list_attributes = multi_valued_attributes + if preserve_whitespace_tags is self.USE_DEFAULT: + preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS + self.preserve_whitespace_tags = preserve_whitespace_tags + if store_line_numbers == self.USE_DEFAULT: + store_line_numbers = self.TRACKS_LINE_NUMBERS + self.store_line_numbers = store_line_numbers + if string_containers == self.USE_DEFAULT: + string_containers = self.DEFAULT_STRING_CONTAINERS + self.string_containers = string_containers + + def initialize_soup(self, soup): + """The BeautifulSoup object has been initialized and is now + being associated with the TreeBuilder. + + :param soup: A BeautifulSoup object. + """ + self.soup = soup + + def reset(self): + """Do any work necessary to reset the underlying parser + for a new document. + + By default, this does nothing. + """ + pass + + def can_be_empty_element(self, tag_name): + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTMLBuilder does not consider a <p> tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty <p> tag + will be presented as "<p></p>", not "<p/>" or "<p>". + + The default implementation has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no children. + "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will + be left alone. + + :param tag_name: The name of a markup tag. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + + def feed(self, markup): + """Run some incoming markup through some parsing process, + populating the `BeautifulSoup` object in self.soup. + + This method is not implemented in TreeBuilder; it must be + implemented in subclasses. + + :return: None. + """ + raise NotImplementedError() + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None, exclude_encodings=None): + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: Some markup -- probably a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. 
Each strategy will be tried + in turn. + + By default, the only strategy is to parse the markup + as-is. See `LXMLTreeBuilderForXML` and + `HTMLParserTreeBuilder` for implementations that take into + account the quirks of particular parsers. + """ + yield markup, None, None, False + + def test_fragment_to_document(self, fragment): + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty <head> tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of tests. + + :param fragment: A string -- fragment of HTML. + :return: A string -- a full HTML document. + """ + return fragment + + def set_up_substitutions(self, tag): + """Set up any substitutions that will need to be performed on + a `Tag` when it's output as a string. + + By default, this does nothing. See `HTMLTreeBuilder` for a + case where this is used. + + :param tag: A `Tag` + :return: Whether or not a substitution was performed. + """ + return False + + def _replace_cdata_list_attribute_values(self, tag_name, attrs): + """When an attribute value is associated with a tag that can + have multiple values for that attribute, convert the string + value to a list of strings. + + Basically, replaces class="foo bar" with class=["foo", "bar"] + + NOTE: This method modifies its input in place. + + :param tag_name: The name of a tag. + :param attrs: A dictionary containing the tag's attributes. + Any appropriate attribute values will be modified in place. + """ + if not attrs: + return attrs + if self.cdata_list_attributes: + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), None) + for attr in attrs.keys(): + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. + value = attrs[attr] + if isinstance(value, basestring): + values = nonwhitespace_re.findall(value) + else: + # html5lib sometimes calls setAttributes twice + # for the same tag when rearranging the parse + # tree. On the second call the attribute value + # here is already a list. If this happens, + # leave the value alone rather than trying to + # split it again. + values = value + attrs[attr] = values + return attrs + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events. + + This is not currently used for anything, but it demonstrates + how a simple TreeBuilder would work. + """ + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in list(attrs.items())) + #print("Start %s, %r" % (name, attrs)) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print("End %s" % name) + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. + self.endElement(nodeName) + #handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix, nodeValue): + # Ignore the prefix for now. 
+        pass
+
+    def endPrefixMapping(self, prefix):
+        # Ignore the prefix for now.
+        # handler.endPrefixMapping(prefix)
+        pass
+
+    def characters(self, content):
+        self.soup.handle_data(content)
+
+    def startDocument(self):
+        pass
+
+    def endDocument(self):
+        pass
+
+
+class HTMLTreeBuilder(TreeBuilder):
+    """This TreeBuilder knows facts about HTML, such as which tags
+    are empty-element tags.
+    """
+
+    empty_element_tags = set([
+        # These are from HTML5.
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+
+        # These are from earlier versions of HTML and are removed in HTML5.
+        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
+    ])
+
+    # The HTML standard defines these as block-level elements. Beautiful
+    # Soup does not treat these elements differently from other elements,
+    # but it may do so eventually, and this information is available if
+    # you need to use it.
+    block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
+    # The HTML standard defines an unusual content model for these tags.
+    # We represent this by using a string class other than NavigableString
+    # inside these tags.
+    #
+    # I made this list by going through the HTML spec
+    # (https://html.spec.whatwg.org/#metadata-content) and looking for
+    # "metadata content" elements that can contain strings.
+    #
+    # TODO: Arguably <noscript> could go here but it seems
+    # qualitatively different from the other tags.
+    DEFAULT_STRING_CONTAINERS = {
+        'style': Stylesheet,
+        'script': Script,
+        'template': TemplateString,
+    }
+
+    # The HTML standard defines these attributes as containing a
+    # space-separated list of values, not a single value. That is,
+    # class="foo bar" means that the 'class' attribute has two values,
+    # 'foo' and 'bar', not the single value 'foo bar'. When we
+    # encounter one of these attributes, we will parse its value into
+    # a list of values if possible. Upon output, the list will be
+    # converted back into a string.
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {
+        "*" : ['class', 'accesskey', 'dropzone'],
+        "a" : ['rel', 'rev'],
+        "link" : ['rel', 'rev'],
+        "td" : ["headers"],
+        "th" : ["headers"],
+        "form" : ["accept-charset"],
+        "object" : ["archive"],
+
+        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
+        "area" : ["rel"],
+        "icon" : ["sizes"],
+        "iframe" : ["sandbox"],
+        "output" : ["for"],
+    }
+
+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
+    def set_up_substitutions(self, tag):
+        """Replace the declared encoding in a <meta> tag with a placeholder,
+        to be substituted when the tag is output to a string.
+
+        An HTML document may come in to Beautiful Soup as one
+        encoding, but exit in a different encoding, and the <meta> tag
+        needs to be changed to reflect this.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
+        # We are only interested in <meta> tags
+        if tag.name != 'meta':
+            return False
+
+        http_equiv = tag.get('http-equiv')
+        content = tag.get('content')
+        charset = tag.get('charset')
+
+        # We are interested in <meta> tags that say what encoding the
+        # document was originally in.
This means HTML 5-style <meta> + # tags that provide the "charset" attribute. It also means + # HTML 4-style <meta> tags that provide the "content" + # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. + meta_encoding = None + if charset is not None: + # HTML 5 style: + # <meta charset="utf8"> + meta_encoding = charset + tag['charset'] = CharsetMetaAttributeValue(charset) + + elif (content is not None and http_equiv is not None + and http_equiv.lower() == 'content-type'): + # HTML 4 style: + # <meta http-equiv="content-type" content="text/html; charset=utf8"> + tag['content'] = ContentMetaAttributeValue(content) + + return (meta_encoding is not None) + +def register_treebuilders_from(module): + """Copy TreeBuilders from the given module into this module.""" + this_module = sys.modules[__name__] + for name in module.__all__: + obj = getattr(module, name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. + this_module.builder_registry.register(obj) + +class ParserRejectedMarkup(Exception): + """An Exception to be raised when the underlying parser simply + refuses to parse the given markup. + """ + def __init__(self, message_or_exception): + """Explain why the parser rejected the given markup, either + with a textual explanation or another exception. + """ + if isinstance(message_or_exception, Exception): + e = message_or_exception + message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e)) + super(ParserRejectedMarkup, self).__init__(message_or_exception) + +# Builders are registered in reverse order of priority, so that custom +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last resort. +from . import _htmlparser +register_treebuilders_from(_htmlparser) +try: + from . import _html5lib + register_treebuilders_from(_html5lib) +except ImportError: + # They don't have html5lib installed. + pass +try: + from . import _lxml + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. + pass diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/_html5lib.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/_html5lib.py new file mode 100644 index 00000000000..a1c6134c165 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/_html5lib.py @@ -0,0 +1,467 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + 'HTML5TreeBuilder', + ] + +import warnings +import re +from bs4.builder import ( + PERMISSIVE, + HTML, + HTML_5, + HTMLTreeBuilder, + ) +from bs4.element import ( + NamespacedAttribute, + nonwhitespace_re, +) +import html5lib +from html5lib.constants import ( + namespaces, + prefixes, + ) +from bs4.element import ( + Comment, + Doctype, + NavigableString, + Tag, + ) + +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError, e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree. 
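+
+    Typical use is indirect, by naming the feature when building the
+    soup (an illustrative note, not part of the upstream docstring):
+
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup("<p>Some <b>bad<i>markup", "html5lib")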
+ + Note that this TreeBuilder does not support some features common + to HTML TreeBuilders. Some of these features could theoretically + be implemented, but at the very least it's quite difficult, + because html5lib moves the parse tree around as it's being built. + + * This TreeBuilder doesn't use different subclasses of NavigableString + based on the name of the tag in which the string was found. + + * You can't use a SoupStrainer to parse only part of a document. + """ + + NAME = "html5lib" + + features = [NAME, PERMISSIVE, HTML_5, HTML] + + # html5lib can tell us which line number and position in the + # original file is the source of an element. + TRACKS_LINE_NUMBERS = True + + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") + yield (markup, None, None, False) + + # These methods are defined by Beautiful Soup. + def feed(self, markup): + if self.soup.parse_only is not None: + warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + self.underlying_builder.parser = parser + extra_kwargs = dict() + if not isinstance(markup, unicode): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) + + # Set the character encoding detected by the tokenizer. + if isinstance(markup, unicode): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, basestring): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. + original_encoding = original_encoding.name + doc.original_encoding = original_encoding + self.underlying_builder.parser = None + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + namespaceHTMLElements, self.soup, + store_line_numbers=self.store_line_numbers + ) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<html><head></head><body>%s</body></html>' % fragment + + +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): + + def __init__(self, namespaceHTMLElements, soup=None, + store_line_numbers=True, **kwargs): + if soup: + self.soup = soup + else: + from bs4 import BeautifulSoup + # TODO: Why is the parser 'html.parser' here? To avoid an + # infinite loop? + self.soup = BeautifulSoup( + "", "html.parser", store_line_numbers=store_line_numbers, + **kwargs + ) + # TODO: What are **kwargs exactly? Should they be passed in + # here in addition to/instead of being passed to the BeautifulSoup + # constructor? 
+ super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + # This will be set later to an html5lib.html5parser.HTMLParser + # object, which we can use to track the current line number. + self.parser = None + self.store_line_numbers = store_line_numbers + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + kwargs = {} + if self.parser and self.store_line_numbers: + # This represents the point immediately after the end of the + # tag. We don't know when the tag started, but we do know + # where it ended -- the character just before this one. + sourceline, sourcepos = self.parser.tokenizer.stream.position() + kwargs['sourceline'] = sourceline + kwargs['sourcepos'] = sourcepos-1 + tag = self.soup.new_tag(name, namespace, **kwargs) + + return Element(tag, self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + from bs4 import BeautifulSoup + # TODO: Why is the parser 'html.parser' here? To avoid an + # infinite loop? + self.soup = BeautifulSoup("", "html.parser") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + # XXX This code is not covered by the BS4 tests. + self.soup.append(node.element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return treebuilder_base.TreeBuilder.getFragment(self).element + + def testSerializer(self, element): + from bs4 import BeautifulSoup + rv = [] + doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') + + def serializeElement(element, indent=0): + if isinstance(element, BeautifulSoup): + pass + if isinstance(element, Doctype): + m = doctype_re.match(element) + if m: + name = m.group(1) + if m.lastindex > 1: + publicId = m.group(2) or "" + systemId = m.group(3) or m.group(4) or "" + rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" % + (' ' * indent, name, publicId, systemId)) + else: + rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name)) + else: + rv.append("|%s<!DOCTYPE >" % (' ' * indent,)) + elif isinstance(element, Comment): + rv.append("|%s<!-- %s -->" % (' ' * indent, element)) + elif isinstance(element, NavigableString): + rv.append("|%s\"%s\"" % (' ' * indent, element)) + else: + if element.namespace: + name = "%s %s" % (prefixes[element.namespace], + element.name) + else: + name = element.name + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] + for name, value in element.attrs.items(): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): + value = " ".join(value) + attributes.append((name, value)) + + for name, value in sorted(attributes): + rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) + indent += 2 + for child in element.children: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return list(self.attrs.items()).__iter__() + def __setitem__(self, name, value): + # If this attribute is a multi-valued attribute for this element, + # 
turn its value into a list. + list_attr = self.element.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + # A node that is being cloned may have already undergone + # this procedure. + if not isinstance(value, list): + value = nonwhitespace_re.findall(value) + self.element[name] = value + def items(self): + return list(self.attrs.items()) + def keys(self): + return list(self.attrs.keys()) + def __len__(self): + return len(self.attrs) + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in list(self.attrs.keys()) + + +class Element(treebuilder_base.Node): + def __init__(self, element, soup, namespace): + treebuilder_base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node): + string_child = child = None + if isinstance(node, basestring): + # Some other piece of code decided to pass in a string + # instead of creating a TextElement object to contain the + # string. + string_child = child = node + elif isinstance(node, Tag): + # Some other piece of code decided to pass in a Tag + # instead of creating an Element object to contain the + # Tag. + child = node + elif node.element.__class__ == NavigableString: + string_child = child = node.element + node.parent = self + else: + child = node.element + node.parent = self + + if not isinstance(child, basestring) and child.parent is not None: + node.element.extract() + + if (string_child is not None and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # We are appending a string onto another string. + # TODO This has O(n^2) performance, for input like + # "a</a>a</a>a</a>..." + old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + string_child) + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: + if isinstance(node, basestring): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + + # Tell Beautiful Soup to act as if it parsed this element + # immediately after the parent's last descendant. (Or + # immediately after the parent, if it has no children.) + if self.element.contents: + most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. + most_recent_element = self.soup._last_descendant() + else: + most_recent_element = self.element + + self.soup.object_was_parsed( + child, parent=self.element, + most_recent_element=most_recent_element) + + def getAttributes(self): + if isinstance(self.element, Comment): + return {} + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: + converted_attributes = [] + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in attributes.items(): + self.element[name] = value + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. 
+ # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. + self.soup.builder.set_up_substitutions(self.element) + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self.element.index(refNode.element) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + node.element.extract() + + def reparentChildren(self, new_parent): + """Move all of this tag's children into another tag.""" + # print("MOVE", self.element.contents) + # print("FROM", self.element) + # print("TO", new_parent.element) + + element = self.element + new_parent_element = new_parent.element + # Determine what this tag's next_element will be once all the children + # are removed. + final_next_element = element.next_sibling + + new_parents_last_descendant = new_parent_element._last_descendant(False, False) + if len(new_parent_element.contents) > 0: + # The new parent already contains children. We will be + # appending this tag's children to the end. + new_parents_last_child = new_parent_element.contents[-1] + new_parents_last_descendant_next_element = new_parents_last_descendant.next_element + else: + # The new parent contains no children. + new_parents_last_child = None + new_parents_last_descendant_next_element = new_parent_element.next_element + + to_append = element.contents + if len(to_append) > 0: + # Set the first child's previous_element and previous_sibling + # to elements within the new parent + first_child = to_append[0] + if new_parents_last_descendant is not None: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element + first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant is not None: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child is not None: + new_parents_last_child.next_sibling = first_child + + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant(False, True) + + last_childs_last_descendant.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element is not None: + # TODO: This code has no test coverage and I'm not sure + # how to get html5lib to go through this path, but it's + # just the other side of the previous line. + new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant + last_childs_last_descendant.next_sibling = None + + for child in to_append: + child.parent = new_parent_element + new_parent_element.contents.append(child) + + # Now that this element has no children, change its .next_element. 
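+        # Illustrative sketch (annotation, not upstream code): for markup
+        # like <a><b></b></a><c></c>, reparenting <a>'s children into <c>
+        # leaves <a> empty, so <a>.next_element must become its old next
+        # sibling <c> (final_next_element) instead of its former child <b>.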
+ element.contents = [] + element.next_element = final_next_element + + # print("DONE WITH MOVE") + # print("FROM", self.element) + # print("TO", new_parent_element) + + def cloneNode(self): + tag = self.soup.new_tag(self.element.name, self.namespace) + node = Element(tag, self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + treebuilder_base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/_htmlparser.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/_htmlparser.py new file mode 100644 index 00000000000..96a7b7d4d44 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/_htmlparser.py @@ -0,0 +1,477 @@ +# encoding: utf-8 +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from HTMLParser import HTMLParser + +try: + from HTMLParser import HTMLParseError +except ImportError, e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + +import sys +import warnings + +# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' +# argument, which we'd like to set to False. Unfortunately, +# http://bugs.python.org/issue13273 makes strict=True a better bet +# before Python 3.2.3. +# +# At the end of this file, we monkeypatch HTMLParser so that +# strict=True works well on Python 3.2.2. +major, minor, release = sys.version_info[:3] +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + + +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class BeautifulSoupHTMLParser(HTMLParser): + """A subclass of the Python standard library's HTMLParser class, which + listens for HTMLParser events and translates them into calls + to Beautiful Soup's tree construction API. + """ + + # Strategies for handling duplicate attributes + IGNORE = 'ignore' + REPLACE = 'replace' + + def __init__(self, *args, **kwargs): + """Constructor. + + :param on_duplicate_attribute: A strategy for what to do if a + tag includes the same attribute more than once. Accepted + values are: REPLACE (replace earlier values with later + ones, the default), IGNORE (keep the earliest value + encountered), or a callable. A callable must take three + arguments: the dictionary of attributes already processed, + the name of the duplicate attribute, and the most recent value + encountered. 
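+
+        For example (an illustrative sketch, not part of the upstream
+        docstring, though 'html.parser' and 'ignore' are real names):
+
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup('<a href="u1" href="u2">x</a>',
+                                 'html.parser',
+                                 on_duplicate_attribute='ignore')
+            soup.a['href']  # 'u1' -- the earliest value was kept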
+ """ + self.on_duplicate_attribute = kwargs.pop( + 'on_duplicate_attribute', self.REPLACE + ) + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + def error(self, msg): + """In Python 3, HTMLParser subclasses must implement error(), although + this requirement doesn't appear to be documented. + + In Python 2, HTMLParser implements error() by raising an exception, + which we don't want to do. + + In any event, this method is called only on very strange + markup and our best strategy is to pretend it didn't happen + and keep going. + """ + warnings.warn(msg) + + def handle_startendtag(self, name, attrs): + """Handle an incoming empty-element tag. + + This is only called when the markup looks like <tag/>. + + :param name: Name of the tag. + :param attrs: Dictionary of the tag's attributes. + """ + # is_startend() tells handle_starttag not to close the tag + # just because its name matches a known empty-element tag. We + # know that this is an empty-element tag and we want to call + # handle_endtag ourselves. + tag = self.handle_starttag(name, attrs, handle_empty_element=False) + self.handle_endtag(name) + + def handle_starttag(self, name, attrs, handle_empty_element=True): + """Handle an opening tag, e.g. '<tag>' + + :param name: Name of the tag. + :param attrs: Dictionary of the tag's attributes. + :param handle_empty_element: True if this tag is known to be + an empty-element tag (i.e. there is not expected to be any + closing tag). + """ + # XXX namespace + attr_dict = {} + for key, value in attrs: + # Change None attribute values to the empty string + # for consistency with the other tree builders. + if value is None: + value = '' + if key in attr_dict: + # A single attribute shows up multiple times in this + # tag. How to handle it depends on the + # on_duplicate_attribute setting. + on_dupe = self.on_duplicate_attribute + if on_dupe == self.IGNORE: + pass + elif on_dupe in (None, self.REPLACE): + attr_dict[key] = value + else: + on_dupe(attr_dict, key, value) + else: + attr_dict[key] = value + attrvalue = '""' + #print("START", name) + sourceline, sourcepos = self.getpos() + tag = self.soup.handle_starttag( + name, None, None, attr_dict, sourceline=sourceline, + sourcepos=sourcepos + ) + if tag and tag.is_empty_element and handle_empty_element: + # Unlike other parsers, html.parser doesn't send separate end tag + # events for empty-element tags. (It's handled in + # handle_startendtag, but only if the original markup looked like + # <tag/>.) + # + # So we need to call handle_endtag() ourselves. Since we + # know the start event is identical to the end event, we + # don't want handle_endtag() to cross off any previous end + # events for tags of this name. + self.handle_endtag(name, check_already_closed=False) + + # But we might encounter an explicit closing tag for this tag + # later on. If so, we want to ignore it. + self.already_closed_empty_element.append(name) + + def handle_endtag(self, name, check_already_closed=True): + """Handle a closing tag, e.g. '</tag>' + + :param name: A tag name. 
+ :param check_already_closed: True if this tag is expected to + be the closing portion of an empty-element tag, + e.g. '<tag></tag>'. + """ + #print("END", name) + if check_already_closed and name in self.already_closed_empty_element: + # This is a redundant end tag for an empty-element tag. + # We've already called handle_endtag() for it, so just + # check it off the list. + #print("ALREADY CLOSED", name) + self.already_closed_empty_element.remove(name) + else: + self.soup.handle_endtag(name) + + def handle_data(self, data): + """Handle some textual data that shows up between tags.""" + self.soup.handle_data(data) + + def handle_charref(self, name): + """Handle a numeric character reference by converting it to the + corresponding Unicode character and treating it as textual + data. + + :param name: Character number, possibly in hexadecimal. + """ + # XXX workaround for a bug in HTMLParser. Remove this once + # it's fixed in all supported versions. + # http://bugs.python.org/issue13633 + if name.startswith('x'): + real_name = int(name.lstrip('x'), 16) + elif name.startswith('X'): + real_name = int(name.lstrip('X'), 16) + else: + real_name = int(name) + + data = None + if real_name < 256: + # HTML numeric entities are supposed to reference Unicode + # code points, but sometimes they reference code points in + # some other encoding (ahem, Windows-1252). E.g. “ + # instead of É for LEFT DOUBLE QUOTATION MARK. This + # code tries to detect this situation and compensate. + for encoding in (self.soup.original_encoding, 'windows-1252'): + if not encoding: + continue + try: + data = bytearray([real_name]).decode(encoding) + except UnicodeDecodeError, e: + pass + if not data: + try: + data = unichr(real_name) + except (ValueError, OverflowError), e: + pass + data = data or u"\N{REPLACEMENT CHARACTER}" + self.handle_data(data) + + def handle_entityref(self, name): + """Handle a named entity reference by converting it to the + corresponding Unicode character and treating it as textual + data. + + :param name: Name of the entity reference. + """ + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + # If this were XML, it would be ambiguous whether "&foo" + # was an character entity reference with a missing + # semicolon or the literal string "&foo". Since this is + # HTML, we have a complete list of all character entity references, + # and this one wasn't found, so assume it's the literal string "&foo". + data = "&%s" % name + self.handle_data(data) + + def handle_comment(self, data): + """Handle an HTML comment. + + :param data: The text of the comment. + """ + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + """Handle a DOCTYPE declaration. + + :param data: The text of the declaration. + """ + self.soup.endData() + data = data[len("DOCTYPE "):] + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + """Handle a declaration of unknown type -- probably a CDATA block. + + :param data: The text of the declaration. + """ + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + """Handle a processing instruction. + + :param data: The text of the instruction. 
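+
+        For instance (an illustrative note, not from the upstream
+        docstring), html.parser reports markup like "<?proc color='red'>"
+        by calling this method with data == "proc color='red'".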
+ """ + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser, + found in the Python standard library. + """ + is_xml = False + picklable = True + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] + + # The html.parser knows which line number and position in the + # original file is the source of an element. + TRACKS_LINE_NUMBERS = True + + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + """Constructor. + + :param parser_args: Positional arguments to pass into + the BeautifulSoupHTMLParser constructor, once it's + invoked. + :param parser_kwargs: Keyword arguments to pass into + the BeautifulSoupHTMLParser constructor, once it's + invoked. + :param kwargs: Keyword arguments for the superclass constructor. + """ + # Some keyword arguments will be pulled out of kwargs and placed + # into parser_kwargs. + extra_parser_kwargs = dict() + for arg in ('on_duplicate_attribute',): + if arg in kwargs: + value = kwargs.pop(arg) + extra_parser_kwargs[arg] = value + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} + parser_kwargs.update(extra_parser_kwargs) + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: + parser_kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None, exclude_encodings=None): + + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: Some markup -- probably a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. + """ + if isinstance(markup, unicode): + # Parse Unicode as-is. + yield (markup, None, None, False) + return + + # Ask UnicodeDammit to sniff the most likely encoding. + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) + yield (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + """Run some incoming markup through some parsing process, + populating the `BeautifulSoup` object in self.soup. + """ + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + try: + parser.feed(markup) + parser.close() + except HTMLParseError, e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. 
See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e + parser.already_closed_empty_element = [] + +# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some +# 3.2.3 code. This ensures they don't treat markup like <p></p> as a +# string. +# +# XXX This code can be removed once most Python 3 users are on 3.2.3. +if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: + import re + attrfind_tolerant = re.compile( + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant + + locatestarttagend = re.compile(r""" + <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name + (?:\s+ # whitespace before attribute name + (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name + (?:\s*=\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |\"[^\"]*\" # LIT-enclosed value + |[^'\">\s]+ # bare value + ) + )? + ) + )* + \s* # trailing whitespace +""", re.VERBOSE) + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend + + from html.parser import tagfind, attrfind + + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = rawdata[i+1:k].lower() + while k < endpos: + if self.strict: + m = attrfind.match(rawdata, k) + else: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = self.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + else: + offset = offset + len(self.__starttag_text) + if self.strict: + self.error("junk characters in start tag: %r" + % (rawdata[k:endpos][:20],)) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: <span attr="value" /> + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) + + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode + + CONSTRUCTOR_TAKES_STRICT = True diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/_lxml.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/_lxml.py new file mode 100644 index 00000000000..1b44d7516bf --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/builder/_lxml.py @@ -0,0 +1,332 @@ +# Use of this source code is governed by the MIT license. 
+__license__ = "MIT" + +__all__ = [ + 'LXMLTreeBuilderForXML', + 'LXMLTreeBuilder', + ] + +try: + from collections.abc import Callable # Python 3.6 +except ImportError , e: + from collections import Callable + +from io import BytesIO +from StringIO import StringIO +from lxml import etree +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, + XMLProcessingInstruction, +) +from bs4.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + ParserRejectedMarkup, + TreeBuilder, + XML) +from bs4.dammit import EncodingDetector + +LXML = 'lxml' + +def _invert(d): + "Invert a dictionary." + return dict((v,k) for k, v in d.items()) + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser + + is_xml = True + processing_instruction_class = XMLProcessingInstruction + + NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] + + # Well, it's permissive by XML parser standards. + features = [NAME, LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE = 512 + + # This namespace mapping is specified in the XML Namespace + # standard. + DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') + + DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) + + # NOTE: If we parsed Element objects and looked at .sourceline, + # we'd be able to see the line numbers from the original document. + # But instead we build an XMLParser or HTMLParser object to serve + # as the target of parse messages, and those messages don't include + # line numbers. + # See: https://bugs.launchpad.net/lxml/+bug/1846906 + + def initialize_soup(self, soup): + """Let the BeautifulSoup object know about the standard namespace + mapping. + + :param soup: A `BeautifulSoup`. + """ + super(LXMLTreeBuilderForXML, self).initialize_soup(soup) + self._register_namespaces(self.DEFAULT_NSMAPS) + + def _register_namespaces(self, mapping): + """Let the BeautifulSoup object know about namespaces encountered + while parsing the document. + + This might be useful later on when creating CSS selectors. + + :param mapping: A dictionary mapping namespace prefixes to URIs. + """ + for key, value in mapping.items(): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same + # prefix, the first one in the document takes precedence. + self.soup._namespaces[key] = value + + def default_parser(self, encoding): + """Find the default parser for the given encoding. + + :param encoding: A string. + :return: Either a parser object or a class, which + will be instantiated with default arguments. + """ + if self._default_parser is not None: + return self._default_parser + return etree.XMLParser( + target=self, strip_cdata=False, recover=True, encoding=encoding) + + def parser_for(self, encoding): + """Instantiate an appropriate parser for the given encoding. + + :param encoding: A string. + :return: A parser object such as an `etree.XMLParser`. + """ + # Use the default parser. + parser = self.default_parser(encoding) + + if isinstance(parser, Callable): + # Instantiate the parser with default arguments + parser = parser( + target=self, strip_cdata=False, recover=True, encoding=encoding + ) + return parser + + def __init__(self, parser=None, empty_element_tags=None, **kwargs): + # TODO: Issue a warning if parser is present but not a + # callable, since that means there's no way to create new + # parsers for different encodings. 
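+        # Annotation (not upstream code): as parser_for() above shows, a
+        # parser *class* stored here is re-instantiated for every encoding
+        # attempt with target=self, while an already-constructed parser
+        # object is reused as-is for each encoding.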
+ self._default_parser = parser + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) + self.soup = None + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + super(LXMLTreeBuilderForXML, self).__init__(**kwargs) + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) + + def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, + document_declared_encoding=None): + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + lxml really wants to get a bytestring and convert it to + Unicode itself. So instead of using UnicodeDammit to convert + the bytestring to Unicode using different encodings, this + implementation uses EncodingDetector to iterate over the + encodings, and tell lxml to try to parse the document as each + one in turn. + + :param markup: Some markup -- hopefully a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. + """ + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + + if isinstance(markup, unicode): + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + + if isinstance(markup, unicode): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. + yield (markup.encode("utf8"), "utf8", + document_declared_encoding, False) + + try_encodings = [user_specified_encoding, document_declared_encoding] + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) + for encoding in detector.encodings: + yield (detector.markup, encoding, document_declared_encoding, False) + + def feed(self, markup): + if isinstance(markup, bytes): + markup = BytesIO(markup) + elif isinstance(markup, unicode): + markup = StringIO(markup) + + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + try: + self.parser = self.parser_for(self.soup.original_encoding) + self.parser.feed(data) + while len(data) != 0: + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if len(data) != 0: + self.parser.feed(data) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError), e: + raise ParserRejectedMarkup(e) + + def close(self): + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + + def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. + attrs = dict(attrs) + nsprefix = None + # Invert each namespace map as it comes in. + if len(nsmap) == 0 and len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. 
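+            # Illustrative sketch (annotation, not upstream code), assuming
+            # lxml reports only newly-declared prefixes in `nsmap`: given
+            #   <root xmlns:ns="http://x/"><ns:a/></root>
+            # <root> pushes the inverted {'http://x/': 'ns'} mapping and
+            # <ns:a> lands in this branch, pushing None, so that end() can
+            # still pop exactly one entry per closed tag.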
+ self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + + # First, Let the BeautifulSoup object know about it. + self._register_namespaces(nsmap) + + # Then, add it to our running list of inverted namespace + # mappings. + self.nsmaps.append(_invert(nsmap)) + + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in nsmap.items(): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} + for attr, value in attrs.items(): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs + + namespace, name = self._getNsTag(name) + nsprefix = self._prefix_for_namespace(namespace) + self.soup.handle_starttag(name, namespace, nsprefix, attrs) + + def _prefix_for_namespace(self, namespace): + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + return None + + def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if len(self.nsmaps) > 1: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. + self.nsmaps.pop() + + def pi(self, target, data): + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(self.processing_instruction_class) + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) + + def comment(self, content): + "Handle comments as Comment objects." 
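+        # Annotation (not upstream code): for markup like "<!-- note -->",
+        # lxml calls this with content == " note "; the calls below wrap
+        # that text in the tree as a Comment node.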
+ self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] + is_xml = False + processing_instruction_class = ProcessingInstruction + + def default_parser(self, encoding): + return etree.HTMLParser + + def feed(self, markup): + encoding = self.soup.original_encoding + try: + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError), e: + raise ParserRejectedMarkup(e) + + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<html><body>%s</body></html>' % fragment diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/dammit.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/dammit.py new file mode 100644 index 00000000000..33f7b7d1be3 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/dammit.py @@ -0,0 +1,939 @@ +# -*- coding: utf-8 -*- +"""Beautiful Soup bonus library: Unicode, Dammit + +This library converts a bytestream to Unicode through any means +necessary. It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It works best on XML and HTML, but it does not rewrite the +XML or HTML to reflect a new encoding; that's the tree builder's job. +""" +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import codecs +from htmlentitydefs import codepoint2name +import re +import logging +import string + +# Import a library to autodetect character encodings. +chardet_type = None +try: + # First try the fast C implementation. + # PyPI package: cchardet + import cchardet + def chardet_dammit(s): + if isinstance(s, unicode): + return None + return cchardet.detect(s)['encoding'] +except ImportError: + try: + # Fall back to the pure Python implementation + # Debian package: python-chardet + # PyPI package: chardet + import chardet + def chardet_dammit(s): + if isinstance(s, unicode): + return None + return chardet.detect(s)['encoding'] + #import chardet.constants + #chardet.constants._debug = 1 + except ImportError: + # No chardet available. + def chardet_dammit(s): + return None + +# Available from http://cjkpython.i18n.org/. +# +# TODO: This doesn't work anymore and the closest thing, iconv_codecs, +# is GPL-licensed. Check whether this is still necessary. +try: + import iconv_codec +except ImportError: + pass + +# Build bytestring and Unicode versions of regular expressions for finding +# a declared encoding inside an XML or HTML document. 
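+# For example (annotation, not upstream code), these patterns are meant to
+# pull the encoding name out of declarations like:
+#   <?xml version="1.0" encoding="ISO-8859-1"?>   -> 'ISO-8859-1'
+#   <meta charset="utf-8">                        -> 'utf-8'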
+xml_encoding = u'^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' +html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' +encoding_res = dict() +encoding_res[bytes] = { + 'html' : re.compile(html_meta.encode("ascii"), re.I), + 'xml' : re.compile(xml_encoding.encode("ascii"), re.I), +} +encoding_res[unicode] = { + 'html' : re.compile(html_meta, re.I), + 'xml' : re.compile(xml_encoding, re.I) +} + +class EntitySubstitution(object): + """The ability to substitute XML or HTML entities for certain characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters_for_re = [] + + # &apos is an XHTML entity and an HTML 5, but not an HTML 4 + # entity. We don't want to use it, but we want to recognize it on the way in. + # + # TODO: Ideally we would be able to recognize all HTML 5 named + # entities, but that's a little tricky. + extra = [(39, 'apos')] + for codepoint, name in list(codepoint2name.items()) + extra: + character = unichr(codepoint) + if codepoint not in (34, 39): + # There's no point in turning the quotation mark into + # " or the single quote into ', unless it + # happens within an attribute value, which is handled + # elsewhere. + characters_for_re.append(character) + lookup[character] = name + # But we do want to recognize those entities on the way in and + # convert them to Unicode characters. + reverse_lookup[name] = character + re_definition = "[%s]" % "".join(characters_for_re) + return lookup, reverse_lookup, re.compile(re_definition) + (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, + CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() + + CHARACTER_TO_XML_ENTITY = { + "'": "apos", + '"': "quot", + "&": "amp", + "<": "lt", + ">": "gt", + } + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" + ")") + + AMPERSAND_OR_BRACKET = re.compile("([<>&])") + + @classmethod + def _substitute_html_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate HTML entity for a special character.""" + entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + return "&%s;" % entity + + @classmethod + def _substitute_xml_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate XML entity for a special character.""" + entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + return "&%s;" % entity + + @classmethod + def quoted_attribute_value(self, value): + """Make a value into a quoted XML attribute, possibly escaping it. + + Most strings will be quoted using double quotes. + + Bob's Bar -> "Bob's Bar" + + If a string contains double quotes, it will be quoted using + single quotes. + + Welcome to "my bar" -> 'Welcome to "my bar"' + + If a string contains both single and double quotes, the + double quotes will be escaped, and the string will be quoted + using double quotes. + + Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" + """ + quote_with = '"' + if '"' in value: + if "'" in value: + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # """ whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between ' and &squot;. + replace_with = """ + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. + # We can use single quotes to quote the attribute. 
+                quote_with = "'"
+        return quote_with + value + quote_with
+
+    @classmethod
+    def substitute_xml(cls, value, make_quoted_attribute=False):
+        """Substitute XML entities for special XML characters.
+
+        :param value: A string to be substituted. The less-than sign
+          will become &lt;, the greater-than sign will become &gt;,
+          and any ampersands will become &amp;. If you want ampersands
+          that appear to be part of an entity definition to be left
+          alone, use substitute_xml_containing_entities() instead.
+
+        :param make_quoted_attribute: If True, then the string will be
+         quoted, as befits an attribute value.
+        """
+        # Escape angle brackets and ampersands.
+        value = cls.AMPERSAND_OR_BRACKET.sub(
+            cls._substitute_xml_entity, value)
+
+        if make_quoted_attribute:
+            value = cls.quoted_attribute_value(value)
+        return value
+
+    @classmethod
+    def substitute_xml_containing_entities(
+        cls, value, make_quoted_attribute=False):
+        """Substitute XML entities for special XML characters.
+
+        :param value: A string to be substituted. The less-than sign will
+          become &lt;, the greater-than sign will become &gt;, and any
+          ampersands that are not part of an entity definition will
+          become &amp;.
+
+        :param make_quoted_attribute: If True, then the string will be
+         quoted, as befits an attribute value.
+        """
+        # Escape angle brackets, and ampersands that aren't part of
+        # entities.
+        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
+            cls._substitute_xml_entity, value)
+
+        if make_quoted_attribute:
+            value = cls.quoted_attribute_value(value)
+        return value
+
+    @classmethod
+    def substitute_html(cls, s):
+        """Replace certain Unicode characters with named HTML entities.
+
+        This differs from data.encode(encoding, 'xmlcharrefreplace')
+        in that the goal is to make the result more readable (to those
+        with ASCII displays) rather than to recover from
+        errors. There's absolutely nothing wrong with a UTF-8 string
+        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
+        character with "&eacute;" will make it more readable to some
+        people.
+
+        :param s: A Unicode string.
+        """
+        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
+            cls._substitute_html_entity, s)
+
+
+class EncodingDetector:
+    """Suggests a number of possible encodings for a bytestring.
+
+    Order of precedence:
+
+    1. Encodings you specifically tell EncodingDetector to try first
+       (the override_encodings argument to the constructor).
+
+    2. An encoding declared within the bytestring itself, either in an
+       XML declaration (if the bytestring is to be interpreted as an XML
+       document), or in a <meta> tag (if the bytestring is to be
+       interpreted as an HTML document.)
+
+    3. An encoding detected through textual analysis by chardet,
+       cchardet, or a similar external library.
+
+    4. UTF-8.
+
+    5. Windows-1252.
+    """
+    def __init__(self, markup, override_encodings=None, is_html=False,
+                 exclude_encodings=None):
+        """Constructor.
+
+        :param markup: Some markup in an unknown encoding.
+        :param override_encodings: These encodings will be tried first.
+        :param is_html: If True, this markup is considered to be HTML. Otherwise
+            it's assumed to be XML.
+        :param exclude_encodings: These encodings will not be tried, even
+            if they otherwise would be.
+        """
+        self.override_encodings = override_encodings or []
+        exclude_encodings = exclude_encodings or []
+        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
+        self.chardet_encoding = None
+        self.is_html = is_html
+        self.declared_encoding = None
+
+        # First order of business: strip a byte-order mark.
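+# Example (editor's sketch, not upstream code): the EntitySubstitution class
+# methods defined above, assuming beautifulsoup4 is installed.
+#
+#   from bs4.dammit import EntitySubstitution
+#
+#   EntitySubstitution.substitute_xml('AT&T <deals>')
+#   # u'AT&amp;T &lt;deals&gt;'
+#   EntitySubstitution.substitute_xml('Say "hi"', make_quoted_attribute=True)
+#   # u'\'Say "hi"\'' -- single-quoted because the value contains double quotes
+#   EntitySubstitution.substitute_html(u'caf\xe9')
+#   # u'caf&eacute;'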
+ self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) + + def _usable(self, encoding, tried): + """Should we even bother to try this encoding? + + :param encoding: Name of an encoding. + :param tried: Encodings that have already been tried. This will be modified + as a side effect. + """ + if encoding is not None: + encoding = encoding.lower() + if encoding in self.exclude_encodings: + return False + if encoding not in tried: + tried.add(encoding) + return True + return False + + @property + def encodings(self): + """Yield a number of encodings that might work for this markup. + + :yield: A sequence of strings. + """ + tried = set() + for e in self.override_encodings: + if self._usable(e, tried): + yield e + + # Did the document originally start with a byte-order mark + # that indicated its encoding? + if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding + + # Look within the document for an XML or HTML encoding + # declaration. + if self.declared_encoding is None: + self.declared_encoding = self.find_declared_encoding( + self.markup, self.is_html) + if self._usable(self.declared_encoding, tried): + yield self.declared_encoding + + # Use third-party character set detection to guess at the + # encoding. + if self.chardet_encoding is None: + self.chardet_encoding = chardet_dammit(self.markup) + if self._usable(self.chardet_encoding, tried): + yield self.chardet_encoding + + # As a last-ditch effort, try utf-8 and windows-1252. + for e in ('utf-8', 'windows-1252'): + if self._usable(e, tried): + yield e + + @classmethod + def strip_byte_order_mark(cls, data): + """If a byte-order mark is present, strip it and return the encoding it implies. + + :param data: Some markup. + :return: A 2-tuple (modified data, implied encoding) + """ + encoding = None + if isinstance(data, unicode): + # Unicode data cannot have a byte-order mark. + return data, encoding + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == b'\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == b'\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == b'\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + return data, encoding + + @classmethod + def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False): + """Given a document, tries to find its declared encoding. + + An XML encoding is declared at the beginning of the document. + + An HTML encoding is declared in a <meta> tag, hopefully near the + beginning of the document. + + :param markup: Some markup. + :param is_html: If True, this markup is considered to be HTML. Otherwise + it's assumed to be XML. + :param search_entire_document: Since an encoding is supposed to declared near the beginning + of the document, most of the time it's only necessary to search a few kilobytes of data. + Set this to True to force this method to search the entire document. 
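+# Example (editor's sketch, not upstream code): the precedence order
+# implemented by the encodings property, plus byte-order-mark stripping.
+#
+#   from bs4.dammit import EncodingDetector
+#
+#   EncodingDetector.strip_byte_order_mark(b'\xef\xbb\xbf<doc/>')
+#   # (b'<doc/>', 'utf-8')
+#
+#   detector = EncodingDetector(
+#       b'<?xml version="1.0" encoding="latin-1"?><doc/>',
+#       override_encodings=['utf-16'])
+#   list(detector.encodings)
+#   # ['utf-16', 'latin-1', 'utf-8', 'windows-1252']
+#   # (a chardet guess appears before 'utf-8' if chardet is installed)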
+ """ + if search_entire_document: + xml_endpos = html_endpos = len(markup) + else: + xml_endpos = 1024 + html_endpos = max(2048, int(len(markup) * 0.05)) + + if isinstance(markup, bytes): + res = encoding_res[bytes] + else: + res = encoding_res[unicode] + + xml_re = res['xml'] + html_re = res['html'] + declared_encoding = None + declared_encoding_match = xml_re.search(markup, endpos=xml_endpos) + if not declared_encoding_match and is_html: + declared_encoding_match = html_re.search(markup, endpos=html_endpos) + if declared_encoding_match is not None: + declared_encoding = declared_encoding_match.groups()[0] + if declared_encoding: + if isinstance(declared_encoding, bytes): + declared_encoding = declared_encoding.decode('ascii', 'replace') + return declared_encoding.lower() + return None + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. + CHARSET_ALIASES = {"macintosh": "mac-roman", + "x-sjis": "shift-jis"} + + ENCODINGS_WITH_SMART_QUOTES = [ + "windows-1252", + "iso-8859-1", + "iso-8859-2", + ] + + def __init__(self, markup, override_encodings=[], + smart_quotes_to=None, is_html=False, exclude_encodings=[]): + """Constructor. + + :param markup: A bytestring representing markup in an unknown encoding. + :param override_encodings: These encodings will be tried first, + before any sniffing code is run. + + :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted + to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead. + Setting it to 'xml' will convert them to XML entity references, and setting it to 'html' + will convert them to HTML entity references. + :param is_html: If True, this markup is considered to be HTML. Otherwise + it's assumed to be XML. + :param exclude_encodings: These encodings will not be considered, even + if the sniffing code thinks they might make sense. + """ + self.smart_quotes_to = smart_quotes_to + self.tried_encodings = [] + self.contains_replacement_characters = False + self.is_html = is_html + self.log = logging.getLogger(__name__) + self.detector = EncodingDetector( + markup, override_encodings, is_html, exclude_encodings) + + # Short-circuit if the data is in Unicode to begin with. + if isinstance(markup, unicode) or markup == '': + self.markup = markup + self.unicode_markup = unicode(markup) + self.original_encoding = None + return + + # The encoding detector may have stripped a byte-order mark. + # Use the stripped markup from this point on. + self.markup = self.detector.markup + + u = None + for encoding in self.detector.encodings: + markup = self.detector.markup + u = self._convert_from(encoding) + if u is not None: + break + + if not u: + # None of the encodings worked. As an absolute last resort, + # try them again with character replacement. + + for encoding in self.detector.encodings: + if encoding != "ascii": + u = self._convert_from(encoding, "replace") + if u is not None: + self.log.warning( + "Some characters could not be decoded, and were " + "replaced with REPLACEMENT CHARACTER." 
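+# Example (editor's sketch, not upstream code): find_declared_encoding() on
+# an XML declaration and on an HTML <meta> tag.
+#
+#   from bs4.dammit import EncodingDetector
+#
+#   EncodingDetector.find_declared_encoding(
+#       b'<?xml version="1.0" encoding="ISO-8859-1"?><doc/>')
+#   # u'iso-8859-1' -- the result is lowercased
+#   EncodingDetector.find_declared_encoding(
+#       b'<html><head><meta charset="utf-8"></head></html>', is_html=True)
+#   # u'utf-8'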
+ ) + self.contains_replacement_characters = True + break + + # If none of that worked, we could at this point force it to + # ASCII, but that would destroy so much data that I think + # giving up is better. + self.unicode_markup = u + if not u: + self.original_encoding = None + + def _sub_ms_char(self, match): + """Changes a MS smart quote character to an XML or HTML + entity, or an ASCII character.""" + orig = match.group(1) + if self.smart_quotes_to == 'ascii': + sub = self.MS_CHARS_TO_ASCII.get(orig).encode() + else: + sub = self.MS_CHARS.get(orig) + if type(sub) == tuple: + if self.smart_quotes_to == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convert_from(self, proposed, errors="strict"): + """Attempt to convert the markup to the proposed encoding. + + :param proposed: The name of a character encoding. + """ + proposed = self.find_codec(proposed) + if not proposed or (proposed, errors) in self.tried_encodings: + return None + self.tried_encodings.append((proposed, errors)) + markup = self.markup + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if (self.smart_quotes_to is not None + and proposed in self.ENCODINGS_WITH_SMART_QUOTES): + smart_quotes_re = b"([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) + + try: + #print("Trying to convert document to %s (errors=%s)" % ( + # proposed, errors)) + u = self._to_unicode(markup, proposed, errors) + self.markup = u + self.original_encoding = proposed + except Exception as e: + #print("That didn't work!") + #print(e) + return None + #print("Correct encoding: %s" % proposed) + return self.markup + + def _to_unicode(self, data, encoding, errors="strict"): + """Given a string and its encoding, decodes the string into Unicode. + + :param encoding: The name of an encoding. + """ + return unicode(data, encoding, errors) + + @property + def declared_html_encoding(self): + """If the markup is an HTML document, returns the encoding declared _within_ + the document. + """ + if not self.is_html: + return None + return self.detector.declared_encoding + + def find_codec(self, charset): + """Convert the name of a character set to a codec name. + + :param charset: The name of a character set. + :return: The name of a codec. + """ + value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) + or (charset and self._codec(charset.replace("-", ""))) + or (charset and self._codec(charset.replace("-", "_"))) + or (charset and charset.lower()) + or charset + ) + if value: + return value.lower() + return None + + def _codec(self, charset): + if not charset: + return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + + # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. 
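+# Example (editor's sketch, not upstream code): smart-quote handling driven
+# by the MS_CHARS tables that follow.
+#
+#   from bs4 import UnicodeDammit
+#
+#   markup = b"\x93Smart\x94 quotes"
+#   UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
+#   # u'&ldquo;Smart&rdquo; quotes'
+#   UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii").unicode_markup
+#   # u'"Smart" quotes'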
+ MS_CHARS = {b'\x80': ('euro', '20AC'), + b'\x81': ' ', + b'\x82': ('sbquo', '201A'), + b'\x83': ('fnof', '192'), + b'\x84': ('bdquo', '201E'), + b'\x85': ('hellip', '2026'), + b'\x86': ('dagger', '2020'), + b'\x87': ('Dagger', '2021'), + b'\x88': ('circ', '2C6'), + b'\x89': ('permil', '2030'), + b'\x8A': ('Scaron', '160'), + b'\x8B': ('lsaquo', '2039'), + b'\x8C': ('OElig', '152'), + b'\x8D': '?', + b'\x8E': ('#x17D', '17D'), + b'\x8F': '?', + b'\x90': '?', + b'\x91': ('lsquo', '2018'), + b'\x92': ('rsquo', '2019'), + b'\x93': ('ldquo', '201C'), + b'\x94': ('rdquo', '201D'), + b'\x95': ('bull', '2022'), + b'\x96': ('ndash', '2013'), + b'\x97': ('mdash', '2014'), + b'\x98': ('tilde', '2DC'), + b'\x99': ('trade', '2122'), + b'\x9a': ('scaron', '161'), + b'\x9b': ('rsaquo', '203A'), + b'\x9c': ('oelig', '153'), + b'\x9d': '?', + b'\x9e': ('#x17E', '17E'), + b'\x9f': ('Yuml', ''),} + + # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains + # horrors like stripping diacritical marks to turn á into a, but also + # contains non-horrors like turning “ into ". + MS_CHARS_TO_ASCII = { + b'\x80' : 'EUR', + b'\x81' : ' ', + b'\x82' : ',', + b'\x83' : 'f', + b'\x84' : ',,', + b'\x85' : '...', + b'\x86' : '+', + b'\x87' : '++', + b'\x88' : '^', + b'\x89' : '%', + b'\x8a' : 'S', + b'\x8b' : '<', + b'\x8c' : 'OE', + b'\x8d' : '?', + b'\x8e' : 'Z', + b'\x8f' : '?', + b'\x90' : '?', + b'\x91' : "'", + b'\x92' : "'", + b'\x93' : '"', + b'\x94' : '"', + b'\x95' : '*', + b'\x96' : '-', + b'\x97' : '--', + b'\x98' : '~', + b'\x99' : '(TM)', + b'\x9a' : 's', + b'\x9b' : '>', + b'\x9c' : 'oe', + b'\x9d' : '?', + b'\x9e' : 'z', + b'\x9f' : 'Y', + b'\xa0' : ' ', + b'\xa1' : '!', + b'\xa2' : 'c', + b'\xa3' : 'GBP', + b'\xa4' : '$', #This approximation is especially parochial--this is the + #generic currency symbol. + b'\xa5' : 'YEN', + b'\xa6' : '|', + b'\xa7' : 'S', + b'\xa8' : '..', + b'\xa9' : '', + b'\xaa' : '(th)', + b'\xab' : '<<', + b'\xac' : '!', + b'\xad' : ' ', + b'\xae' : '(R)', + b'\xaf' : '-', + b'\xb0' : 'o', + b'\xb1' : '+-', + b'\xb2' : '2', + b'\xb3' : '3', + b'\xb4' : ("'", 'acute'), + b'\xb5' : 'u', + b'\xb6' : 'P', + b'\xb7' : '*', + b'\xb8' : ',', + b'\xb9' : '1', + b'\xba' : '(th)', + b'\xbb' : '>>', + b'\xbc' : '1/4', + b'\xbd' : '1/2', + b'\xbe' : '3/4', + b'\xbf' : '?', + b'\xc0' : 'A', + b'\xc1' : 'A', + b'\xc2' : 'A', + b'\xc3' : 'A', + b'\xc4' : 'A', + b'\xc5' : 'A', + b'\xc6' : 'AE', + b'\xc7' : 'C', + b'\xc8' : 'E', + b'\xc9' : 'E', + b'\xca' : 'E', + b'\xcb' : 'E', + b'\xcc' : 'I', + b'\xcd' : 'I', + b'\xce' : 'I', + b'\xcf' : 'I', + b'\xd0' : 'D', + b'\xd1' : 'N', + b'\xd2' : 'O', + b'\xd3' : 'O', + b'\xd4' : 'O', + b'\xd5' : 'O', + b'\xd6' : 'O', + b'\xd7' : '*', + b'\xd8' : 'O', + b'\xd9' : 'U', + b'\xda' : 'U', + b'\xdb' : 'U', + b'\xdc' : 'U', + b'\xdd' : 'Y', + b'\xde' : 'b', + b'\xdf' : 'B', + b'\xe0' : 'a', + b'\xe1' : 'a', + b'\xe2' : 'a', + b'\xe3' : 'a', + b'\xe4' : 'a', + b'\xe5' : 'a', + b'\xe6' : 'ae', + b'\xe7' : 'c', + b'\xe8' : 'e', + b'\xe9' : 'e', + b'\xea' : 'e', + b'\xeb' : 'e', + b'\xec' : 'i', + b'\xed' : 'i', + b'\xee' : 'i', + b'\xef' : 'i', + b'\xf0' : 'o', + b'\xf1' : 'n', + b'\xf2' : 'o', + b'\xf3' : 'o', + b'\xf4' : 'o', + b'\xf5' : 'o', + b'\xf6' : 'o', + b'\xf7' : '/', + b'\xf8' : 'o', + b'\xf9' : 'u', + b'\xfa' : 'u', + b'\xfb' : 'u', + b'\xfc' : 'u', + b'\xfd' : 'y', + b'\xfe' : 'b', + b'\xff' : 'y', + } + + # A map used when removing rogue Windows-1252/ISO-8859-1 + # characters in otherwise UTF-8 documents. 
+    #
+    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
+    # Windows-1252.
+    WINDOWS_1252_TO_UTF8 = {
+        0x80 : b'\xe2\x82\xac', # €
+        0x82 : b'\xe2\x80\x9a', # ‚
+        0x83 : b'\xc6\x92',     # ƒ
+        0x84 : b'\xe2\x80\x9e', # „
+        0x85 : b'\xe2\x80\xa6', # …
+        0x86 : b'\xe2\x80\xa0', # †
+        0x87 : b'\xe2\x80\xa1', # ‡
+        0x88 : b'\xcb\x86',     # ˆ
+        0x89 : b'\xe2\x80\xb0', # ‰
+        0x8a : b'\xc5\xa0',     # Š
+        0x8b : b'\xe2\x80\xb9', # ‹
+        0x8c : b'\xc5\x92',     # Œ
+        0x8e : b'\xc5\xbd',     # Ž
+        0x91 : b'\xe2\x80\x98', # ‘
+        0x92 : b'\xe2\x80\x99', # ’
+        0x93 : b'\xe2\x80\x9c', # “
+        0x94 : b'\xe2\x80\x9d', # ”
+        0x95 : b'\xe2\x80\xa2', # •
+        0x96 : b'\xe2\x80\x93', # –
+        0x97 : b'\xe2\x80\x94', # —
+        0x98 : b'\xcb\x9c',     # ˜
+        0x99 : b'\xe2\x84\xa2', # ™
+        0x9a : b'\xc5\xa1',     # š
+        0x9b : b'\xe2\x80\xba', # ›
+        0x9c : b'\xc5\x93',     # œ
+        0x9e : b'\xc5\xbe',     # ž
+        0x9f : b'\xc5\xb8',     # Ÿ
+        0xa0 : b'\xc2\xa0',     # (no-break space)
+        0xa1 : b'\xc2\xa1',     # ¡
+        0xa2 : b'\xc2\xa2',     # ¢
+        0xa3 : b'\xc2\xa3',     # £
+        0xa4 : b'\xc2\xa4',     # ¤
+        0xa5 : b'\xc2\xa5',     # ¥
+        0xa6 : b'\xc2\xa6',     # ¦
+        0xa7 : b'\xc2\xa7',     # §
+        0xa8 : b'\xc2\xa8',     # ¨
+        0xa9 : b'\xc2\xa9',     # ©
+        0xaa : b'\xc2\xaa',     # ª
+        0xab : b'\xc2\xab',     # «
+        0xac : b'\xc2\xac',     # ¬
+        0xad : b'\xc2\xad',     # (soft hyphen)
+        0xae : b'\xc2\xae',     # ®
+        0xaf : b'\xc2\xaf',     # ¯
+        0xb0 : b'\xc2\xb0',     # °
+        0xb1 : b'\xc2\xb1',     # ±
+        0xb2 : b'\xc2\xb2',     # ²
+        0xb3 : b'\xc2\xb3',     # ³
+        0xb4 : b'\xc2\xb4',     # ´
+        0xb5 : b'\xc2\xb5',     # µ
+        0xb6 : b'\xc2\xb6',     # ¶
+        0xb7 : b'\xc2\xb7',     # ·
+        0xb8 : b'\xc2\xb8',     # ¸
+        0xb9 : b'\xc2\xb9',     # ¹
+        0xba : b'\xc2\xba',     # º
+        0xbb : b'\xc2\xbb',     # »
+        0xbc : b'\xc2\xbc',     # ¼
+        0xbd : b'\xc2\xbd',     # ½
+        0xbe : b'\xc2\xbe',     # ¾
+        0xbf : b'\xc2\xbf',     # ¿
+        0xc0 : b'\xc3\x80',     # À
+        0xc1 : b'\xc3\x81',     # Á
+        0xc2 : b'\xc3\x82',     # Â
+        0xc3 : b'\xc3\x83',     # Ã
+        0xc4 : b'\xc3\x84',     # Ä
+        0xc5 : b'\xc3\x85',     # Å
+        0xc6 : b'\xc3\x86',     # Æ
+        0xc7 : b'\xc3\x87',     # Ç
+        0xc8 : b'\xc3\x88',     # È
+        0xc9 : b'\xc3\x89',     # É
+        0xca : b'\xc3\x8a',     # Ê
+        0xcb : b'\xc3\x8b',     # Ë
+        0xcc : b'\xc3\x8c',     # Ì
+        0xcd : b'\xc3\x8d',     # Í
+        0xce : b'\xc3\x8e',     # Î
+        0xcf : b'\xc3\x8f',     # Ï
+        0xd0 : b'\xc3\x90',     # Ð
+        0xd1 : b'\xc3\x91',     # Ñ
+        0xd2 : b'\xc3\x92',     # Ò
+        0xd3 : b'\xc3\x93',     # Ó
+        0xd4 : b'\xc3\x94',     # Ô
+        0xd5 : b'\xc3\x95',     # Õ
+        0xd6 : b'\xc3\x96',     # Ö
+        0xd7 : b'\xc3\x97',     # ×
+        0xd8 : b'\xc3\x98',     # Ø
+        0xd9 : b'\xc3\x99',     # Ù
+        0xda : b'\xc3\x9a',     # Ú
+        0xdb : b'\xc3\x9b',     # Û
+        0xdc : b'\xc3\x9c',     # Ü
+        0xdd : b'\xc3\x9d',     # Ý
+        0xde : b'\xc3\x9e',     # Þ
+        0xdf : b'\xc3\x9f',     # ß
+        0xe0 : b'\xc3\xa0',     # à
+        0xe1 : b'\xc3\xa1',     # á
+        0xe2 : b'\xc3\xa2',     # â
+        0xe3 : b'\xc3\xa3',     # ã
+        0xe4 : b'\xc3\xa4',     # ä
+        0xe5 : b'\xc3\xa5',     # å
+        0xe6 : b'\xc3\xa6',     # æ
+        0xe7 : b'\xc3\xa7',     # ç
+        0xe8 : b'\xc3\xa8',     # è
+        0xe9 : b'\xc3\xa9',     # é
+        0xea : b'\xc3\xaa',     # ê
+        0xeb : b'\xc3\xab',     # ë
+        0xec : b'\xc3\xac',     # ì
+        0xed : b'\xc3\xad',     # í
+        0xee : b'\xc3\xae',     # î
+        0xef : b'\xc3\xaf',     # ï
+        0xf0 : b'\xc3\xb0',     # ð
+        0xf1 : b'\xc3\xb1',     # ñ
+        0xf2 : b'\xc3\xb2',     # ò
+        0xf3 : b'\xc3\xb3',     # ó
+        0xf4 : b'\xc3\xb4',     # ô
+        0xf5 : b'\xc3\xb5',     # õ
+        0xf6 : b'\xc3\xb6',     # ö
+        0xf7 : b'\xc3\xb7',     # ÷
+        0xf8 : b'\xc3\xb8',     # ø
+        0xf9 : b'\xc3\xb9',     # ù
+        0xfa : b'\xc3\xba',     # ú
+        0xfb : b'\xc3\xbb',     # û
+        0xfc : b'\xc3\xbc',     # ü
+        0xfd : b'\xc3\xbd',     # ý
+        0xfe : b'\xc3\xbe',     # þ
+    }
+
+    MULTIBYTE_MARKERS_AND_SIZES = [
+        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
+        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
+        (0xf0, 0xf4, 4), # 4-byte characters start with
F0-F4 + ] + + FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] + LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] + + @classmethod + def detwingle(cls, in_bytes, main_encoding="utf8", + embedded_encoding="windows-1252"): + """Fix characters from one encoding embedded in some other encoding. + + Currently the only situation supported is Windows-1252 (or its + subset ISO-8859-1), embedded in UTF-8. + + :param in_bytes: A bytestring that you suspect contains + characters from multiple encodings. Note that this _must_ + be a bytestring. If you've already converted the document + to Unicode, you're too late. + :param main_encoding: The primary encoding of `in_bytes`. + :param embedded_encoding: The encoding that was used to embed characters + in the main document. + :return: A bytestring in which `embedded_encoding` + characters have been converted to their `main_encoding` + equivalents. + """ + if embedded_encoding.replace('_', '-').lower() not in ( + 'windows-1252', 'windows_1252'): + raise NotImplementedError( + "Windows-1252 and ISO-8859-1 are the only currently supported " + "embedded encodings.") + + if main_encoding.lower() not in ('utf8', 'utf-8'): + raise NotImplementedError( + "UTF-8 is the only currently supported main encoding.") + + byte_chunks = [] + + chunk_start = 0 + pos = 0 + while pos < len(in_bytes): + byte = in_bytes[pos] + if not isinstance(byte, int): + # Python 2.x + byte = ord(byte) + if (byte >= cls.FIRST_MULTIBYTE_MARKER + and byte <= cls.LAST_MULTIBYTE_MARKER): + # This is the start of a UTF-8 multibyte character. Skip + # to the end. + for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: + if byte >= start and byte <= end: + pos += size + break + elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: + # We found a Windows-1252 character! + # Save the string up to this point as a chunk. + byte_chunks.append(in_bytes[chunk_start:pos]) + + # Now translate the Windows-1252 character into UTF-8 + # and add it as another, one-byte chunk. + byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) + pos += 1 + chunk_start = pos + else: + # Go on to the next character. + pos += 1 + if chunk_start == 0: + # The string is unchanged. + return in_bytes + else: + # Store the final chunk. + byte_chunks.append(in_bytes[chunk_start:]) + return b''.join(byte_chunks) + diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/diagnose.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/diagnose.py new file mode 100644 index 00000000000..e4f2f47fa96 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/diagnose.py @@ -0,0 +1,242 @@ +"""Diagnostic functions, mainly for use when doing tech support.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import cProfile +from StringIO import StringIO +from HTMLParser import HTMLParser +import bs4 +from bs4 import BeautifulSoup, __version__ +from bs4.builder import builder_registry + +import os +import pstats +import random +import tempfile +import time +import traceback +import sys +import cProfile + +def diagnose(data): + """Diagnostic suite for isolating common problems. + + :param data: A string containing markup that needs to be explained. + :return: None; diagnostics are printed to standard output. 
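+# Example (editor's sketch, not upstream code): UnicodeDammit.detwingle(),
+# defined at the end of dammit.py above.
+#
+#   from bs4 import UnicodeDammit
+#
+#   doc = (u"UTF-8 caf\u00e9 ".encode("utf-8")
+#          + b"\x93Windows-1252 quotes\x94")
+#   doc.decode("utf-8")                           # raises UnicodeDecodeError
+#   UnicodeDammit.detwingle(doc).decode("utf-8")
+#   # u'UTF-8 café “Windows-1252 quotes”'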
+ """ + print("Diagnostic running on Beautiful Soup %s" % __version__) + print("Python version %s" % sys.version) + + basic_parsers = ["html.parser", "html5lib", "lxml"] + for name in basic_parsers: + for builder in builder_registry.builders: + if name in builder.features: + break + else: + basic_parsers.remove(name) + print( + "I noticed that %s is not installed. Installing it may help." % + name) + + if 'lxml' in basic_parsers: + basic_parsers.append("lxml-xml") + try: + from lxml import etree + print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) + except ImportError, e: + print( + "lxml is not installed or couldn't be imported.") + + + if 'html5lib' in basic_parsers: + try: + import html5lib + print("Found html5lib version %s" % html5lib.__version__) + except ImportError, e: + print( + "html5lib is not installed or couldn't be imported.") + + if hasattr(data, 'read'): + data = data.read() + elif data.startswith("http:") or data.startswith("https:"): + print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") + return + else: + try: + if os.path.exists(data): + print('"%s" looks like a filename. Reading data from the file.' % data) + with open(data) as fp: + data = fp.read() + except ValueError: + # This can happen on some platforms when the 'filename' is + # too long. Assume it's data and not a filename. + pass + print("") + + for parser in basic_parsers: + print("Trying to parse your markup with %s" % parser) + success = False + try: + soup = BeautifulSoup(data, features=parser) + success = True + except Exception, e: + print("%s could not parse the markup." % parser) + traceback.print_exc() + if success: + print("Here's what %s did with the markup:" % parser) + print(soup.prettify()) + + print("-" * 80) + +def lxml_trace(data, html=True, **kwargs): + """Print out the lxml events that occur during parsing. + + This lets you see how lxml parses a document when no Beautiful + Soup code is running. You can use this to determine whether + an lxml-specific problem is in Beautiful Soup's lxml tree builders + or in lxml itself. + + :param data: Some markup. + :param html: If True, markup will be parsed with lxml's HTML parser. + if False, lxml's XML parser will be used. + """ + from lxml import etree + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): + print("%s, %4s, %s" % (event, element.tag, element.text)) + +class AnnouncingParser(HTMLParser): + """Subclass of HTMLParser that announces parse events, without doing + anything else. + + You can use this to get a picture of how html.parser sees a given + document. The easiest way to do this is to call `htmlparser_trace`. + """ + + def _p(self, s): + print(s) + + def handle_starttag(self, name, attrs): + self._p("%s START" % name) + + def handle_endtag(self, name): + self._p("%s END" % name) + + def handle_data(self, data): + self._p("%s DATA" % data) + + def handle_charref(self, name): + self._p("%s CHARREF" % name) + + def handle_entityref(self, name): + self._p("%s ENTITYREF" % name) + + def handle_comment(self, data): + self._p("%s COMMENT" % data) + + def handle_decl(self, data): + self._p("%s DECL" % data) + + def unknown_decl(self, data): + self._p("%s UNKNOWN-DECL" % data) + + def handle_pi(self, data): + self._p("%s PI" % data) + +def htmlparser_trace(data): + """Print out the HTMLParser events that occur during parsing. 
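+# Example (editor's sketch, not upstream code): the two tracing helpers
+# defined in this module.
+#
+#   from bs4.diagnose import diagnose, htmlparser_trace
+#
+#   diagnose("<p>Is this <b>broken<p>markup?")
+#   # prints the installed parsers and what each one makes of the markup
+#   htmlparser_trace("<p>Hello <b>world</b>")
+#   # prints the raw HTMLParser events (p START, Hello DATA, b START, ...)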
+ + This lets you see how HTMLParser parses a document when no + Beautiful Soup code is running. + + :param data: Some markup. + """ + parser = AnnouncingParser() + parser.feed(data) + +_vowels = "aeiou" +_consonants = "bcdfghjklmnpqrstvwxyz" + +def rword(length=5): + "Generate a random word-like string." + s = '' + for i in range(length): + if i % 2 == 0: + t = _consonants + else: + t = _vowels + s += random.choice(t) + return s + +def rsentence(length=4): + "Generate a random sentence-like string." + return " ".join(rword(random.randint(4,9)) for i in range(length)) + +def rdoc(num_elements=1000): + """Randomly generate an invalid HTML document.""" + tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] + elements = [] + for i in range(num_elements): + choice = random.randint(0,3) + if choice == 0: + # New tag. + tag_name = random.choice(tag_names) + elements.append("<%s>" % tag_name) + elif choice == 1: + elements.append(rsentence(random.randint(1,4))) + elif choice == 2: + # Close a tag. + tag_name = random.choice(tag_names) + elements.append("</%s>" % tag_name) + return "<html>" + "\n".join(elements) + "</html>" + +def benchmark_parsers(num_elements=100000): + """Very basic head-to-head performance benchmark.""" + print("Comparative parser benchmark on Beautiful Soup %s" % __version__) + data = rdoc(num_elements) + print("Generated a large invalid HTML document (%d bytes)." % len(data)) + + for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: + success = False + try: + a = time.time() + soup = BeautifulSoup(data, parser) + b = time.time() + success = True + except Exception, e: + print("%s could not parse the markup." % parser) + traceback.print_exc() + if success: + print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) + + from lxml import etree + a = time.time() + etree.HTML(data) + b = time.time() + print("Raw lxml parsed the markup in %.2fs." % (b-a)) + + import html5lib + parser = html5lib.HTMLParser() + a = time.time() + parser.parse(data) + b = time.time() + print("Raw html5lib parsed the markup in %.2fs." % (b-a)) + +def profile(num_elements=100000, parser="lxml"): + """Use Python's profiler on a randomly generated document.""" + filehandle = tempfile.NamedTemporaryFile() + filename = filehandle.name + + data = rdoc(num_elements) + vars = dict(bs4=bs4, data=data, parser=parser) + cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) + + stats = pstats.Stats(filename) + # stats.strip_dirs() + stats.sort_stats("cumulative") + stats.print_stats('_html5lib|bs4', 50) + +# If this file is run as a script, standard input is diagnosed. +if __name__ == '__main__': + diagnose(sys.stdin.read()) diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/element.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/element.py new file mode 100644 index 00000000000..09a81d92bbc --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/element.py @@ -0,0 +1,2175 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +try: + from collections.abc import Callable # Python 3.6 +except ImportError , e: + from collections import Callable +import re +import sys +import warnings +try: + import soupsieve +except ImportError, e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' 
+ ) + +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) + +DEFAULT_OUTPUT_ENCODING = "utf-8" +PY3K = (sys.version_info[0] > 2) + +nonwhitespace_re = re.compile(r"\S+") + +# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on +# the off chance someone imported it for their own use. +whitespace_re = re.compile(r"\s+") + +def _alias(attr): + """Alias one attribute name to another for backward compatibility""" + @property + def alias(self): + return getattr(self, attr) + + @alias.setter + def alias(self): + return setattr(self, attr) + return alias + + +# These encodings are recognized by Python (so PageElement.encode +# could theoretically support them) but XML and HTML don't recognize +# them (so they should not show up in an XML or HTML document as that +# document's encoding). +# +# If an XML document is encoded in one of these encodings, no encoding +# will be mentioned in the XML declaration. If an HTML document is +# encoded in one of these encodings, and the HTML document has a +# <meta> tag that mentions an encoding, the encoding will be given as +# the empty string. +# +# Source: +# https://docs.python.org/3/library/codecs.html#python-specific-encodings +PYTHON_SPECIFIC_ENCODINGS = set([ + u"idna", + u"mbcs", + u"oem", + u"palmos", + u"punycode", + u"raw_unicode_escape", + u"undefined", + u"unicode_escape", + u"raw-unicode-escape", + u"unicode-escape", + u"string-escape", + u"string_escape", +]) + + +class NamespacedAttribute(unicode): + """A namespaced string (e.g. 'xml:lang') that remembers the namespace + ('xml') and the name ('lang') that were used to create it. + """ + + def __new__(cls, prefix, name=None, namespace=None): + if not name: + # This is the default namespace. Its name "has no value" + # per https://www.w3.org/TR/xml-names/#defaulting + name = None + + if name is None: + obj = unicode.__new__(cls, prefix) + elif prefix is None: + # Not really namespaced. + obj = unicode.__new__(cls, name) + else: + obj = unicode.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + +class AttributeValueWithCharsetSubstitution(unicode): + """A stand-in object for a character encoding specified in HTML.""" + +class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'charset' attribute. + + When Beautiful Soup parses the markup '<meta charset="utf8">', the + value of the 'charset' attribute will be one of these objects. + """ + + def __new__(cls, original_value): + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + """When an HTML document is being encoded to a given encoding, the + value of a meta tag's 'charset' is the name of the encoding. + """ + if encoding in PYTHON_SPECIFIC_ENCODINGS: + return '' + return encoding + + +class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'content' attribute. + + When Beautiful Soup parses the markup: + <meta http-equiv="content-type" content="text/html; charset=utf8"> + + The value of the 'content' attribute will be one of these objects. + """ + + CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. 
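+# Example (editor's sketch, not upstream code): NamespacedAttribute behaves
+# as a plain string but remembers its parts.
+#
+#   from bs4.element import NamespacedAttribute
+#
+#   attr = NamespacedAttribute("xml", "lang", "http://www.w3.org/XML/1998/namespace")
+#   str(attr)       # 'xml:lang'
+#   attr.prefix     # 'xml'
+#   attr.name       # 'lang'
+#   attr.namespace  # 'http://www.w3.org/XML/1998/namespace'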
+ return unicode.__new__(unicode, original_value) + + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + if encoding in PYTHON_SPECIFIC_ENCODINGS: + return '' + def rewrite(match): + return match.group(1) + encoding + return self.CHARSET_RE.sub(rewrite, self.original_value) + + +class PageElement(object): + """Contains the navigational information for some part of the page: + that is, its current location in the parse tree. + + NavigableString, Tag, etc. are all subclasses of PageElement. + """ + + def setup(self, parent=None, previous_element=None, next_element=None, + previous_sibling=None, next_sibling=None): + """Sets up the initial relations between this element and + other elements. + + :param parent: The parent of this element. + + :param previous_element: The element parsed immediately before + this one. + + :param next_element: The element parsed immediately before + this one. + + :param previous_sibling: The most recently encountered element + on the same level of the parse tree as this one. + + :param previous_sibling: The next element to be encountered + on the same level of the parse tree as this one. + """ + self.parent = parent + + self.previous_element = previous_element + if previous_element is not None: + self.previous_element.next_element = self + + self.next_element = next_element + if self.next_element is not None: + self.next_element.previous_element = self + + self.next_sibling = next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self + + if (previous_sibling is None + and self.parent is not None and self.parent.contents): + previous_sibling = self.parent.contents[-1] + + self.previous_sibling = previous_sibling + if previous_sibling is not None: + self.previous_sibling.next_sibling = self + + def format_string(self, s, formatter): + """Format the given string using the given formatter. + + :param s: A string. + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ + if formatter is None: + return s + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + output = formatter.substitute(s) + return output + + def formatter_for_name(self, formatter): + """Look up or create a Formatter for the given identifier, + if necessary. + + :param formatter: Can be a Formatter object (used as-is), a + function (used as the entity substitution hook for an + XMLFormatter or HTMLFormatter), or a string (used to look + up an XMLFormatter or HTMLFormatter in the appropriate + registry. + """ + if isinstance(formatter, Formatter): + return formatter + if self._is_xml: + c = XMLFormatter + else: + c = HTMLFormatter + if isinstance(formatter, Callable): + return c(entity_substitution=formatter) + return c.REGISTRY[formatter] + + @property + def _is_xml(self): + """Is this element part of an XML tree or an HTML tree? + + This is used in formatter_for_name, when deciding whether an + XMLFormatter or HTMLFormatter is more appropriate. It can be + inefficient, but it should be called very rarely. + """ + if self.known_xml is not None: + # Most of the time we will have determined this when the + # document is parsed. + return self.known_xml + + # Otherwise, it's likely that this element was created by + # direct invocation of the constructor from within the user's + # Python code. + if self.parent is None: + # This is the top-level object. It should have .known_xml set + # from tree creation. 
If not, take a guess--BS is usually + # used on HTML markup. + return getattr(self, 'is_xml', False) + return self.parent._is_xml + + nextSibling = _alias("next_sibling") # BS3 + previousSibling = _alias("previous_sibling") # BS3 + + def replace_with(self, replace_with): + """Replace this PageElement with another one, keeping the rest of the + tree the same. + + :param replace_with: A PageElement. + :return: `self`, no longer part of the tree. + """ + if self.parent is None: + raise ValueError( + "Cannot replace one element with another when the " + "element to be replaced is not part of a tree.") + if replace_with is self: + return + if replace_with is self.parent: + raise ValueError("Cannot replace a Tag with its parent.") + old_parent = self.parent + my_index = self.parent.index(self) + self.extract(_self_index=my_index) + old_parent.insert(my_index, replace_with) + return self + replaceWith = replace_with # BS3 + + def unwrap(self): + """Replace this PageElement with its contents. + + :return: `self`, no longer part of the tree. + """ + my_parent = self.parent + if self.parent is None: + raise ValueError( + "Cannot replace an element with its contents when that" + "element is not part of a tree.") + my_index = self.parent.index(self) + self.extract(_self_index=my_index) + for child in reversed(self.contents[:]): + my_parent.insert(my_index, child) + return self + replace_with_children = unwrap + replaceWithChildren = unwrap # BS3 + + def wrap(self, wrap_inside): + """Wrap this PageElement inside another one. + + :param wrap_inside: A PageElement. + :return: `wrap_inside`, occupying the position in the tree that used + to be occupied by `self`, and with `self` inside it. + """ + me = self.replace_with(wrap_inside) + wrap_inside.append(me) + return wrap_inside + + def extract(self, _self_index=None): + """Destructively rips this element out of the tree. + + :param _self_index: The location of this element in its parent's + .contents, if known. Passing this in allows for a performance + optimization. + + :return: `self`, no longer part of the tree. + """ + if self.parent is not None: + if _self_index is None: + _self_index = self.parent.index(self) + del self.parent.contents[_self_index] + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. + last_child = self._last_descendant() + next_element = last_child.next_element + + if (self.previous_element is not None and + self.previous_element is not next_element): + self.previous_element.next_element = next_element + if next_element is not None and next_element is not self.previous_element: + next_element.previous_element = self.previous_element + self.previous_element = None + last_child.next_element = None + + self.parent = None + if (self.previous_sibling is not None + and self.previous_sibling is not self.next_sibling): + self.previous_sibling.next_sibling = self.next_sibling + if (self.next_sibling is not None + and self.next_sibling is not self.previous_sibling): + self.next_sibling.previous_sibling = self.previous_sibling + self.previous_sibling = self.next_sibling = None + return self + + def _last_descendant(self, is_initialized=True, accept_self=True): + """Finds the last element beneath this object to be parsed. + + :param is_initialized: Has `setup` been called on this PageElement + yet? + :param accept_self: Is `self` an acceptable answer to the question? 
+ """ + if is_initialized and self.next_sibling is not None: + last_child = self.next_sibling.previous_element + else: + last_child = self + while isinstance(last_child, Tag) and last_child.contents: + last_child = last_child.contents[-1] + if not accept_self and last_child is self: + last_child = None + return last_child + # BS3: Not part of the API! + _lastRecursiveChild = _last_descendant + + def insert(self, position, new_child): + """Insert a new PageElement in the list of this PageElement's children. + + This works the same way as `list.insert`. + + :param position: The numeric position that should be occupied + in `self.children` by the new PageElement. + :param new_child: A PageElement. + """ + if new_child is None: + raise ValueError("Cannot insert None into a tag.") + if new_child is self: + raise ValueError("Cannot insert a tag into itself.") + if (isinstance(new_child, basestring) + and not isinstance(new_child, NavigableString)): + new_child = NavigableString(new_child) + + from bs4 import BeautifulSoup + if isinstance(new_child, BeautifulSoup): + # We don't want to end up with a situation where one BeautifulSoup + # object contains another. Insert the children one at a time. + for subchild in list(new_child.contents): + self.insert(position, subchild) + position += 1 + return + position = min(position, len(self.contents)) + if hasattr(new_child, 'parent') and new_child.parent is not None: + # We're 'inserting' an element that's already one + # of this object's children. + if new_child.parent is self: + current_index = self.index(new_child) + if current_index < position: + # We're moving this element further down the list + # of this object's children. That means that when + # we extract this element, our target index will + # jump down one. + position -= 1 + new_child.extract() + + new_child.parent = self + previous_child = None + if position == 0: + new_child.previous_sibling = None + new_child.previous_element = self + else: + previous_child = self.contents[position - 1] + new_child.previous_sibling = previous_child + new_child.previous_sibling.next_sibling = new_child + new_child.previous_element = previous_child._last_descendant(False) + if new_child.previous_element is not None: + new_child.previous_element.next_element = new_child + + new_childs_last_element = new_child._last_descendant(False) + + if position >= len(self.contents): + new_child.next_sibling = None + + parent = self + parents_next_sibling = None + while parents_next_sibling is None and parent is not None: + parents_next_sibling = parent.next_sibling + parent = parent.parent + if parents_next_sibling is not None: + # We found the element that comes next in the document. + break + if parents_next_sibling is not None: + new_childs_last_element.next_element = parents_next_sibling + else: + # The last element of this tag is the last element in + # the document. + new_childs_last_element.next_element = None + else: + next_child = self.contents[position] + new_child.next_sibling = next_child + if new_child.next_sibling is not None: + new_child.next_sibling.previous_sibling = new_child + new_childs_last_element.next_element = next_child + + if new_childs_last_element.next_element is not None: + new_childs_last_element.next_element.previous_element = new_childs_last_element + self.contents.insert(position, new_child) + + def append(self, tag): + """Appends the given PageElement to the contents of this one. + + :param tag: A PageElement. 
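+# Example (editor's sketch, not upstream code): insert()/append()/wrap() on a
+# live tree, assuming beautifulsoup4 is installed.
+#
+#   from bs4 import BeautifulSoup
+#
+#   soup = BeautifulSoup("<p>like soup</p>", "html.parser")
+#   b = soup.new_tag("b")
+#   b.string = "really "
+#   soup.p.insert(0, b)        # list.insert semantics
+#   soup.p.append(" a lot")    # appends at the end
+#   print(soup.p)              # <p><b>really </b>like soup a lot</p>
+#   soup.p.wrap(soup.new_tag("div"))
+#   print(soup.div)            # <div><p><b>really </b>like soup a lot</p></div>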
+ """ + self.insert(len(self.contents), tag) + + def extend(self, tags): + """Appends the given PageElements to this one's contents. + + :param tags: A list of PageElements. + """ + if isinstance(tags, Tag): + # Calling self.append() on another tag's contents will change + # the list we're iterating over. Make a list that won't + # change. + tags = list(tags.contents) + for tag in tags: + self.append(tag) + + def insert_before(self, *args): + """Makes the given element(s) the immediate predecessor of this one. + + All the elements will have the same parent, and the given elements + will be immediately before this one. + + :param args: One or more PageElements. + """ + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'before' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element before itself.") + for predecessor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, *args): + """Makes the given element(s) the immediate successor of this one. + + The elements will have the same parent, and the given elements + will be immediately after this one. + + :param args: One or more PageElements. + """ + # Do all error checking before modifying the tree. + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'after' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element after itself.") + + offset = 0 + for successor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1+offset, successor) + offset += 1 + + def find_next(self, name=None, attrs={}, text=None, **kwargs): + """Find the first PageElement that matches the given criteria and + appears later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one(self.find_all_next, name, attrs, text, **kwargs) + findNext = find_next # BS3 + + def find_all_next(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Find all PageElements that match the given criteria and appear + later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet containing PageElements. 
+ """ + return self._find_all(name, attrs, text, limit, self.next_elements, + **kwargs) + findAllNext = find_all_next # BS3 + + def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Find the closest sibling to this PageElement that matches the + given criteria and appears later in the document. + + All find_* methods take a common set of arguments. See the + online documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one(self.find_next_siblings, name, attrs, text, + **kwargs) + findNextSibling = find_next_sibling # BS3 + + def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Find all siblings of this PageElement that match the given criteria + and appear later in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ + return self._find_all(name, attrs, text, limit, + self.next_siblings, **kwargs) + findNextSiblings = find_next_siblings # BS3 + fetchNextSiblings = find_next_siblings # BS2 + + def find_previous(self, name=None, attrs={}, text=None, **kwargs): + """Look backwards in the document from this PageElement and find the + first PageElement that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one( + self.find_all_previous, name, attrs, text, **kwargs) + findPrevious = find_previous # BS3 + + def find_all_previous(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Look backwards in the document from this PageElement and find all + PageElements that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ + return self._find_all(name, attrs, text, limit, self.previous_elements, + **kwargs) + findAllPrevious = find_all_previous # BS3 + fetchPrevious = find_all_previous # BS2 + + def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this PageElement that matches the + given criteria and appears earlier in the document. 
+ + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one(self.find_previous_siblings, name, attrs, text, + **kwargs) + findPreviousSibling = find_previous_sibling # BS3 + + def find_previous_siblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns all siblings to this PageElement that match the + given criteria and appear earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ + return self._find_all(name, attrs, text, limit, + self.previous_siblings, **kwargs) + findPreviousSiblings = find_previous_siblings # BS3 + fetchPreviousSiblings = find_previous_siblings # BS2 + + def find_parent(self, name=None, attrs={}, **kwargs): + """Find the closest parent of this PageElement that matches the given + criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :kwargs: A dictionary of filters on attribute values. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + # NOTE: We can't use _find_one because findParents takes a different + # set of arguments. + r = None + l = self.find_parents(name, attrs, 1, **kwargs) + if l: + r = l[0] + return r + findParent = find_parent # BS3 + + def find_parents(self, name=None, attrs={}, limit=None, **kwargs): + """Find all parents of this PageElement that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_all(name, attrs, None, limit, self.parents, + **kwargs) + findParents = find_parents # BS3 + fetchParents = find_parents # BS2 + + @property + def next(self): + """The PageElement, if any, that was parsed just after this one. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self.next_element + + @property + def previous(self): + """The PageElement, if any, that was parsed just before this one. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self.previous_element + + #These methods do the real heavy lifting. 
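+# Example (editor's sketch, not upstream code): the public find_* family
+# documented above, before the private machinery that backs it.
+#
+#   from bs4 import BeautifulSoup
+#
+#   soup = BeautifulSoup(
+#       '<div><p>one</p><p class="x">two</p><p>three</p></div>', "html.parser")
+#   first = soup.p
+#   first.find_next_sibling("p", class_="x")   # <p class="x">two</p>
+#   first.find_all_next("p")                   # [<p class="x">two</p>, <p>three</p>]
+#   first.find_parent("div").name              # u'div'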
+
+    def _find_one(self, method, name, attrs, text, **kwargs):
+        r = None
+        l = method(name, attrs, text, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+
+    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
+        "Iterates over a generator looking for things that match."
+
+        if text is None and 'string' in kwargs:
+            text = kwargs['string']
+            del kwargs['string']
+
+        if isinstance(name, SoupStrainer):
+            strainer = name
+        else:
+            strainer = SoupStrainer(name, attrs, text, **kwargs)
+
+        if text is None and not limit and not attrs and not kwargs:
+            if name is True or name is None:
+                # Optimization to find all tags.
+                result = (element for element in generator
+                          if isinstance(element, Tag))
+                return ResultSet(strainer, result)
+            elif isinstance(name, basestring):
+                # Optimization to find all tags with a given name.
+                if name.count(':') == 1:
+                    # This is a name with a prefix. If this is a namespace-aware document,
+                    # we need to match the local name against tag.name. If not,
+                    # we need to match the fully-qualified name against tag.name.
+                    prefix, local_name = name.split(':', 1)
+                else:
+                    prefix = None
+                    local_name = name
+                # An element matches only if it is a Tag and its name
+                # matches, either fully-qualified or as prefix:local_name.
+                result = (element for element in generator
+                          if isinstance(element, Tag)
+                          and (
+                              element.name == name
+                              or (
+                                  element.name == local_name
+                                  and (prefix is None or element.prefix == prefix)
+                              )
+                          )
+                )
+                return ResultSet(strainer, result)
+        results = ResultSet(strainer)
+        while True:
+            try:
+                i = next(generator)
+            except StopIteration:
+                break
+            if i:
+                found = strainer.search(i)
+                if found:
+                    results.append(found)
+                    if limit and len(results) >= limit:
+                        break
+        return results
+
+    #These generators can be used to navigate starting from both
+    #NavigableStrings and Tags.
+    @property
+    def next_elements(self):
+        """All PageElements that were parsed after this one.
+
+        :yield: A sequence of PageElements.
+        """
+        i = self.next_element
+        while i is not None:
+            yield i
+            i = i.next_element
+
+    @property
+    def next_siblings(self):
+        """All PageElements that are siblings of this one but were parsed
+        later.
+
+        :yield: A sequence of PageElements.
+        """
+        i = self.next_sibling
+        while i is not None:
+            yield i
+            i = i.next_sibling
+
+    @property
+    def previous_elements(self):
+        """All PageElements that were parsed before this one.
+
+        :yield: A sequence of PageElements.
+        """
+        i = self.previous_element
+        while i is not None:
+            yield i
+            i = i.previous_element
+
+    @property
+    def previous_siblings(self):
+        """All PageElements that are siblings of this one but were parsed
+        earlier.
+
+        :yield: A sequence of PageElements.
+        """
+        i = self.previous_sibling
+        while i is not None:
+            yield i
+            i = i.previous_sibling
+
+    @property
+    def parents(self):
+        """All PageElements that are parents of this PageElement.
+
+        :yield: A sequence of PageElements.
+        """
+        i = self.parent
+        while i is not None:
+            yield i
+            i = i.parent
+
+    @property
+    def decomposed(self):
+        """Check whether a PageElement has been decomposed.
+
+        :rtype: bool
+        """
+        return getattr(self, '_decomposed', False) or False
+
+    # Old non-property versions of the generators, for backwards
+    # compatibility with BS3.
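+# Example (editor's sketch, not upstream code): the generator properties
+# above; the methods just below are their BS3-era spellings.
+#
+#   from bs4 import BeautifulSoup
+#
+#   soup = BeautifulSoup("<div><p>one</p><p>two</p></div>", "html.parser")
+#   [str(s) for s in soup.p.next_siblings]   # ['<p>two</p>']
+#   [t.name for t in soup.p.parents]         # [u'div', u'[document]']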
+    def nextGenerator(self):
+        return self.next_elements
+
+    def nextSiblingGenerator(self):
+        return self.next_siblings
+
+    def previousGenerator(self):
+        return self.previous_elements
+
+    def previousSiblingGenerator(self):
+        return self.previous_siblings
+
+    def parentGenerator(self):
+        return self.parents
+
+
+class NavigableString(unicode, PageElement):
+    """A Python Unicode string that is part of a parse tree.
+
+    When Beautiful Soup parses the markup <b>penguin</b>, it will
+    create a NavigableString for the string "penguin".
+    """
+
+    PREFIX = ''
+    SUFFIX = ''
+
+    # We can't tell just by looking at a string whether it's contained
+    # in an XML document or an HTML document.
+
+    known_xml = None
+
+    def __new__(cls, value):
+        """Create a new NavigableString.
+
+        When unpickling a NavigableString, this method is called with
+        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+        passed in to the superclass's __new__ or the superclass won't know
+        how to handle non-ASCII characters.
+        """
+        if isinstance(value, unicode):
+            u = unicode.__new__(cls, value)
+        else:
+            u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+        u.setup()
+        return u
+
+    def __copy__(self):
+        """A copy of a NavigableString has the same contents and class
+        as the original, but it is not connected to the parse tree.
+        """
+        return type(self)(self)
+
+    def __getnewargs__(self):
+        return (unicode(self),)
+
+    def __getattr__(self, attr):
+        """text.string gives you text. This is for backwards
+        compatibility with Navigable*String, but for CData* it lets you
+        get the string without the CData wrapper."""
+        if attr == 'string':
+            return self
+        else:
+            raise AttributeError(
+                "'%s' object has no attribute '%s'" % (
+                    self.__class__.__name__, attr))
+
+    def output_ready(self, formatter="minimal"):
+        """Run the string through the provided formatter.
+
+        :param formatter: A Formatter object, or a string naming one of the standard formatters.
+        """
+        output = self.format_string(self, formatter)
+        return self.PREFIX + output + self.SUFFIX
+
+    @property
+    def name(self):
+        """Since a NavigableString is not a Tag, it has no .name.
+
+        This property is implemented so that code like this doesn't crash
+        when run on a mixture of Tag and NavigableString objects:
+            [x.name for x in tag.children]
+        """
+        return None
+
+    @name.setter
+    def name(self, name):
+        """Prevent NavigableString.name from ever being set."""
+        raise AttributeError("A NavigableString cannot be given a name.")
+
+
+class PreformattedString(NavigableString):
+    """A NavigableString not subject to the normal formatting rules.
+
+    This is an abstract class used for special kinds of strings such
+    as comments (the Comment class) and CDATA blocks (the CData
+    class).
+    """
+
+    PREFIX = ''
+    SUFFIX = ''
+
+    def output_ready(self, formatter=None):
+        """Make this string ready for output by adding any subclass-specific
+        prefix or suffix.
+
+        :param formatter: A Formatter object, or a string naming one
+        of the standard formatters. The string will be passed into the
+        Formatter, but only to trigger any side effects: the return
+        value is ignored.
+
+        :return: The string, with any subclass-specific prefix and
+           suffix added on.
+        """
+        if formatter is not None:
+            ignore = self.format_string(self, formatter)
+        return self.PREFIX + self + self.SUFFIX
+
+class CData(PreformattedString):
+    """A CDATA block."""
+    PREFIX = u'<![CDATA['
+    SUFFIX = u']]>'
+
+class ProcessingInstruction(PreformattedString):
+    """An SGML processing instruction."""
+
+    PREFIX = u'<?'
+    SUFFIX = u'>'
+
+class XMLProcessingInstruction(ProcessingInstruction):
+    """An XML processing instruction."""
+    PREFIX = u'<?'
+    SUFFIX = u'?>'
+
+class Comment(PreformattedString):
+    """An HTML or XML comment."""
+    PREFIX = u'<!--'
+    SUFFIX = u'-->'
+
+
+class Declaration(PreformattedString):
+    """An XML declaration."""
+    PREFIX = u'<?'
+    SUFFIX = u'?>'
+
+
+class Doctype(PreformattedString):
+    """A document type declaration."""
+    @classmethod
+    def for_name_and_ids(cls, name, pub_id, system_id):
+        """Generate an appropriate document type declaration for the given
+        name, public ID, and system ID.
+
+        :param name: The name of the document's root element, e.g. 'html'.
+        :param pub_id: The Formal Public Identifier for this document type,
+            e.g. '-//W3C//DTD XHTML 1.1//EN'
+        :param system_id: The system identifier for this document type,
+            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
+
+        :return: A Doctype.
+        """
+        value = name or ''
+        if pub_id is not None:
+            value += ' PUBLIC "%s"' % pub_id
+            if system_id is not None:
+                value += ' "%s"' % system_id
+        elif system_id is not None:
+            value += ' SYSTEM "%s"' % system_id
+
+        return Doctype(value)
+
+    PREFIX = u'<!DOCTYPE '
+    SUFFIX = u'>\n'
+
+
+class Stylesheet(NavigableString):
+    """A NavigableString representing a stylesheet (probably
+    CSS).
+
+    Used to distinguish embedded stylesheets from textual content.
+    """
+    pass
+
+
+class Script(NavigableString):
+    """A NavigableString representing an executable script (probably
+    JavaScript).
+
+    Used to distinguish executable code from textual content.
+    """
+    pass
+
+
+class TemplateString(NavigableString):
+    """A NavigableString representing a string found inside an HTML
+    template embedded in a larger document.
+
+    Used to distinguish such strings from the main body of the document.
+    """
+    pass
+
+
+class Tag(PageElement):
+    """Represents an HTML or XML tag that is part of a parse tree, along
+    with its attributes and contents.
+
+    When Beautiful Soup parses the markup <b>penguin</b>, it will
+    create a Tag object representing the <b> tag.
+    """
+
+    def __init__(self, parser=None, builder=None, name=None, namespace=None,
+                 prefix=None, attrs=None, parent=None, previous=None,
+                 is_xml=None, sourceline=None, sourcepos=None,
+                 can_be_empty_element=None, cdata_list_attributes=None,
+                 preserve_whitespace_tags=None
+                 ):
+        """Basic constructor.
+
+        :param parser: A BeautifulSoup object.
+        :param builder: A TreeBuilder.
+        :param name: The name of the tag.
+        :param namespace: The URI of this Tag's XML namespace, if any.
+        :param prefix: The prefix for this Tag's XML namespace, if any.
+        :param attrs: A dictionary of this Tag's attribute values.
+        :param parent: The PageElement to use as this Tag's parent.
+        :param previous: The PageElement that was parsed immediately before
+            this tag.
+        :param is_xml: If True, this is an XML tag. Otherwise, this is an
+            HTML tag.
+        :param sourceline: The line number where this tag was found in its
+            source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was found.
+        :param can_be_empty_element: If True, this tag should be
+            represented as <tag/>. If False, this tag should be represented
+            as <tag></tag>.
+        :param cdata_list_attributes: A list of attributes whose values should
+            be treated as CDATA if they ever show up on this tag.
+        :param preserve_whitespace_tags: A list of tag names whose contents
+            should have their whitespace preserved.
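+
+        Note that most code never calls this constructor directly: Tags
+        are created by the parser during parsing, or through the factory
+        method BeautifulSoup.new_tag(), e.g.
+        soup.new_tag("a", href="http://example.com/").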
+ """ + if parser is None: + self.parser_class = None + else: + # We don't actually store the parser object: that lets extracted + # chunks be garbage-collected. + self.parser_class = parser.__class__ + if name is None: + raise ValueError("No value provided for new tag's name.") + self.name = name + self.namespace = namespace + self.prefix = prefix + if ((not builder or builder.store_line_numbers) + and (sourceline is not None or sourcepos is not None)): + self.sourceline = sourceline + self.sourcepos = sourcepos + if attrs is None: + attrs = {} + elif attrs: + if builder is not None and builder.cdata_list_attributes: + attrs = builder._replace_cdata_list_attribute_values( + self.name, attrs) + else: + attrs = dict(attrs) + else: + attrs = dict(attrs) + + # If possible, determine ahead of time whether this tag is an + # XML tag. + if builder: + self.known_xml = builder.is_xml + else: + self.known_xml = is_xml + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + if builder is None: + # In the absence of a TreeBuilder, use whatever values were + # passed in here. They're probably None, unless this is a copy of some + # other tag. + self.can_be_empty_element = can_be_empty_element + self.cdata_list_attributes = cdata_list_attributes + self.preserve_whitespace_tags = preserve_whitespace_tags + else: + # Set up any substitutions for this tag, such as the charset in a META tag. + builder.set_up_substitutions(self) + + # Ask the TreeBuilder whether this tag might be an empty-element tag. + self.can_be_empty_element = builder.can_be_empty_element(name) + + # Keep track of the list of attributes of this tag that + # might need to be treated as a list. + # + # For performance reasons, we store the whole data structure + # rather than asking the question of every tag. Asking would + # require building a new data structure every time, and + # (unlike can_be_empty_element), we almost never need + # to check this. + self.cdata_list_attributes = builder.cdata_list_attributes + + # Keep track of the names that might cause this tag to be treated as a + # whitespace-preserved tag. + self.preserve_whitespace_tags = builder.preserve_whitespace_tags + + parserClass = _alias("parser_class") # BS3 + + def __copy__(self): + """A copy of a Tag is a new Tag, unconnected to the parse tree. + Its contents are a copy of the old Tag's contents. + """ + clone = type(self)( + None, self.builder, self.name, self.namespace, + self.prefix, self.attrs, is_xml=self._is_xml, + sourceline=self.sourceline, sourcepos=self.sourcepos, + can_be_empty_element=self.can_be_empty_element, + cdata_list_attributes=self.cdata_list_attributes, + preserve_whitespace_tags=self.preserve_whitespace_tags + ) + for attr in ('can_be_empty_element', 'hidden'): + setattr(clone, attr, getattr(self, attr)) + for child in self.contents: + clone.append(child.__copy__()) + return clone + + @property + def is_empty_element(self): + """Is this tag an empty-element tag? (aka a self-closing tag) + + A tag that has contents is never an empty-element tag. + + A tag that has no contents may or may not be an empty-element + tag. It depends on the builder used to create the tag. If the + builder has a designated list of empty-element tags, then only + a tag whose name shows up in that list is considered an + empty-element tag. + + If the builder has no designated list of empty-element tags, + then any tag with no contents is an empty-element tag. 
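+
+        A quick sketch, assuming the stdlib 'html.parser' builder:
+
+            soup = BeautifulSoup("<br/><p></p>", "html.parser")
+            soup.br.is_empty_element  # True: <br> is a void element
+            soup.p.is_empty_element   # False: <p> is not, even when empty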
+ """ + return len(self.contents) == 0 and self.can_be_empty_element + isSelfClosing = is_empty_element # BS3 + + @property + def string(self): + """Convenience property to get the single string within this + PageElement. + + TODO It might make sense to have NavigableString.string return + itself. + + :return: If this element has a single string child, return + value is that string. If this element has one child tag, + return value is the 'string' attribute of the child tag, + recursively. If this element is itself a string, has no + children, or has more than one child, return value is None. + """ + if len(self.contents) != 1: + return None + child = self.contents[0] + if isinstance(child, NavigableString): + return child + return child.string + + @string.setter + def string(self, string): + """Replace this PageElement's contents with `string`.""" + self.clear() + self.append(string.__class__(string)) + + def _all_strings(self, strip=False, types=(NavigableString, CData)): + """Yield all strings of certain classes, possibly stripping them. + + :param strip: If True, all strings will be stripped before being + yielded. + + :types: A tuple of NavigableString subclasses. Any strings of + a subclass not found in this list will be ignored. By + default, this means only NavigableString and CData objects + will be considered. So no comments, processing instructions, + etc. + + :yield: A sequence of strings. + """ + for descendant in self.descendants: + if ( + (types is None and not isinstance(descendant, NavigableString)) + or + (types is not None and type(descendant) not in types)): + continue + if strip: + descendant = descendant.strip() + if len(descendant) == 0: + continue + yield descendant + + strings = property(_all_strings) + + @property + def stripped_strings(self): + """Yield all strings in the document, stripping them first. + + :yield: A sequence of stripped strings. + """ + for string in self._all_strings(True): + yield string + + def get_text(self, separator=u"", strip=False, + types=(NavigableString, CData)): + """Get all child strings, concatenated using the given separator. + + :param separator: Strings will be concatenated using this separator. + + :param strip: If True, strings will be stripped before being + concatenated. + + :types: A tuple of NavigableString subclasses. Any strings of + a subclass not found in this list will be ignored. By + default, this means only NavigableString and CData objects + will be considered. So no comments, processing instructions, + stylesheets, etc. + + :return: A string. + """ + return separator.join([s for s in self._all_strings( + strip, types=types)]) + getText = get_text + text = property(get_text) + + def decompose(self): + """Recursively destroys this PageElement and its children. + + This element will be removed from the tree and wiped out; so + will everything beneath it. + + The behavior of a decomposed PageElement is undefined and you + should never use one for anything, but if you need to _check_ + whether an element has been decomposed, you can use the + `decomposed` property. + """ + self.extract() + i = self + while i is not None: + n = i.next_element + i.__dict__.clear() + i.contents = [] + i._decomposed = True + i = n + + def clear(self, decompose=False): + """Wipe out all children of this PageElement by calling extract() + on them. + + :param decompose: If this is True, decompose() (a more + destructive method) will be called instead of extract(). 
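+
+        A short sketch, assuming the stdlib 'html.parser' builder:
+
+            soup = BeautifulSoup("<div><p>a</p><p>b</p></div>", "html.parser")
+            soup.div.clear()
+            # str(soup.div) is now "<div></div>"; the extracted <p> tags
+            # still exist, but are disconnected from the tree.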
+        """
+        if decompose:
+            for element in self.contents[:]:
+                if isinstance(element, Tag):
+                    element.decompose()
+                else:
+                    element.extract()
+        else:
+            for element in self.contents[:]:
+                element.extract()
+
+    def smooth(self):
+        """Smooth out this element's children by consolidating consecutive
+        strings.
+
+        This makes pretty-printed output look more natural following a
+        lot of operations that modified the tree.
+        """
+        # Mark the first position of every pair of children that need
+        # to be consolidated. Do this rather than making a copy of
+        # self.contents, since in most cases very few strings will be
+        # affected.
+        marked = []
+        for i, a in enumerate(self.contents):
+            if isinstance(a, Tag):
+                # Recursively smooth children.
+                a.smooth()
+            if i == len(self.contents)-1:
+                # This is the last item in .contents; there is no
+                # following sibling to consider consolidating it with.
+                continue
+            b = self.contents[i+1]
+            if (isinstance(a, NavigableString)
+                and isinstance(b, NavigableString)
+                and not isinstance(a, PreformattedString)
+                and not isinstance(b, PreformattedString)
+            ):
+                marked.append(i)
+
+        # Go over the marked positions in reverse order, so that
+        # removing items from .contents won't affect the remaining
+        # positions.
+        for i in reversed(marked):
+            a = self.contents[i]
+            b = self.contents[i+1]
+            b.extract()
+            n = NavigableString(a+b)
+            a.replace_with(n)
+
+    def index(self, element):
+        """Find the index of a child by identity, not value.
+
+        Avoids issues with tag.contents.index(element) getting the
+        index of equal elements.
+
+        :param element: Look for this PageElement in `self.contents`.
+        """
+        for i, child in enumerate(self.contents):
+            if child is element:
+                return i
+        raise ValueError("Tag.index: element not in tag")
+
+    def get(self, key, default=None):
+        """Returns the value of the 'key' attribute for the tag, or
+        the value given for 'default' if it doesn't have that
+        attribute."""
+        return self.attrs.get(key, default)
+
+    def get_attribute_list(self, key, default=None):
+        """The same as get(), but always returns a list.
+
+        :param key: The attribute to look for.
+        :param default: Use this value if the attribute is not present
+            on this PageElement.
+        :return: A list of values, probably containing only a single
+            value.
+        """
+        value = self.get(key, default)
+        if not isinstance(value, list):
+            value = [value]
+        return value
+
+    def has_attr(self, key):
+        """Does this PageElement have an attribute with the given name?"""
+        return key in self.attrs
+
+    def __hash__(self):
+        return str(self).__hash__()
+
+    def __getitem__(self, key):
+        """tag[key] returns the value of the 'key' attribute for the Tag,
+        and throws an exception if it's not there."""
+        return self.attrs[key]
+
+    def __iter__(self):
+        "Iterating over a Tag iterates over its contents."
+        return iter(self.contents)
+
+    def __len__(self):
+        "The length of a Tag is the length of its list of contents."
+        return len(self.contents)
+
+    def __contains__(self, x):
+        return x in self.contents
+
+    def __nonzero__(self):
+        "A tag is always truthy, even if it has no contents."
+        return True
+
+    def __setitem__(self, key, value):
+        """Setting tag[key] sets the value of the 'key' attribute for the
+        tag."""
+        self.attrs[key] = value
+
+    def __delitem__(self, key):
+        "Deleting tag[key] deletes all 'key' attributes for the tag."
+        self.attrs.pop(key, None)
+
+    def __call__(self, *args, **kwargs):
+        """Calling a Tag like a function is the same as calling its
+        find_all() method. Eg.
tag('a') returns a list of all the A tags + found within this tag.""" + return self.find_all(*args, **kwargs) + + def __getattr__(self, tag): + """Calling tag.subtag is the same as calling tag.find(name="subtag")""" + #print("Getattr %s.%s" % (self.__class__, tag)) + if len(tag) > 3 and tag.endswith('Tag'): + # BS3: soup.aTag -> "soup.find("a") + tag_name = tag[:-3] + warnings.warn( + '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( + name=tag_name + ) + ) + return self.find(tag_name) + # We special case contents to avoid recursion. + elif not tag.startswith("__") and not tag == "contents": + return self.find(tag) + raise AttributeError( + "'%s' object has no attribute '%s'" % (self.__class__, tag)) + + def __eq__(self, other): + """Returns true iff this Tag has the same name, the same attributes, + and the same contents (recursively) as `other`.""" + if self is other: + return True + if (not hasattr(other, 'name') or + not hasattr(other, 'attrs') or + not hasattr(other, 'contents') or + self.name != other.name or + self.attrs != other.attrs or + len(self) != len(other)): + return False + for i, my_child in enumerate(self.contents): + if my_child != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this Tag is not identical to `other`, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding="unicode-escape"): + """Renders this PageElement as a string. + + :param encoding: The encoding to use (Python 2 only). + :return: Under Python 2, a bytestring; under Python 3, + a Unicode string. + """ + if PY3K: + # "The return value must be a string object", i.e. Unicode + return self.decode() + else: + # "The return value must be a string object", i.e. a bytestring. + # By convention, the return value of __repr__ should also be + # an ASCII string. + return self.encode(encoding) + + def __unicode__(self): + """Renders this PageElement as a Unicode string.""" + return self.decode() + + def __str__(self): + """Renders this PageElement as a generic string. + + :return: Under Python 2, a UTF-8 bytestring; under Python 3, + a Unicode string. + """ + if PY3K: + return self.decode() + else: + return self.encode() + + if PY3K: + __str__ = __repr__ = __unicode__ + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, + indent_level=None, formatter="minimal", + errors="xmlcharrefreplace"): + """Render a bytestring representation of this PageElement and its + contents. + + :param encoding: The destination encoding. + :param indent_level: Each line of the rendering will be + indented this many spaces. Used internally in + recursive calls while pretty-printing. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + :param errors: An error handling strategy such as + 'xmlcharrefreplace'. This value is passed along into + encode() and its value should be one of the constants + defined by Python. + :return: A bytestring. + + """ + # Turn the data structure into Unicode, then encode the + # Unicode. + u = self.decode(indent_level, encoding, formatter) + return u.encode(encoding, errors) + + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Render a Unicode representation of this PageElement and its + contents. + + :param indent_level: Each line of the rendering will be + indented this many spaces. 
Used internally in + recursive calls while pretty-printing. + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + """ + + # First off, turn a non-Formatter `formatter` into a Formatter + # object. This will stop the lookup from happening over and + # over again. + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + attributes = formatter.attributes(self) + attrs = [] + for key, val in attributes: + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, basestring): + val = unicode(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None + ): + val = val.encode(eventual_encoding) + + text = formatter.attribute_value(val) + decoded = ( + unicode(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) + close = '' + closeTag = '' + + prefix = '' + if self.prefix: + prefix = self.prefix + ":" + + if self.is_empty_element: + close = formatter.void_element_close_prefix or '' + else: + closeTag = '</%s%s>' % (prefix, self.name) + + pretty_print = self._should_pretty_print(indent_level) + space = '' + indent_space = '' + if indent_level is not None: + indent_space = (' ' * (indent_level - 1)) + if pretty_print: + space = indent_space + indent_contents = indent_level + 1 + else: + indent_contents = None + contents = self.decode_contents( + indent_contents, eventual_encoding, formatter + ) + + if self.hidden: + # This is the 'document root' object. + s = contents + else: + s = [] + attribute_string = '' + if attrs: + attribute_string = ' ' + ' '.join(attrs) + if indent_level is not None: + # Even if this particular tag is not pretty-printed, + # we should indent up to the start of the tag. + s.append(indent_space) + s.append('<%s%s%s%s>' % ( + prefix, self.name, attribute_string, close)) + if pretty_print: + s.append("\n") + s.append(contents) + if pretty_print and contents and contents[-1] != "\n": + s.append("\n") + if pretty_print and closeTag: + s.append(space) + s.append(closeTag) + if indent_level is not None and closeTag and self.next_sibling: + # Even if this particular tag is not pretty-printed, + # we're now done with the tag, and we should add a + # newline if appropriate. + s.append("\n") + s = ''.join(s) + return s + + def _should_pretty_print(self, indent_level): + """Should this tag be pretty-printed? + + Most of them should, but some (such as <pre> in HTML + documents) should not. + """ + return ( + indent_level is not None + and ( + not self.preserve_whitespace_tags + or self.name not in self.preserve_whitespace_tags + ) + ) + + def prettify(self, encoding=None, formatter="minimal"): + """Pretty-print this PageElement as a string. + + :param encoding: The eventual encoding of the string. If this is None, + a Unicode string will be returned. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + :return: A Unicode string (if encoding==None) or a bytestring + (otherwise). 
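+
+        A rough sketch of the output shape, assuming the stdlib
+        'html.parser' builder:
+
+            BeautifulSoup("<a><b>text</b></a>", "html.parser").prettify()
+            # '<a>\n <b>\n  text\n </b>\n</a>'
+            # (one tag or string per line, one space per nesting level)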
+        """
+        if encoding is None:
+            return self.decode(True, formatter=formatter)
+        else:
+            return self.encode(encoding, True, formatter=formatter)
+
+    def decode_contents(self, indent_level=None,
+                        eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+                        formatter="minimal"):
+        """Renders the contents of this tag as a Unicode string.
+
+        :param indent_level: Each line of the rendering will be
+            indented this many spaces. Used internally in
+            recursive calls while pretty-printing.
+
+        :param eventual_encoding: The tag is destined to be
+            encoded into this encoding. decode_contents() is _not_
+            responsible for performing that encoding. This information
+            is passed in so that it can be substituted in if the
+            document contains a <META> tag that mentions the document's
+            encoding.
+
+        :param formatter: A Formatter object, or a string naming one of
+            the standard Formatters.
+        """
+        # First off, turn a string formatter into a Formatter object. This
+        # will stop the lookup from happening over and over again.
+        if not isinstance(formatter, Formatter):
+            formatter = self.formatter_for_name(formatter)
+
+        pretty_print = (indent_level is not None)
+        s = []
+        for c in self:
+            text = None
+            if isinstance(c, NavigableString):
+                text = c.output_ready(formatter)
+            elif isinstance(c, Tag):
+                s.append(c.decode(indent_level, eventual_encoding,
+                                  formatter))
+            preserve_whitespace = (
+                self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
+            )
+            if text and indent_level and not preserve_whitespace:
+                text = text.strip()
+            if text:
+                if pretty_print and not preserve_whitespace:
+                    s.append(" " * (indent_level - 1))
+                s.append(text)
+                if pretty_print and not preserve_whitespace:
+                    s.append("\n")
+        return ''.join(s)
+
+    def encode_contents(
+        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
+        formatter="minimal"):
+        """Renders the contents of this PageElement as a bytestring.
+
+        :param indent_level: Each line of the rendering will be
+            indented this many spaces. Used internally in
+            recursive calls while pretty-printing.
+
+        :param encoding: The bytestring will be in this encoding.
+
+        :param formatter: A Formatter object, or a string naming one of
+            the standard Formatters.
+
+        :return: A bytestring.
+        """
+        contents = self.decode_contents(indent_level, encoding, formatter)
+        return contents.encode(encoding)
+
+    # Old method for BS3 compatibility
+    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+                       prettyPrint=False, indentLevel=0):
+        """Deprecated method for BS3 compatibility."""
+        if not prettyPrint:
+            indentLevel = None
+        return self.encode_contents(
+            indent_level=indentLevel, encoding=encoding)
+
+    #Soup methods
+
+    def find(self, name=None, attrs={}, recursive=True, text=None,
+             **kwargs):
+        """Look in the children of this PageElement and find the first
+        PageElement that matches the given criteria.
+
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param recursive: If this is True, find() will perform a
+            recursive search of this PageElement's children. Otherwise,
+            only the direct children will be considered.
+        :param text: A filter for a NavigableString with specific text.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A PageElement.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
+        r = None
+        l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+    findChild = find #BS2
+
+    def find_all(self, name=None, attrs={}, recursive=True, text=None,
+                 limit=None, **kwargs):
+        """Look in the children of this PageElement and find all
+        PageElements that match the given criteria.
+
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param recursive: If this is True, find_all() will perform a
+            recursive search of this PageElement's children. Otherwise,
+            only the direct children will be considered.
+        :param limit: Stop looking after finding this many results.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A ResultSet of PageElements.
+        :rtype: bs4.element.ResultSet
+        """
+        generator = self.descendants
+        if not recursive:
+            generator = self.children
+        return self._find_all(name, attrs, text, limit, generator, **kwargs)
+    findAll = find_all # BS3
+    findChildren = find_all # BS2
+
+    #Generator methods
+    @property
+    def children(self):
+        """Iterate over all direct children of this PageElement.
+
+        :yield: A sequence of PageElements.
+        """
+        # return iter() to make the purpose of the method clear
+        return iter(self.contents) # XXX This seems to be untested.
+
+    @property
+    def descendants(self):
+        """Iterate over all descendants of this PageElement, in the order
+        they occur in the document (a depth-first, pre-order traversal).
+
+        :yield: A sequence of PageElements.
+        """
+        if not len(self.contents):
+            return
+        stopNode = self._last_descendant().next_element
+        current = self.contents[0]
+        while current is not stopNode:
+            yield current
+            current = current.next_element
+
+    # CSS selector code
+    def select_one(self, selector, namespaces=None, **kwargs):
+        """Perform a CSS selection operation on the current element.
+
+        :param selector: A CSS selector.
+
+        :param namespaces: A dictionary mapping namespace prefixes
+            used in the CSS selector to namespace URIs. By default,
+            Beautiful Soup will use the prefixes it encountered while
+            parsing the document.
+
+        :param kwargs: Keyword arguments to be passed into SoupSieve's
+            soupsieve.select() method.
+
+        :return: A Tag.
+        :rtype: bs4.element.Tag
+        """
+        value = self.select(selector, namespaces, 1, **kwargs)
+        if value:
+            return value[0]
+        return None
+
+    def select(self, selector, namespaces=None, limit=None, **kwargs):
+        """Perform a CSS selection operation on the current element.
+
+        This uses the SoupSieve library.
+
+        :param selector: A string containing a CSS selector.
+
+        :param namespaces: A dictionary mapping namespace prefixes
+            used in the CSS selector to namespace URIs. By default,
+            Beautiful Soup will use the prefixes it encountered while
+            parsing the document.
+
+        :param limit: After finding this number of results, stop looking.
+
+        :param kwargs: Keyword arguments to be passed into SoupSieve's
+            soupsieve.select() method.
+
+        :return: A ResultSet of Tags.
+        :rtype: bs4.element.ResultSet
+        """
+        if namespaces is None:
+            namespaces = self._namespaces
+
+        if limit is None:
+            limit = 0
+        if soupsieve is None:
+            raise NotImplementedError(
+                "Cannot execute CSS selectors because the soupsieve package is not installed."
+ ) + + results = soupsieve.select(selector, self, namespaces, limit, **kwargs) + + # We do this because it's more consistent and because + # ResultSet.__getattr__ has a helpful error message. + return ResultSet(None, results) + + # Old names for backwards compatibility + def childGenerator(self): + """Deprecated generator.""" + return self.children + + def recursiveChildGenerator(self): + """Deprecated generator.""" + return self.descendants + + def has_key(self, key): + """Deprecated method. This was kind of misleading because has_key() + (attributes) was different from __in__ (contents). + + has_key() is gone in Python 3, anyway. + """ + warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( + key)) + return self.has_attr(key) + +# Next, a couple classes to represent queries and their results. +class SoupStrainer(object): + """Encapsulates a number of ways of matching a markup element (tag or + string). + + This is primarily used to underpin the find_* methods, but you can + create one yourself and pass it in as `parse_only` to the + `BeautifulSoup` constructor, to parse a subset of a large + document. + """ + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + """Constructor. + + The SoupStrainer constructor takes the same arguments passed + into the find_* methods. See the online documentation for + detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + """ + self.name = self._normalize_search_value(name) + if not isinstance(attrs, dict): + # Treat a non-dict value for attrs as a search for the 'class' + # attribute. + kwargs['class'] = attrs + attrs = None + + if 'class_' in kwargs: + # Treat class_="foo" as a search for the 'class' + # attribute, overriding any non-dict value for attrs. + kwargs['class'] = kwargs['class_'] + del kwargs['class_'] + + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + normalized_attrs = {} + for key, value in attrs.items(): + normalized_attrs[key] = self._normalize_search_value(value) + + self.attrs = normalized_attrs + self.text = self._normalize_search_value(text) + + def _normalize_search_value(self, value): + # Leave it alone if it's a Unicode string, a callable, a + # regular expression, a boolean, or None. + if (isinstance(value, unicode) or isinstance(value, Callable) or hasattr(value, 'match') + or isinstance(value, bool) or value is None): + return value + + # If it's a bytestring, convert it to Unicode, treating it as UTF-8. + if isinstance(value, bytes): + return value.decode("utf8") + + # If it's listlike, convert it into a list of strings. + if hasattr(value, '__iter__'): + new_value = [] + for v in value: + if (hasattr(v, '__iter__') and not isinstance(v, bytes) + and not isinstance(v, unicode)): + # This is almost certainly the user's mistake. In the + # interests of avoiding infinite loops, we'll let + # it through as-is rather than doing a recursive call. + new_value.append(v) + else: + new_value.append(self._normalize_search_value(v)) + return new_value + + # Otherwise, convert it into a Unicode string. + # The unicode(str()) thing is so this will do the same thing on Python 2 + # and Python 3. 
+ return unicode(str(value)) + + def __str__(self): + """A human-readable representation of this SoupStrainer.""" + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def search_tag(self, markup_name=None, markup_attrs={}): + """Check whether a Tag with the given name and attributes would + match this SoupStrainer. + + Used prospectively to decide whether to even bother creating a Tag + object. + + :param markup_name: A tag name as found in some markup. + :param markup_attrs: A dictionary of attributes as found in some markup. + + :return: True if the prospective tag would match this SoupStrainer; + False otherwise. + """ + found = None + markup = None + if isinstance(markup_name, Tag): + markup = markup_name + markup_attrs = markup + + if isinstance(self.name, basestring): + # Optimization for a very common case where the user is + # searching for a tag with one specific name, and we're + # looking at a tag with a different name. + if markup and not markup.prefix and self.name != markup.name: + return False + + call_function_with_tag_data = ( + isinstance(self.name, Callable) + and not isinstance(markup_name, Tag)) + + if ((not self.name) + or call_function_with_tag_data + or (markup and self._matches(markup, self.name)) + or (not markup and self._matches(markup_name, self.name))): + if call_function_with_tag_data: + match = self.name(markup_name, markup_attrs) + else: + match = True + markup_attr_map = None + for attr, match_against in list(self.attrs.items()): + if not markup_attr_map: + if hasattr(markup_attrs, 'get'): + markup_attr_map = markup_attrs + else: + markup_attr_map = {} + for k, v in markup_attrs: + markup_attr_map[k] = v + attr_value = markup_attr_map.get(attr) + if not self._matches(attr_value, match_against): + match = False + break + if match: + if markup: + found = markup + else: + found = markup_name + if found and self.text and not self._matches(found.string, self.text): + found = None + return found + + # For BS3 compatibility. + searchTag = search_tag + + def search(self, markup): + """Find all items in `markup` that match this SoupStrainer. + + Used by the core _find_all() method, which is ultimately + called by all find_* methods. + + :param markup: A PageElement or a list of them. + """ + # print('looking for %s in %s' % (self, markup)) + found = None + # If given a list of items, scan it for a text element that + # matches. + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. + elif isinstance(markup, Tag): + if not self.text or self.name or self.attrs: + found = self.search_tag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ + isinstance(markup, basestring): + if not self.name and not self.attrs and self._matches(markup, self.text): + found = markup + else: + raise Exception( + "I don't know how to match against a %s" % markup.__class__) + return found + + def _matches(self, markup, match_against, already_tried=None): + # print(u"Matching %s against %s" % (markup, match_against)) + result = False + if isinstance(markup, list) or isinstance(markup, tuple): + # This should only happen when searching a multi-valued attribute + # like 'class'. 
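+            # For example, find_all(class_='foo') may be matched against a
+            # tag whose 'class' attribute was parsed into ['foo', 'bar'].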
+            for item in markup:
+                if self._matches(item, match_against):
+                    return True
+            # We didn't match any particular value of the multivalue
+            # attribute, but maybe we match the attribute value when
+            # considered as a string.
+            if self._matches(' '.join(markup), match_against):
+                return True
+            return False
+
+        if match_against is True:
+            # True matches any non-None value.
+            return markup is not None
+
+        if isinstance(match_against, Callable):
+            return match_against(markup)
+
+        # Custom callables take the tag as an argument, but all
+        # other ways of matching match the tag name as a string.
+        original_markup = markup
+        if isinstance(markup, Tag):
+            markup = markup.name
+
+        # Ensure that `markup` is either a Unicode string, or None.
+        markup = self._normalize_search_value(markup)
+
+        if markup is None:
+            # None matches None, False, an empty string, an empty list, and so on.
+            return not match_against
+
+        if (hasattr(match_against, '__iter__')
+            and not isinstance(match_against, basestring)):
+            # We're asked to match against an iterable of items.
+            # The markup must match at least one item in the
+            # iterable. We'll try each one in turn.
+            #
+            # To avoid infinite recursion we need to keep track of
+            # items we've already seen.
+            if not already_tried:
+                already_tried = set()
+            for item in match_against:
+                if item.__hash__:
+                    key = item
+                else:
+                    key = id(item)
+                if key in already_tried:
+                    continue
+                else:
+                    already_tried.add(key)
+                    if self._matches(original_markup, item, already_tried):
+                        return True
+            else:
+                return False
+
+        # Beyond this point we might need to run the test twice: once against
+        # the tag's name and once against its prefixed name.
+        match = False
+
+        if not match and isinstance(match_against, unicode):
+            # Exact string match
+            match = markup == match_against
+
+        if not match and hasattr(match_against, 'search'):
+            # Regexp match
+            return match_against.search(markup)
+
+        if (not match
+            and isinstance(original_markup, Tag)
+            and original_markup.prefix):
+            # Try the whole thing again with the prefixed tag name.
+            return self._matches(
+                original_markup.prefix + ':' + original_markup.name, match_against
+            )
+
+        return match
+
+
+class ResultSet(list):
+    """A ResultSet is just a list that keeps track of the SoupStrainer
+    that created it."""
+    def __init__(self, source, result=()):
+        """Constructor.
+
+        :param source: A SoupStrainer.
+        :param result: A list of PageElements.
+        """
+        super(ResultSet, self).__init__(result)
+        self.source = source
+
+    def __getattr__(self, key):
+        """Raise a helpful exception to explain a common code fix."""
+        raise AttributeError(
+            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
+        )
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/formatter.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/formatter.py
new file mode 100644
index 00000000000..9a692ecae3c
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/formatter.py
@@ -0,0 +1,152 @@
+from bs4.dammit import EntitySubstitution
+
+class Formatter(EntitySubstitution):
+    """Describes a strategy to use when outputting a parse tree to a string.
+
+    Some parts of this strategy come from the distinction between
+    HTML4, HTML5, and XML. Others are configurable by the user.
+
+    Formatters are passed in as the `formatter` argument to methods
+    like `PageElement.encode`.
Most people won't need to think about
+    formatters, and most people who need to think about them can pass
+    in one of these predefined strings as `formatter` rather than
+    making a new Formatter object:
+
+    For HTML documents:
+     * 'html' - HTML entity substitution for generic HTML documents. (default)
+     * 'html5' - HTML entity substitution for HTML5 documents.
+     * 'minimal' - Only make the substitutions necessary to guarantee
+       valid HTML.
+     * None - Do not perform any substitution. This will be faster
+       but may result in invalid markup.
+
+    For XML documents:
+     * 'html' - Entity substitution for XHTML documents.
+     * 'minimal' - Only make the substitutions necessary to guarantee
+       valid XML. (default)
+     * None - Do not perform any substitution. This will be faster
+       but may result in invalid markup.
+    """
+    # Registries of XML and HTML formatters.
+    XML_FORMATTERS = {}
+    HTML_FORMATTERS = {}
+
+    HTML = 'html'
+    XML = 'xml'
+
+    HTML_DEFAULTS = dict(
+        cdata_containing_tags=set(["script", "style"]),
+    )
+
+    def _default(self, language, value, kwarg):
+        if value is not None:
+            return value
+        if language == self.XML:
+            return set()
+        return self.HTML_DEFAULTS[kwarg]
+
+    def __init__(
+            self, language=None, entity_substitution=None,
+            void_element_close_prefix='/', cdata_containing_tags=None,
+    ):
+        """Constructor.
+
+        :param language: This should be Formatter.XML if you are formatting
+           XML markup and Formatter.HTML if you are formatting HTML markup.
+
+        :param entity_substitution: A function to call to replace special
+           characters with XML/HTML entities. For examples, see
+           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
+        :param void_element_close_prefix: By default, void elements
+           are represented as <tag/> (XML rules) rather than <tag>
+           (HTML rules). To get <tag>, pass in the empty string.
+        :param cdata_containing_tags: The list of tags that are defined
+           as containing CDATA in this dialect. For example, in HTML,
+           <script> and <style> tags are defined as containing CDATA,
+           and their contents should not be formatted.
+        """
+        self.language = language
+        self.entity_substitution = entity_substitution
+        self.void_element_close_prefix = void_element_close_prefix
+        self.cdata_containing_tags = self._default(
+            language, cdata_containing_tags, 'cdata_containing_tags'
+        )
+
+    def substitute(self, ns):
+        """Process a string that needs to undergo entity substitution.
+        This may be a string encountered in an attribute value or as
+        text.
+
+        :param ns: A string.
+        :return: A string with certain characters replaced by named
+           or numeric entities.
+        """
+        if not self.entity_substitution:
+            return ns
+        from bs4.element import NavigableString
+        if (isinstance(ns, NavigableString)
+            and ns.parent is not None
+            and ns.parent.name in self.cdata_containing_tags):
+            # Do nothing.
+            return ns
+        # Substitute.
+        return self.entity_substitution(ns)
+
+    def attribute_value(self, value):
+        """Process the value of an attribute.
+
+        :param value: A string.
+        :return: A string with certain characters replaced by named
+           or numeric entities.
+        """
+        return self.substitute(value)
+
+    def attributes(self, tag):
+        """Reorder a tag's attributes however you want.
+
+        By default, attributes are sorted alphabetically. This makes
+        behavior consistent between Python 2 and Python 3, and preserves
+        backwards compatibility with older versions of Beautiful Soup.
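+
+        A subclass sketch (hypothetical, not part of the library): to
+        drop 'class' attributes from the output, one might override this
+        method like so:
+
+            class NoClassFormatter(HTMLFormatter):
+                def attributes(self, tag):
+                    return [(k, v) for k, v in sorted(tag.attrs.items())
+                            if k != 'class']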
+        """
+        if tag.attrs is None:
+            return []
+        return sorted(tag.attrs.items())
+
+
+class HTMLFormatter(Formatter):
+    """A generic Formatter for HTML."""
+    REGISTRY = {}
+    def __init__(self, *args, **kwargs):
+        super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
+
+
+class XMLFormatter(Formatter):
+    """A generic Formatter for XML."""
+    REGISTRY = {}
+    def __init__(self, *args, **kwargs):
+        super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
+
+
+# Set up aliases for the default formatters.
+HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html,
+    void_element_close_prefix=None
+)
+HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+HTMLFormatter.REGISTRY[None] = HTMLFormatter(
+    entity_substitution=None
+)
+XMLFormatter.REGISTRY["html"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+XMLFormatter.REGISTRY[None] = Formatter(
+    Formatter.XML, entity_substitution=None
+)
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/testing.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/testing.py
new file mode 100644
index 00000000000..a2f83a1ad16
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/bs4/testing.py
@@ -0,0 +1,1101 @@
+# encoding: utf-8
+"""Helper classes for tests."""
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+import pickle
+import copy
+import functools
+import unittest
+from unittest import TestCase
+from bs4 import BeautifulSoup
+from bs4.element import (
+    CharsetMetaAttributeValue,
+    Comment,
+    ContentMetaAttributeValue,
+    Doctype,
+    PYTHON_SPECIFIC_ENCODINGS,
+    SoupStrainer,
+    Script,
+    Stylesheet,
+    Tag
+)
+
+from bs4.builder import HTMLParserTreeBuilder
+default_builder = HTMLParserTreeBuilder
+
+BAD_DOCUMENT = u"""A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<!
DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace +<div><table><td nowrap>That boolean attribute had no value</td></table></div> +<div>Here's a nonexistent entity: &#foo; (do you see it?)</div> +<div>This document ends before the entity finishes: > +<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p> +<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b> +<div><table><tr><td>Here's a table</td></tr></table></div> +<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div> +<div>This tag contains nothing but whitespace: <b> </b></div> +<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div> +<div><table><div>This table contains bare markup</div></table></div> +<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div> +<div>This document contains a <!DOCTYPE surprise>surprise doctype</div> +<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div> +<div><our\u2603>Tag name contains Unicode characters</our\u2603></div> +<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +""" + + +class SoupTest(unittest.TestCase): + + @property + def default_builder(self): + return default_builder + + def soup(self, markup, **kwargs): + """Build a Beautiful Soup object from markup.""" + builder = kwargs.pop('builder', self.default_builder) + return BeautifulSoup(markup, builder=builder, **kwargs) + + def document_for(self, markup, **kwargs): + """Turn an HTML fragment into a document. + + The details depend on the builder. + """ + return self.default_builder(**kwargs).test_fragment_to_document(markup) + + def assertSoupEquals(self, to_parse, compare_parsed_to=None): + builder = self.default_builder + obj = BeautifulSoup(to_parse, builder=builder) + if compare_parsed_to is None: + compare_parsed_to = to_parse + + # Verify that the documents come out the same. + self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) + + # Also run some checks on the BeautifulSoup object itself: + + # Verify that every tag that was opened was eventually closed. + + # There are no tags in the open tag counter. + assert all(v==0 for v in obj.open_tag_counter.values()) + + # The only tag in the tag stack is the one for the root + # document. + self.assertEqual( + [obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack] + ) + + def assertConnectedness(self, element): + """Ensure that next_element and previous_element are properly + set for all descendants of the given element. + """ + earlier = None + for e in element.descendants: + if earlier: + self.assertEqual(e, earlier.next_element) + self.assertEqual(earlier, e.previous_element) + earlier = e + + def linkage_validator(self, el, _recursive_call=False): + """Ensure proper linkage throughout the document.""" + descendant = None + # Document element should have no previous element or previous sibling. + # It also shouldn't have a next sibling. 
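+        # For example, in '<a><b>1</b><c>2</c></a>' the next_element chain
+        # runs a -> b -> '1' -> c -> '2', while b's next_sibling is c; the
+        # assertions below check that every node agrees with its neighbors.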
+        if el.parent is None:
+            assert el.previous_element is None,\
+                "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+                    el, el.previous_element, None
+                )
+            assert el.previous_sibling is None,\
+                "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+                    el, el.previous_sibling, None
+                )
+            assert el.next_sibling is None,\
+                "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+                    el, el.next_sibling, None
+                )
+
+        idx = 0
+        child = None
+        last_child = None
+        last_idx = len(el.contents) - 1
+        for child in el.contents:
+            descendant = None
+
+            # Parent should link next element to their first child
+            # That child should have no previous sibling
+            if idx == 0:
+                if el.parent is not None:
+                    assert el.next_element is child,\
+                        "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+                            el, el.next_element, child
+                        )
+                    assert child.previous_element is el,\
+                        "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+                            child, child.previous_element, el
+                        )
+                    assert child.previous_sibling is None,\
+                        "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
+                            child, child.previous_sibling, None
+                        )
+
+            # If not the first child, previous index should link as sibling to this index
+            # Previous element should match the last index or the last bubbled up descendant
+            else:
+                assert child.previous_sibling is el.contents[idx - 1],\
+                    "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
+                        child, child.previous_sibling, el.contents[idx - 1]
+                    )
+                assert el.contents[idx - 1].next_sibling is child,\
+                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                        el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
+                    )
+
+                if last_child is not None:
+                    assert child.previous_element is last_child,\
+                        "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
+                            child, child.previous_element, last_child, child.parent.contents
+                        )
+                    assert last_child.next_element is child,\
+                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                            last_child, last_child.next_element, child
+                        )
+
+            if isinstance(child, Tag) and child.contents:
+                descendant = self.linkage_validator(child, True)
+                # A bubbled up descendant should have no next siblings
+                assert descendant.next_sibling is None,\
+                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                        descendant, descendant.next_sibling, None
+                    )
+
+            # Mark last child as either the bubbled up descendant or the current child
+            if descendant is not None:
+                last_child = descendant
+            else:
+                last_child = child
+
+            # If this is the last child, there are no next siblings
+            if idx == last_idx:
+                assert child.next_sibling is None,\
+                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                        child, child.next_sibling, None
+                    )
+            idx += 1
+
+        child = descendant if descendant is not None else child
+        if child is None:
+            child = el
+
+        if not _recursive_call and child is not None:
+            target = el
+            while True:
+                if target is None:
+                    assert child.next_element is None, \
+                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                            child, child.next_element, None
+                        )
+                    break
+                elif target.next_sibling is not None:
+                    assert child.next_element is target.next_sibling, \
+                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                            child, child.next_element, target.next_sibling
+                        )
+                    break
+                target = target.parent
+
+            # We are done, so nothing to return
+            return None
+        else:
+            # Return the child to the recursive caller
+            return child
+
+
+class HTMLTreeBuilderSmokeTest(object):
+
+    """A basic test of a treebuilder's competence.
+ + Any HTML treebuilder, present or future, should be able to pass + these tests. With invalid markup, there's room for interpretation, + and different parsers can handle it differently. But with the + markup in these tests, there's not much room for interpretation. + """ + + def test_empty_element_tags(self): + """Verify that all HTML4 and HTML5 empty element (aka void element) tags + are handled correctly. + """ + for name in [ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + 'spacer', 'frame' + ]: + soup = self.soup("") + new_tag = soup.new_tag(name) + self.assertEqual(True, new_tag.is_empty_element) + + def test_special_string_containers(self): + soup = self.soup( + "<style>Some CSS</style><script>Some Javascript</script>" + ) + assert isinstance(soup.style.string, Stylesheet) + assert isinstance(soup.script.string, Script) + + soup = self.soup( + "<style><!--Some CSS--></style>" + ) + assert isinstance(soup.style.string, Stylesheet) + # The contents of the style tag resemble an HTML comment, but + # it's not treated as a comment. + self.assertEqual("<!--Some CSS-->", soup.style.string) + assert isinstance(soup.style.string, Stylesheet) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("<a><b>foo</a>") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + + def assertDoctypeHandled(self, doctype_fragment): + """Assert that a given doctype string is handled correctly.""" + doctype_str, soup = self._document_with_doctype(doctype_fragment) + + # Make sure a Doctype object was created. + doctype = soup.contents[0] + self.assertEqual(doctype.__class__, Doctype) + self.assertEqual(doctype, doctype_fragment) + self.assertEqual( + soup.encode("utf8")[:len(doctype_str)], + doctype_str + ) + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. + self.assertEqual(soup.p.contents[0], 'foo') + + def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"): + """Generate and parse a document with the given doctype.""" + doctype = '<!%s %s>' % (doctype_string, doctype_fragment) + markup = doctype + '\n<p>foo</p>' + soup = self.soup(markup) + return doctype.encode("utf8"), soup + + def test_normal_doctypes(self): + """Make sure normal, everyday HTML doctypes are handled correctly.""" + self.assertDoctypeHandled("html") + self.assertDoctypeHandled( + 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + + def test_empty_doctype(self): + soup = self.soup("<!DOCTYPE>") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + + def test_mixed_case_doctype(self): + # A lowercase or mixed-case doctype becomes a Doctype. + for doctype_fragment in ("doctype", "DocType"): + doctype_str, soup = self._document_with_doctype( + "html", doctype_fragment + ) + + # Make sure a Doctype object was created and that the DOCTYPE + # is uppercase. + doctype = soup.contents[0] + self.assertEqual(doctype.__class__, Doctype) + self.assertEqual(doctype, "html") + self.assertEqual( + soup.encode("utf8")[:len(doctype_str)], + b"<!DOCTYPE html>" + ) + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. 
+ self.assertEqual(soup.p.contents[0], 'foo') + + def test_public_doctype_with_url(self): + doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' + self.assertDoctypeHandled(doctype) + + def test_system_doctype(self): + self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') + + def test_namespaced_system_doctype(self): + # We can handle a namespaced doctype with a system ID. + self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + # Test a namespaced doctype with a public id. + self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') + + def test_real_xhtml_document(self): + """A real XHTML document should come out more or less the same as it went in.""" + markup = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head><title>Hello.</title></head> +<body>Goodbye.</body> +</html>""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8").replace(b"\n", b""), + markup.replace(b"\n", b"")) + + def test_namespaced_html(self): + """When a namespaced XML document is parsed as HTML it should + be treated as HTML with weird tag names. + """ + markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>""" + soup = self.soup(markup) + self.assertEqual(2, len(soup.find_all("ns1:foo"))) + + def test_processing_instruction(self): + # We test both Unicode and bytestring to verify that + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. + markup = u"""<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.decode()) + + markup = b"""<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_deepcopy(self): + """Make sure you can copy the tree builder. + + This is important because the builder is part of a + BeautifulSoup object, and we want to be able to copy that. + """ + copy.deepcopy(self.default_builder) + + def test_p_tag_is_never_empty_element(self): + """A <p> tag is never designated as an empty-element tag. + + Even if the markup shows it as an empty-element tag, it + shouldn't be presented that way. + """ + soup = self.soup("<p/>") + self.assertFalse(soup.p.is_empty_element) + self.assertEqual(str(soup.p), "<p></p>") + + def test_unclosed_tags_get_closed(self): + """A tag that's not closed by the end of the document should be closed. + + This applies to all tags except empty-element tags. + """ + self.assertSoupEquals("<p>", "<p></p>") + self.assertSoupEquals("<b>", "<b></b>") + + self.assertSoupEquals("<br>", "<br/>") + + def test_br_is_always_empty_element_tag(self): + """A <br> tag is designated as an empty-element tag. + + Some parsers treat <br></br> as one <br/> tag, some parsers as + two tags, but it should always be an empty-element tag. + """ + soup = self.soup("<br></br>") + self.assertTrue(soup.br.is_empty_element) + self.assertEqual(str(soup.br), "<br/>") + + def test_nested_formatting_elements(self): + self.assertSoupEquals("<em><em></em></em>") + + def test_double_head(self): + html = '''<!DOCTYPE html> +<html> +<head> +<title>Ordinary HEAD element test</title> +</head> +<script type="text/javascript"> +alert("Help!"); +</script> +<body> +Hello, world! 
+</body>
+</html>
+'''
+        soup = self.soup(html)
+        self.assertEqual("text/javascript", soup.find('script')['type'])
+
+    def test_comment(self):
+        # Comments are represented as Comment objects.
+        markup = "<p>foo<!--foobar-->baz</p>"
+        self.assertSoupEquals(markup)
+
+        soup = self.soup(markup)
+        comment = soup.find(text="foobar")
+        self.assertEqual(comment.__class__, Comment)
+
+        # The comment is properly integrated into the tree.
+        foo = soup.find(text="foo")
+        self.assertEqual(comment, foo.next_element)
+        baz = soup.find(text="baz")
+        self.assertEqual(comment, baz.previous_element)
+
+    def test_preserved_whitespace_in_pre_and_textarea(self):
+        """Whitespace must be preserved in <pre> and <textarea> tags,
+        even if that would mean not prettifying the markup.
+        """
+        pre_markup = "<pre>   </pre>"
+        textarea_markup = "<textarea> woo\nwoo  </textarea>"
+        self.assertSoupEquals(pre_markup)
+        self.assertSoupEquals(textarea_markup)
+
+        soup = self.soup(pre_markup)
+        self.assertEqual(soup.pre.prettify(), pre_markup)
+
+        soup = self.soup(textarea_markup)
+        self.assertEqual(soup.textarea.prettify(), textarea_markup)
+
+        soup = self.soup("<textarea></textarea>")
+        self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")
+
+    def test_nested_inline_elements(self):
+        """Inline elements can be nested indefinitely."""
+        b_tag = "<b>Inside a B tag</b>"
+        self.assertSoupEquals(b_tag)
+
+        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
+        self.assertSoupEquals(nested_b_tag)
+
+        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
+        self.assertSoupEquals(double_nested_b_tag)
+
+    def test_nested_block_level_elements(self):
+        """Block elements can be nested."""
+        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
+        blockquote = soup.blockquote
+        self.assertEqual(blockquote.p.b.string, 'Foo')
+        self.assertEqual(blockquote.b.string, 'Foo')
+
+    def test_correctly_nested_tables(self):
+        """One table can go inside another one."""
+        markup = ('<table id="1">'
+                  '<tr>'
+                  "<td>Here's another table:"
+                  '<table id="2">'
+                  '<tr><td>foo</td></tr>'
+                  '</table></td>')
+
+        self.assertSoupEquals(
+            markup,
+            '<table id="1"><tr><td>Here\'s another table:'
+            '<table id="2"><tr><td>foo</td></tr></table>'
+            '</td></tr></table>')
+
+        self.assertSoupEquals(
+            "<table><thead><tr><td>Foo</td></tr></thead>"
+            "<tbody><tr><td>Bar</td></tr></tbody>"
+            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+
+    def test_multivalued_attribute_with_whitespace(self):
+        # Whitespace separating the values of a multi-valued attribute
+        # should be ignored.
+
+        markup = '<div class=" foo bar "></div>'
+        soup = self.soup(markup)
+        self.assertEqual(['foo', 'bar'], soup.div['class'])
+
+        # If you search by the literal name of the class it's like the whitespace
+        # wasn't there.
+        self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
+
+    def test_deeply_nested_multivalued_attribute(self):
+        # html5lib can set the attributes of the same tag many times
+        # as it rearranges the tree. This has caused problems with
+        # multivalued attributes.
+        markup = '<table><div><div class="css"></div></div></table>'
+        soup = self.soup(markup)
+        self.assertEqual(["css"], soup.div.div['class'])
+
+    def test_multivalued_attribute_on_html(self):
+        # html5lib uses a different API to set the attributes of the
+        # <html> tag. This has caused problems with multivalued
+        # attributes.
+        markup = '<html class="a b"></html>'
+        soup = self.soup(markup)
+        self.assertEqual(["a", "b"], soup.html['class'])
+
+    def test_angle_brackets_in_attribute_values_are_escaped(self):
+        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
+
+    def test_strings_resembling_character_entity_references(self):
+        # "&T" and "&p" look like incomplete character entities, but they are
+        # not.
+        self.assertSoupEquals(
+            u"<p>&bull; AT&T is in the s&p 500</p>",
+            u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
+        )
+
+    def test_apos_entity(self):
+        self.assertSoupEquals(
+            u"<p>Bob&apos;s Bar</p>",
+            u"<p>Bob's Bar</p>",
+        )
+
+    def test_entities_in_foreign_document_encoding(self):
+        # &#147; and &#148; are invalid numeric entities referencing
+        # Windows-1252 characters. &#45; references a character common
+        # to Windows-1252 and Unicode, and &#9731; references a
+        # character only found in Unicode.
+        #
+        # All of these entities should be converted to Unicode
+        # characters.
+        markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
+        soup = self.soup(markup)
+        self.assertEqual(u"“Hello” -☃", soup.p.string)
+
+    def test_entities_in_attributes_converted_to_unicode(self):
+        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
+        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
+
+    def test_entities_in_text_converted_to_unicode(self):
+        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
+        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
+
+    def test_quot_entity_converted_to_quotation_mark(self):
+        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
+                              '<p>I said "good day!"</p>')
+
+    def test_out_of_range_entity(self):
+        expect = u"\N{REPLACEMENT CHARACTER}"
+        self.assertSoupEquals("&#10000000000000;", expect)
+        self.assertSoupEquals("&#x10000000000000;", expect)
+        self.assertSoupEquals("&#1000000000;", expect)
+
+    def test_multipart_strings(self):
+        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
+        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
+        self.assertEqual("p", soup.h2.string.next_element.name)
+        self.assertEqual("p", soup.p.name)
+        self.assertConnectedness(soup)
+
+    def test_empty_element_tags_rendered_consistently(self):
+        """Verify consistent handling of empty-element tags,
+        no matter how they come in through the markup.
+        """
+        self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
+        self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
+
+    def test_head_tag_between_head_and_body(self):
+        "Prevent recurrence of a bug in the html5lib treebuilder."
+        content = """<html><head></head>
+  <link></link>
+  <body>foo</body>
+</html>
+"""
+        soup = self.soup(content)
+        self.assertNotEqual(None, soup.html.body)
+        self.assertConnectedness(soup)
+
+    def test_multiple_copies_of_a_tag(self):
+        "Prevent recurrence of a bug in the html5lib treebuilder."
+        content = """<!DOCTYPE html>
+<html>
+ <body>
+  <article id="a" >
+   <div><a href="1"></div>
+   <footer>
+    <a href="2"></a>
+   </footer>
+  </article>
+ </body>
+</html>
+"""
+        soup = self.soup(content)
+        self.assertConnectedness(soup.article)
+
+    def test_basic_namespaces(self):
+        """Parsers don't need to *understand* namespaces, but at the
+        very least they should not choke on namespaces or lose
+        data."""
+
+        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode())
+        html = soup.html
+        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
+        self.assertEqual(
+            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
+        self.assertEqual(
+            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
+
+    def test_multivalued_attribute_value_becomes_list(self):
+        markup = b'<a class="foo bar">'
+        soup = self.soup(markup)
+        self.assertEqual(['foo', 'bar'], soup.a['class'])
+
+    #
+    # Generally speaking, tests below this point are more tests of
+    # Beautiful Soup than tests of the tree builders. But parsers are
+    # weird, so we run these tests separately for every tree builder
+    # to detect any differences between them.
+    #
+
+    def test_can_parse_unicode_document(self):
+        # A seemingly innocuous document... but it's in Unicode! And
+        # it contains characters that can't be represented in the
+        # encoding found in the declaration! The horror!
+        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+        soup = self.soup(markup)
+        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+
+    def test_soupstrainer(self):
+        """Parsers should be able to work with SoupStrainers."""
+        strainer = SoupStrainer("b")
+        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
+                         parse_only=strainer)
+        self.assertEqual(soup.decode(), "<b>bold</b>")
+
+    def test_single_quote_attribute_values_become_double_quotes(self):
+        self.assertSoupEquals("<foo attr='bar'></foo>",
+                              '<foo attr="bar"></foo>')
+
+    def test_attribute_values_with_nested_quotes_are_left_alone(self):
+        text = """<foo attr='bar "brawls" happen'>a</foo>"""
+        self.assertSoupEquals(text)
+
+    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
+        text = """<foo attr='bar "brawls" happen'>a</foo>"""
+        soup = self.soup(text)
+        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
+        self.assertSoupEquals(
+            soup.foo.decode(),
+            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
+
+    def test_ampersand_in_attribute_value_gets_escaped(self):
+        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
+                              '<this is="really messed up &amp; stuff"></this>')
+
+        self.assertSoupEquals(
+            '<a href="http://example.org?a=1&b=2;3">foo</a>',
+            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
+
+    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
+        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
+
+    def test_entities_in_strings_converted_during_parsing(self):
+        # Both XML and HTML entities are converted to Unicode characters
+        # during parsing.
+        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
+        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
+        self.assertSoupEquals(text, expected)
+
+    def test_smart_quotes_converted_on_the_way_in(self):
+        # Microsoft smart quotes are converted to Unicode characters during
+        # parsing.
+        quote = b"<p>\x91Foo\x92</p>"
+        soup = self.soup(quote)
+        self.assertEqual(
+            soup.p.string,
+            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+
+    def test_non_breaking_spaces_converted_on_the_way_in(self):
+        soup = self.soup("<a>&nbsp;&nbsp;</a>")
+        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+
+    def test_entities_converted_on_the_way_out(self):
+        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
+        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
+        soup = self.soup(text)
+        self.assertEqual(soup.p.encode("utf-8"), expected)
+
+    def test_real_iso_latin_document(self):
+        # Smoke test of interrelated functionality, using an
+        # easy-to-understand document.
+
+        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
+        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+
+        # That's because we're going to encode it into ISO-Latin-1, and use
+        # that to test.
+        iso_latin_html = unicode_html.encode("iso-8859-1")
+
+        # Parse the ISO-Latin-1 HTML.
+        soup = self.soup(iso_latin_html)
+        # Encode it to UTF-8.
+        result = soup.encode("utf-8")
+
+        # What do we expect the result to look like? Well, it would
+        # look like unicode_html, except that the META tag would say
+        # UTF-8 instead of ISO-Latin-1.
+        expected = unicode_html.replace("ISO-Latin-1", "utf-8")
+
+        # And, of course, it would be in UTF-8, not Unicode.
+        expected = expected.encode("utf-8")
+
+        # Ta-da!
+        self.assertEqual(result, expected)
+
+    def test_real_shift_jis_document(self):
+        # Smoke test to make sure the parser can handle a document in
+        # Shift-JIS encoding, without choking.
+        shift_jis_html = (
+            b'<html><head></head><body><pre>'
+            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
+            b'</pre></body></html>')
+        unicode_html = shift_jis_html.decode("shift-jis")
+        soup = self.soup(unicode_html)
+
+        # Make sure the parse tree is correctly encoded to various
+        # encodings.
+        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
+        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
+
+    def test_real_hebrew_document(self):
+        # A real-world test to make sure we can convert ISO-8859-8 (a
+        # Hebrew encoding) to UTF-8.
+        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
+        soup = self.soup(
+            hebrew_document, from_encoding="iso8859-8")
+        # Some tree builders call it iso8859-8, others call it iso-8859-8.
+        # That's not a difference we really care about.
+        assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
+        self.assertEqual(
+            soup.encode('utf-8'),
+            hebrew_document.decode("iso8859-8").encode("utf-8"))
+
+    def test_meta_tag_reflects_current_encoding(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type"/>')
+
+        # Here's a document incorporating that meta tag.
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja"/>'
+            '</head><body>Shift-JIS markup goes here.') % meta_tag
+        soup = self.soup(shift_jis_html)
+
+        # Parse the document, and the charset is seemingly unaffected.
+        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
+        content = parsed_meta['content']
+        self.assertEqual('text/html; charset=x-sjis', content)
+
+        # But that value is actually a ContentMetaAttributeValue object.
+        self.assertTrue(isinstance(content, ContentMetaAttributeValue))
+
+        # And it will take on a value that reflects its current
+        # encoding.
+        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
+
+        # For the rest of the story, see TestSubstitutions in
+        # test_tree.py.
+
+    def test_html5_style_meta_tag_reflects_current_encoding(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta id="encoding" charset="x-sjis" />')
+
+        # Here's a document incorporating that meta tag.
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja"/>'
+            '</head><body>Shift-JIS markup goes here.') % meta_tag
+        soup = self.soup(shift_jis_html)
+
+        # Parse the document, and the charset is seemingly unaffected.
+        parsed_meta = soup.find('meta', id="encoding")
+        charset = parsed_meta['charset']
+        self.assertEqual('x-sjis', charset)
+
+        # But that value is actually a CharsetMetaAttributeValue object.
+        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
+
+        # And it will take on a value that reflects its current
+        # encoding.
+        self.assertEqual('utf8', charset.encode("utf8"))
+
+    def test_python_specific_encodings_not_used_in_charset(self):
+        # You can encode an HTML document using a Python-specific
+        # encoding, but that encoding won't be mentioned _inside_ the
+        # resulting document. Instead, the document will appear to
+        # have no encoding.
+        for markup in [
+            b'<meta charset="utf8"></head>',
+            b'<meta id="encoding" charset="utf-8" />',
+        ]:
+            soup = self.soup(markup)
+            for encoding in PYTHON_SPECIFIC_ENCODINGS:
+                if encoding in (
+                    u'idna', u'mbcs', u'oem', u'undefined',
+                    u'string_escape', u'string-escape'
+                ):
+                    # For one reason or another, these will raise an
+                    # exception if we actually try to use them, so don't
+                    # bother.
+                    continue
+                encoded = soup.encode(encoding)
+                assert b'meta charset=""' in encoded
+                assert encoding.encode("ascii") not in encoded
+
+    def test_tag_with_no_attributes_can_have_attributes_added(self):
+        data = self.soup("<a>text</a>")
+        data.a['foo'] = 'bar'
+        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
+
+    def test_closing_tag_with_no_opening_tag(self):
+        # Without BeautifulSoup.open_tag_counter, the </span> tag will
+        # cause _popToTag to be called over and over again as we look
+        # for a <span> tag that wasn't there. The result is that 'text2'
+        # will show up outside the body of the document.
+ soup = self.soup("<body><div><p>text1</p></span>text2</div></body>") + self.assertEqual( + "<body><div><p>text1</p>text2</div></body>", soup.body.decode() + ) + + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + +class XMLTreeBuilderSmokeTest(object): + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("<a><b>foo</a>") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + + def test_docstring_generated(self): + soup = self.soup("<root/>") + self.assertEqual( + soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>') + + def test_xml_declaration(self): + markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_python_specific_encodings_not_used_in_xml_declaration(self): + # You can encode an XML document using a Python-specific + # encoding, but that encoding won't be mentioned _inside_ the + # resulting document. + markup = b"""<?xml version="1.0"?>\n<foo/>""" + soup = self.soup(markup) + for encoding in PYTHON_SPECIFIC_ENCODINGS: + if encoding in ( + u'idna', u'mbcs', u'oem', u'undefined', + u'string_escape', u'string-escape' + ): + # For one reason or another, these will raise an + # exception if we actually try to use them, so don't + # bother. + continue + encoded = soup.encode(encoding) + assert b'<?xml version="1.0"?>' in encoded + assert encoding.encode("ascii") not in encoded + + def test_processing_instruction(self): + markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_real_xhtml_document(self): + """A real XHTML document should come out *exactly* the same as it went in.""" + markup = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head><title>Hello.</title></head> +<body>Goodbye.</body> +</html>""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8"), markup) + + def test_nested_namespaces(self): + doc = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> +<parent xmlns="http://ns1/"> +<child xmlns="http://ns2/" xmlns:ns3="http://ns3/"> +<grandchild ns3:attr="value" xmlns="http://ns4/"/> +</child> +</parent>""" + soup = self.soup(doc) + self.assertEqual(doc, soup.encode()) + + def test_formatter_processes_script_tag_for_xml_documents(self): + doc = """ + <script type="text/javascript"> + </script> +""" + soup = BeautifulSoup(doc, "lxml-xml") + # lxml would have stripped this while parsing, but we can add + # it later. 
+        soup.script.string = 'console.log("< < hey > > ");'
+        encoded = soup.encode()
+        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
+
+    def test_can_parse_unicode_document(self):
+        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        soup = self.soup(markup)
+        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+
+    def test_popping_namespaced_tag(self):
+        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
+        soup = self.soup(markup)
+        self.assertEqual(
+            unicode(soup.rss), markup)
+
+    def test_docstring_includes_correct_encoding(self):
+        soup = self.soup("<root/>")
+        self.assertEqual(
+            soup.encode("latin1"),
+            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
+
+    def test_large_xml_document(self):
+        """A large XML document should come out the same as it went in."""
+        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+                  + b'0' * (2**12)
+                  + b'</root>')
+        soup = self.soup(markup)
+        self.assertEqual(soup.encode("utf-8"), markup)
+
+
+    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
+        self.assertSoupEquals("<p>", "<p/>")
+        self.assertSoupEquals("<p>foo</p>")
+
+    def test_namespaces_are_preserved(self):
+        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
+        soup = self.soup(markup)
+        root = soup.root
+        self.assertEqual("http://example.com/", root['xmlns:a'])
+        self.assertEqual("http://example.net/", root['xmlns:b'])
+
+    def test_closing_namespaced_tag(self):
+        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
+        soup = self.soup(markup)
+        self.assertEqual(unicode(soup.p), markup)
+
+    def test_namespaced_attributes(self):
+        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
+        soup = self.soup(markup)
+        self.assertEqual(unicode(soup.foo), markup)
+
+    def test_namespaced_attributes_xml_namespace(self):
+        markup = '<foo xml:lang="fr">bar</foo>'
+        soup = self.soup(markup)
+        self.assertEqual(unicode(soup.foo), markup)
+
+    def test_find_by_prefixed_name(self):
+        doc = """<?xml version="1.0" encoding="utf-8"?>
+<Document xmlns="http://example.com/ns0"
+     xmlns:ns1="http://example.com/ns1"
+     xmlns:ns2="http://example.com/ns2">
+    <ns1:tag>foo</ns1:tag>
+    <ns1:tag>bar</ns1:tag>
+    <ns2:tag key="value">baz</ns2:tag>
+</Document>
+"""
+        soup = self.soup(doc)
+
+        # There are three <tag> tags.
+        self.assertEqual(3, len(soup.find_all('tag')))
+
+        # But two of them are ns1:tag and one of them is ns2:tag.
+        self.assertEqual(2, len(soup.find_all('ns1:tag')))
+        self.assertEqual(1, len(soup.find_all('ns2:tag')))
+
+        self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
+        self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
+
+    def test_copy_tag_preserves_namespace(self):
+        xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<w:document xmlns:w="http://example.com/ns0"/>"""
+
+        soup = self.soup(xml)
+        tag = soup.document
+        duplicate = copy.copy(tag)
+
+        # The two tags have the same namespace prefix.
+        self.assertEqual(tag.prefix, duplicate.prefix)
+
+    def test_worst_case(self):
+        """Test the worst case (currently) for linking issues."""
+
+        soup = self.soup(BAD_DOCUMENT)
+        self.linkage_validator(soup)
+
+
+class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
+    """Smoke test for a tree builder that supports HTML5."""
+
+    def test_real_xhtml_document(self):
+        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
+        # XHTML documents in any particular way.
+        pass
+
+    def test_html_tags_have_namespace(self):
+        markup = "<a>"
+        soup = self.soup(markup)
+        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
+
+    def test_svg_tags_have_namespace(self):
+        markup = '<svg><circle/></svg>'
+        soup = self.soup(markup)
+        namespace = "http://www.w3.org/2000/svg"
+        self.assertEqual(namespace, soup.svg.namespace)
+        self.assertEqual(namespace, soup.circle.namespace)
+
+
+    def test_mathml_tags_have_namespace(self):
+        markup = '<math><msqrt>5</msqrt></math>'
+        soup = self.soup(markup)
+        namespace = 'http://www.w3.org/1998/Math/MathML'
+        self.assertEqual(namespace, soup.math.namespace)
+        self.assertEqual(namespace, soup.msqrt.namespace)
+
+    def test_xml_declaration_becomes_comment(self):
+        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
+        soup = self.soup(markup)
+        self.assertTrue(isinstance(soup.contents[0], Comment))
+        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
+        self.assertEqual("html", soup.contents[0].next_element.name)
+
+def skipIf(condition, reason):
+    def nothing(test, *args, **kwargs):
+        return None
+
+    def decorator(test_item):
+        if condition:
+            return nothing
+        else:
+            return test_item
+
+    return decorator
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/convert-py3k b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/convert-py3k
new file mode 100755
index 00000000000..05fab53ca27
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/convert-py3k
@@ -0,0 +1,16 @@
+#!/bin/sh
+#
+# The Python 2 source is the definitive source. This script uses 2to3
+# to create a new py3k/bs4 source tree that works under Python 3.
+#
+# See README.txt to see how to run the test suite after conversion.
+echo "About to destroy and rebuild the py3k/bs4 directory."
+echo "If you've got stuff in there, Ctrl-C out of this script or answer 'n'."
+mkdir -p py3k
+rm -rfI py3k/bs4
+cp -r bs4/ py3k/
+2to3 -w py3k
+echo ""
+echo "OK, conversion is done."
+echo "Now running the unit tests."
+(cd py3k && python3 -m unittest discover -s bs4)
\ No newline at end of file
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/Makefile b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/Makefile
new file mode 100644
index 00000000000..8c833d2cedb
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/Makefile
@@ -0,0 +1,130 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and an HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	make -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/source/6.1.jpg b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/source/6.1.jpg
Binary files differ
new file mode 100644
index 00000000000..97014f0ec04
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/source/6.1.jpg
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/source/conf.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/source/conf.py
new file mode 100644
index 00000000000..cd679b5cb25
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/source/conf.py
@@ -0,0 +1,256 @@
+# -*- coding: utf-8 -*-
+#
+# Beautiful Soup documentation build configuration file, created by
+# sphinx-quickstart on Thu Jan 26 11:22:55 2012.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, as shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Beautiful Soup' +copyright = u'2004-2015, Leonard Richardson' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '4' +# The full version, including alpha/beta/rc tags. +release = '4.4.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. 
+#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'BeautifulSoupdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation', + u'Leonard Richardson', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'beautifulsoup', u'Beautiful Soup Documentation', + [u'Leonard Richardson'], 1) +] + + +# -- Options for Epub output --------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = u'Beautiful Soup' +epub_author = u'Leonard Richardson' +epub_publisher = u'Leonard Richardson' +epub_copyright = u'2012, Leonard Richardson' + +# The language of the text. It defaults to the language option +# or en if the language is not set. +#epub_language = '' + +# The scheme of the identifier. Typical schemes are ISBN or URL. +#epub_scheme = '' + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +#epub_identifier = '' + +# A unique identification for the text. +#epub_uid = '' + +# HTML files that should be inserted before the pages created by sphinx. +# The format is a list of tuples containing the path and title. 
+#epub_pre_files = []
+
+# HTML files that should be inserted after the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#epub_post_files = []
+
+# A list of files that should not be packed into the epub file.
+#epub_exclude_files = []
+
+# The depth of the table of contents in toc.ncx.
+#epub_tocdepth = 3
+
+# Allow duplicate toc entries.
+#epub_tocdup = True
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/source/index.rst b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/source/index.rst
new file mode 100644
index 00000000000..2b6a5f9324e
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ptbr/source/index.rst
@@ -0,0 +1,3266 @@
+Beautiful Soup Documentation
+============================
+
+.. image:: 6.1.jpg
+   :align: right
+   :alt: "The Fish-Footman began by producing from under his arm a great letter, nearly as large as himself."
+
+
+`Beautiful Soup <http://www.crummy.com/software/BeautifulSoup/>`_ is a
+Python library for pulling data out of HTML and XML files. It works
+with your favorite parser to provide idiomatic ways of navigating,
+searching, and modifying the parse tree. It commonly saves programmers
+hours or days of work.
+
+These instructions illustrate the major features of Beautiful Soup 4,
+with examples. I show you what the library is good for, how it works,
+how to use it, how to make it do what you want, and what to do when it
+violates your expectations.
+
+The examples in this documentation should work the same way in Python
+2.7 and Python 3.2.
+
+`You might be looking for the documentation for Beautiful Soup 3
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_.
+If so, you should know that Beautiful Soup 3 is no longer being
+developed, and that Beautiful Soup 4 is recommended for all new
+projects. If you want to learn about the differences between Beautiful
+Soup 3 and Beautiful Soup 4, see `Porting code to BS4`_.
+
+This documentation has been translated into other languages by
+Beautiful Soup users:
+
+* `这篇文档当然还有中文版. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/>`_
+* このページは日本語で利用できます(`外部リンク <http://kondou.com/BS4/>`_)
+* `이 문서는 한국어 번역도 가능합니다. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.ko/>`_
+* `Эта документация доступна на русском языке. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.ru/>`_
+
+Getting help
+------------
+
+If you have questions about Beautiful Soup, or run into problems,
+`send mail to the discussion group
+<https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup>`_. If
+your problem involves parsing an HTML document, be sure to mention
+:ref:`what the diagnose() function says <diagnose>` about that document.
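+For example, here is a minimal sketch of how you might run it (the
+filename ``bad.html`` is just a stand-in for whatever markup is giving
+you trouble)::
+
+    from bs4.diagnose import diagnose
+
+    with open("bad.html") as fp:
+        data = fp.read()
+
+    # diagnose() tries the markup against every installed parser and
+    # prints a report of what each one sees.
+    diagnose(data)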
+
+Quick Start
+===========
+
+Here's an HTML document I'll be using as an example throughout this
+document. It's part of a story from `Alice in Wonderland`::
+
+    html_doc = """
+    <html><head><title>The Dormouse's story</title></head>
+    <body>
+    <p class="title"><b>The Dormouse's story</b></p>
+
+    <p class="story">Once upon a time there were three little sisters; and their names were
+    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+    and they lived at the bottom of a well.</p>
+
+    <p class="story">...</p>
+    """
+
+Running the "three sisters" document through Beautiful Soup gives us a
+``BeautifulSoup`` object, which represents the document as a nested
+data structure::
+
+    from bs4 import BeautifulSoup
+    soup = BeautifulSoup(html_doc, 'html.parser')
+
+    print(soup.prettify())
+    # <html>
+    #  <head>
+    #   <title>
+    #    The Dormouse's story
+    #   </title>
+    #  </head>
+    #  <body>
+    #   <p class="title">
+    #    <b>
+    #     The Dormouse's story
+    #    </b>
+    #   </p>
+    #   <p class="story">
+    #    Once upon a time there were three little sisters; and their names were
+    #    <a class="sister" href="http://example.com/elsie" id="link1">
+    #     Elsie
+    #    </a>
+    #    ,
+    #    <a class="sister" href="http://example.com/lacie" id="link2">
+    #     Lacie
+    #    </a>
+    #    and
+    #    <a class="sister" href="http://example.com/tillie" id="link3">
+    #     Tillie
+    #    </a>
+    #    ; and they lived at the bottom of a well.
+    #   </p>
+    #   <p class="story">
+    #    ...
+    #   </p>
+    #  </body>
+    # </html>
+
+Here are some simple ways to navigate that data structure::
+
+    soup.title
+    # <title>The Dormouse's story</title>
+
+    soup.title.name
+    # u'title'
+
+    soup.title.string
+    # u'The Dormouse's story'
+
+    soup.title.parent.name
+    # u'head'
+
+    soup.p
+    # <p class="title"><b>The Dormouse's story</b></p>
+
+    soup.p['class']
+    # u'title'
+
+    soup.a
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+    soup.find_all('a')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.find(id="link3")
+    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+One common task is extracting all the URLs found within a page's <a> tags::
+
+    for link in soup.find_all('a'):
+        print(link.get('href'))
+    # http://example.com/elsie
+    # http://example.com/lacie
+    # http://example.com/tillie
+
+Another common task is extracting all the text from a page::
+
+    print(soup.get_text())
+    # The Dormouse's story
+    #
+    # The Dormouse's story
+    #
+    # Once upon a time there were three little sisters; and their names were
+    # Elsie,
+    # Lacie and
+    # Tillie;
+    # and they lived at the bottom of a well.
+    #
+    # ...
+
+Does this look like what you need? If so, read on.
+
+Installing Beautiful Soup
+=========================
+
+If you're using a recent version of Debian or Ubuntu Linux, you can
+install Beautiful Soup with the system package manager:
+
+:kbd:`$ apt-get install python-bs4` (for Python 2)
+
+:kbd:`$ apt-get install python3-bs4` (for Python 3)
+
+Beautiful Soup 4 is also published through PyPi, so if you can't
+install it with the system packager, you can install it with
+``easy_install`` or ``pip``.
+The package name is ``beautifulsoup4``, and the same package works on
+Python 2 and Python 3. Make sure you use the right version of ``pip``
+or ``easy_install`` for your Python version (these may be named
+``pip3`` and ``easy_install3`` respectively, if you're using Python 3).
+
+
+:kbd:`$ easy_install beautifulsoup4`
+
+:kbd:`$ pip install beautifulsoup4`
+
+(The ``BeautifulSoup`` package is probably `not` what you want. That's
+the previous major release, `Beautiful Soup 3`_. Lots of software uses
+BS3, so it's still available, but if you're writing new code you
+should install ``beautifulsoup4``.)
+
+If you don't have ``easy_install`` or ``pip`` installed, you can
+`download the Beautiful Soup 4 source tarball
+<http://www.crummy.com/software/BeautifulSoup/download/4.x/>`_ and
+install it with ``setup.py``.
+
+:kbd:`$ python setup.py install`
+
+If all else fails, the license for Beautiful Soup allows you to
+package the entire library with your application. You can download the
+tarball, copy its ``bs4`` directory into your application's codebase,
+and use Beautiful Soup without installing it at all.
+
+I use Python 2.7 and Python 3.2 to develop Beautiful Soup, but it
+should work with other recent versions.
+
+Problems after installation
+---------------------------
+
+Beautiful Soup is packaged as Python 2 code. When you install it for
+use with Python 3, it's automatically converted to Python 3 code. If
+you don't install the package, the code won't be converted. There have
+also been reports of the wrong version being installed on Windows
+machines.
+
+If you get the ``ImportError`` "No module named HTMLParser", your
+problem is that you're running the Python 2 version of the code under
+Python 3.
+
+If you get the ``ImportError`` "No module named html.parser", your
+problem is that you're running the Python 3 version of the code under
+Python 2.
+
+In both cases, your best bet is to completely remove the Beautiful
+Soup installation from your system (including any directory created
+when you unzipped the tarball) and try the installation again.
+
+If you get the ``SyntaxError`` "Invalid syntax" on the line
+``ROOT_TAG_NAME = u'[document]'``, you need to convert the Python 2
+code to Python 3. You can do this either by installing the package:
+
+:kbd:`$ python3 setup.py install`
+
+or by manually running Python's ``2to3`` conversion script on the
+``bs4`` directory:
+
+:kbd:`$ 2to3-3.2 -w bs4`
+
+.. _parser-installation:
+
+
+Installing a parser
+-------------------
+
+Beautiful Soup supports the HTML parser included in Python's standard
+library, but it also supports a number of third-party Python parsers.
+One is the `lxml parser <http://lxml.de/>`_. Depending on your setup,
+you might install lxml with one of these commands:
+
+:kbd:`$ apt-get install python-lxml`
+
+:kbd:`$ easy_install lxml`
+
+:kbd:`$ pip install lxml`
+
+Another alternative is the pure-Python `html5lib parser
+<http://code.google.com/p/html5lib/>`_, which parses HTML the way a
+web browser does.
+Depending on your setup, you might install html5lib with one of these
+commands:
+
+:kbd:`$ apt-get install python-html5lib`
+
+:kbd:`$ easy_install html5lib`
+
+:kbd:`$ pip install html5lib`
+
+This table summarizes the advantages and disadvantages of each parser library:
+
++----------------------+-------------------------------------------+---------------------------------+---------------------------+
+| Parser               | Typical usage                             | Advantages                      | Disadvantages             |
++----------------------+-------------------------------------------+---------------------------------+---------------------------+
+| Python's html.parser | ``BeautifulSoup(markup, "html.parser")``  | * Batteries included            | * Not as fast as lxml,    |
+|                      |                                           | * Decent speed                  |   less lenient than       |
+|                      |                                           | * Lenient (as of Python 2.7.3   |   html5lib.               |
+|                      |                                           |   and 3.2.)                     |                           |
++----------------------+-------------------------------------------+---------------------------------+---------------------------+
+| lxml's HTML parser   | ``BeautifulSoup(markup, "lxml")``         | * Very fast                     | * External C dependency   |
+|                      |                                           | * Lenient                       |                           |
++----------------------+-------------------------------------------+---------------------------------+---------------------------+
+| lxml's XML parser    | ``BeautifulSoup(markup, "lxml-xml")``     | * Very fast                     | * External C dependency   |
+|                      | ``BeautifulSoup(markup, "xml")``          | * The only currently supported  |                           |
+|                      |                                           |   XML parser                    |                           |
++----------------------+-------------------------------------------+---------------------------------+---------------------------+
+| html5lib             | ``BeautifulSoup(markup, "html5lib")``     | * Extremely lenient             | * Very slow               |
+|                      |                                           | * Parses pages the same way a   | * External Python         |
+|                      |                                           |   web browser does              |   dependency              |
+|                      |                                           | * Creates valid HTML5           |                           |
++----------------------+-------------------------------------------+---------------------------------+---------------------------+
+
+If you can, I recommend you install and use lxml for speed. If you're
+using a version of Python 2 earlier than 2.7.3, or a version of
+Python 3 earlier than 3.2.2, it's `essential` that you install lxml or
+html5lib. Python's built-in HTML parser is just not very good in those
+old versions.
+
+Note that if a document is invalid, different parsers will generate
+different Beautiful Soup trees for it. See :ref:`Differences between
+parsers <differences-between-parsers>` for details.
+
+
+Making the soup
+===============
+
+To parse a document, pass it into the ``BeautifulSoup``
+constructor. You can pass in a string or an open filehandle::
+
+    from bs4 import BeautifulSoup
+
+    with open("index.html") as fp:
+        soup = BeautifulSoup(fp)
+
+    soup = BeautifulSoup("<html>data</html>")
+
+First, the document is converted to Unicode, and HTML entities are
+converted to Unicode characters::
+
+    BeautifulSoup("Sacr&eacute; bleu!")
+    <html><head></head><body>Sacré bleu!</body></html>
+
+Beautiful Soup then parses the document using the best available
+parser. It will use an HTML parser unless you specifically tell it to
+use an XML parser. (See `Parsing XML`_.)
+
+Kinds of objects
+================
+
+Beautiful Soup transforms a complex HTML document into a complex tree
+of Python objects. But you'll only ever have to deal with about four
+`kinds` of objects: ``Tag``, ``NavigableString``, ``BeautifulSoup``,
+and ``Comment``.
+
+.. _Tag:
+
+``Tag``
+-------
+
+A ``Tag`` object corresponds to an XML or HTML tag in the original
+document::
+
+    soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
+    tag = soup.b
+    type(tag)
+    # <class 'bs4.element.Tag'>
+
+Tags have a lot of attributes and methods, and I'll cover most of them
+in `Navigating the tree`_ and `Searching the tree`_. For now, the most
+important features of a tag are its name and attributes.
+
+Name
+^^^^
+
+Every tag has a name, accessible as ``.name``::
+
+    tag.name
+    # u'b'
+
+If you change a tag's name, the change will be reflected in any HTML
+markup generated by Beautiful Soup::
+
+    tag.name = "blockquote"
+    tag
+    # <blockquote class="boldest">Extremely bold</blockquote>
+
+Attributes
+^^^^^^^^^^
+
+A tag may have any number of attributes. The tag ``<b id="boldest">``
+has an attribute "id" whose value is "boldest". You can access a tag's
+attributes by treating the tag like a dictionary::
+
+    tag['id']
+    # u'boldest'
+
+You can access that dictionary directly as ``.attrs``::
+
+    tag.attrs
+    # {u'id': 'boldest'}
+
+You can add, remove, and modify a tag's attributes. Again, this is
+done by treating the tag as a dictionary::
+
+    tag['id'] = 'verybold'
+    tag['another-attribute'] = 1
+    tag
+    # <b another-attribute="1" id="verybold"></b>
+
+    del tag['id']
+    del tag['another-attribute']
+    tag
+    # <b></b>
+
+    tag['id']
+    # KeyError: 'id'
+    print(tag.get('id'))
+    # None
+
+.. _multivalue:
+
+Multi-valued attributes
+&&&&&&&&&&&&&&&&&&&&&&&
+
+HTML 4 defines a few attributes that can have multiple values. HTML 5
+removes a couple of them, but defines a few more. The most common
+multi-valued attribute is ``class`` (that is, a tag can have more than
+one CSS class). Others include ``rel``, ``rev``, ``accept-charset``,
+``headers``, and ``accesskey``.
+
+O Beautiful Soup apresenta o(s) valor(es) de um atributo deste tipo como uma lista::
+
+    css_soup = BeautifulSoup('<p class="body"></p>')
+    css_soup.p['class']
+    # ["body"]
+
+    css_soup = BeautifulSoup('<p class="body strikeout"></p>')
+    css_soup.p['class']
+    # ["body", "strikeout"]
+
+Se um atributo possui mais de um valor, mas não é um atributo
+que aceita múltiplos valores conforme definido por qualquer versão do
+padrão HTML, o Beautiful Soup o retornará como um valor único::
+
+    id_soup = BeautifulSoup('<p id="my id"></p>')
+    id_soup.p['id']
+    # 'my id'
+
+Quando a tag é transformada novamente em string, os valores de um atributo
+de múltiplos valores são consolidados::
+
+    rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
+    rel_soup.a['rel']
+    # ['index']
+    rel_soup.a['rel'] = ['index', 'contents']
+    print(rel_soup.p)
+    # <p>Back to the <a rel="index contents">homepage</a></p>
+
+Você pode desabilitar esta opção passando ``multi_valued_attributes=None`` como argumento
+para o construtor ``BeautifulSoup``::
+
+    no_list_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html', multi_valued_attributes=None)
+    no_list_soup.p['class']
+    # u'body strikeout'
+
+Você pode utilizar ``get_attribute_list`` para obter um valor sempre no formato de lista,
+seja um atributo de múltiplos valores ou não::
+
+    id_soup.p.get_attribute_list('id')
+    # ["my id"]
+
+Se você analisar um documento como XML, nenhum atributo será tratado como de múltiplos valores::
+
+    xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
+    xml_soup.p['class']
+    # u'body strikeout'
+
+Novamente, você pode configurar isto usando o argumento ``multi_valued_attributes``::
+
+    class_is_multi= { '*' : 'class'}
+    xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml', multi_valued_attributes=class_is_multi)
+    xml_soup.p['class']
+    # [u'body', u'strikeout']
+
+Você provavelmente não precisará fazer isso, mas, se fizer, use os padrões como guia.
+Eles implementam as regras descritas na especificação do HTML::
+
+    from bs4.builder import builder_registry
+    builder_registry.lookup('html').DEFAULT_CDATA_LIST_ATTRIBUTES
+
+
+``NavigableString``
+-------------------
+
+Uma string corresponde a um texto dentro de uma tag.
+O Beautiful Soup usa a classe ``NavigableString`` para armazenar este texto::
+
+    tag.string
+    # u'Extremely bold'
+    type(tag.string)
+    # <class 'bs4.element.NavigableString'>
+
+Uma ``NavigableString`` é como uma string Unicode do Python, exceto
+que ela também suporta algumas características descritas em `Navegando pela árvore`_
+e `Buscando na árvore`_. Você pode converter uma
+``NavigableString`` em uma string Unicode utilizando ``unicode()``::
+
+    unicode_string = unicode(tag.string)
+    unicode_string
+    # u'Extremely bold'
+    type(unicode_string)
+    # <type 'unicode'>
+
+Você não pode editar uma string "in place", mas pode substituir
+uma string por outra usando :ref:`replace_with()`::
+
+    tag.string.replace_with("No longer bold")
+    tag
+    # <b>No longer bold</b>
+
+``NavigableString`` suporta a maior parte das características descritas em
+`Navegando pela árvore`_ e `Buscando na árvore`_, mas não todas.
+Em particular, como uma string não pode conter nada (da maneira que
+uma tag pode conter uma string ou outra tag), as strings não suportam os
+atributos ``.contents`` e ``.string``, nem o método ``find()``.
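+
+Por exemplo, um esboço mínimo (supondo o parser ``html.parser`` da biblioteca
+padrão) do que uma string suporta e do que ela não suporta::
+
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup('<b>Extremely bold</b>', 'html.parser')
+    s = soup.b.string
+
+    s.parent             # a navegação "para cima" funciona
+    # <b>Extremely bold</b>
+    s.find_parent('b')   # as buscas "para cima" também funcionam
+    # <b>Extremely bold</b>
+    s.contents           # mas uma string não tem filhos
+    # AttributeError: 'NavigableString' object has no attribute 'contents'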
+
+Se você quiser utilizar uma ``NavigableString`` fora do Beautiful Soup,
+você deve chamar ``unicode()`` para transformá-la em uma string Unicode Python
+padrão. Se você não fizer isso, sua string irá carregar uma referência para toda a
+árvore Beautiful Soup, mesmo depois que você tiver terminado de usar o
+Beautiful Soup, o que é um grande desperdício de memória.
+
+``BeautifulSoup``
+-----------------
+
+O objeto ``BeautifulSoup`` em si representa o documento como um todo.
+Para a maioria dos propósitos, você pode tratá-lo como um objeto :ref:`Tag`.
+Isso significa que ele suporta a maioria dos métodos descritos em
+`Navegando pela árvore`_ e `Buscando na árvore`_.
+
+Como o objeto ``BeautifulSoup`` não corresponde a uma tag
+HTML ou XML propriamente dita, ele não tem nome nem atributos. Mas, como em alguns
+casos é útil observar seu ``.name``, foi dado a ele o ``.name`` especial
+"[document]"::
+
+    soup.name
+    # u'[document]'
+
+Comentários e outras strings especiais
+--------------------------------------
+
+``Tag``, ``NavigableString`` e ``BeautifulSoup`` abrangem quase
+tudo o que você encontrará em um arquivo HTML ou XML, mas restam alguns
+objetos. O único deles com que você provavelmente precisará se preocupar
+é o comentário::
+
+    markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
+    soup = BeautifulSoup(markup)
+    comment = soup.b.string
+    type(comment)
+    # <class 'bs4.element.Comment'>
+
+O objeto ``Comment`` é apenas um tipo especial de ``NavigableString``::
+
+    comment
+    # u'Hey, buddy. Want to buy a used parser'
+
+Mas, quando aparece como parte de um documento HTML, um ``Comment`` é
+exibido com uma formatação especial::
+
+    print(soup.b.prettify())
+    # <b>
+    #  <!--Hey, buddy. Want to buy a used parser?-->
+    # </b>
+
+O Beautiful Soup define classes para qualquer outra coisa que possa
+aparecer em um documento XML: ``CData``, ``ProcessingInstruction``,
+``Declaration`` e ``Doctype``. Assim como ``Comment``, estas classes
+são subclasses de ``NavigableString`` que adicionam algo à string.
+Aqui está um exemplo que substitui o comentário por um bloco CDATA::
+
+    from bs4 import CData
+    cdata = CData("A CDATA block")
+    comment.replace_with(cdata)
+
+    print(soup.b.prettify())
+    # <b>
+    #  <![CDATA[A CDATA block]]>
+    # </b>
+
+
+Navegando pela árvore
+=====================
+
+Aqui está o documento HTML "Three sisters" novamente::
+
+    html_doc = """
+    <html><head><title>The Dormouse's story</title></head>
+    <body>
+    <p class="title"><b>The Dormouse's story</b></p>
+
+    <p class="story">Once upon a time there were three little sisters; and their names were
+    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+    and they lived at the bottom of a well.</p>
+
+    <p class="story">...</p>
+    """
+
+    from bs4 import BeautifulSoup
+    soup = BeautifulSoup(html_doc, 'html.parser')
+
+Eu usarei este documento como exemplo para mostrar como navegar
+de uma parte para outra do documento.
+
+Descendo na Árvore
+------------------
+
+As tags podem conter strings e outras tags. Estes elementos são as tags
+`filhas` (children) da tag. O Beautiful Soup oferece diferentes atributos para
+navegar e iterar sobre as tags filhas.
+
+Note que as strings Beautiful Soup não suportam nenhum destes atributos,
+porque uma string não pode ter filhos.
+
+Navegar usando os nomes das tags
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A maneira mais simples de navegar pela árvore é utilizar
+o nome da tag que você quer. Se você quer a tag <head>,
+simplesmente use ``soup.head``::
+
+    soup.head
+    # <head><title>The Dormouse's story</title></head>
+
+    soup.title
+    # <title>The Dormouse's story</title>
+
+Você pode usar este truque de novo, e de novo, para focar em certa parte da
+árvore de análise. Este código retorna a primeira tag <b> abaixo da tag <body>::
+
+    soup.body.b
+    # <b>The Dormouse's story</b>
+
+Utilizar o nome da tag como atributo lhe retornará apenas a `primeira`
+tag com aquele nome::
+
+    soup.a
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+Se você precisar de `todas` as tags <a>, ou de algo mais complicado
+que a primeira tag com um certo nome, você precisará utilizar um dos
+métodos descritos em `Buscando na árvore`_, como ``find_all()``::
+
+    soup.find_all('a')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+``.contents`` e ``.children``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As tags filhas de uma tag estão disponíveis em uma lista chamada ``.contents``::
+
+    head_tag = soup.head
+    head_tag
+    # <head><title>The Dormouse's story</title></head>
+
+    head_tag.contents
+    # [<title>The Dormouse's story</title>]
+
+    title_tag = head_tag.contents[0]
+    title_tag
+    # <title>The Dormouse's story</title>
+    title_tag.contents
+    # [u'The Dormouse's story']
+
+O objeto ``BeautifulSoup`` em si possui filhas. Neste caso, a tag
+<html> é a filha do objeto ``BeautifulSoup``::
+
+    len(soup.contents)
+    # 1
+    soup.contents[0].name
+    # u'html'
+
+Uma string não possui o atributo ``.contents``, porque ela não pode conter
+nada::
+
+    text = title_tag.contents[0]
+    text.contents
+    # AttributeError: 'NavigableString' object has no attribute 'contents'
+
+Ao invés de obtê-las como uma lista, você pode iterar sobre as
+filhas de uma tag usando o gerador ``.children``::
+
+    for child in title_tag.children:
+        print(child)
+    # The Dormouse's story
+
+``.descendants``
+^^^^^^^^^^^^^^^^
+
+Os atributos ``.contents`` e ``.children`` somente consideram as
+`filhas diretas` de uma tag. Por exemplo, a tag <head> tem apenas uma tag filha direta,
+a tag <title>::
+
+    head_tag.contents
+    # [<title>The Dormouse's story</title>]
+
+Mas a tag <title> em si possui uma filha: a string "The Dormouse's story".
+Em certo sentido, esta string também é filha da tag <head>.
+O atributo ``.descendants`` permite que você itere sobre `todas`
+as filhas de uma tag, recursivamente: suas filhas diretas, as filhas de suas filhas, e assim por diante::
+
+    for child in head_tag.descendants:
+        print(child)
+    # <title>The Dormouse's story</title>
+    # The Dormouse's story
+
+A tag <head> possui apenas uma filha, mas possui dois `descendentes`:
+a tag <title> e a filha da tag <title>. O objeto ``BeautifulSoup`` possui apenas
+uma filha direta (a tag <html>), mas possui vários descendentes::
+
+    len(list(soup.children))
+    # 1
+    len(list(soup.descendants))
+    # 25
+
+.. _.string:
+
+``.string``
+^^^^^^^^^^^
+
+Se uma tag possui apenas uma filha, e esta filha é uma ``NavigableString``,
+esta filha pode ser acessada através de ``.string``::
+
+    title_tag.string
+    # u'The Dormouse's story'
+
+Se a única filha de uma tag é outra tag e esta tag possui uma
+``.string``, então considera-se que a tag mãe tem a mesma
+``.string`` de sua filha::
+
+    head_tag.contents
+    # [<title>The Dormouse's story</title>]
+
+    head_tag.string
+    # u'The Dormouse's story'
+
+Se uma tag contém mais de uma coisa, então não fica claro a que
+``.string`` deveria se referir, portanto ``.string`` será definida como
+``None``::
+
+    print(soup.html.string)
+    # None
+
+.. _string-generators:
+
+``.strings`` e ``stripped_strings``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Se houver mais de uma coisa dentro de uma tag, você ainda pode olhar
+apenas para as strings. Use o gerador ``.strings``::
+
+    for string in soup.strings:
+        print(repr(string))
+    # u"The Dormouse's story"
+    # u'\n\n'
+    # u"The Dormouse's story"
+    # u'\n\n'
+    # u'Once upon a time there were three little sisters; and their names were\n'
+    # u'Elsie'
+    # u',\n'
+    # u'Lacie'
+    # u' and\n'
+    # u'Tillie'
+    # u';\nand they lived at the bottom of a well.'
+    # u'\n\n'
+    # u'...'
+    # u'\n'
+
+Estas strings tendem a conter muitos espaços em branco, os quais você
+pode remover utilizando, como alternativa, o gerador ``.stripped_strings``::
+
+    for string in soup.stripped_strings:
+        print(repr(string))
+    # u"The Dormouse's story"
+    # u"The Dormouse's story"
+    # u'Once upon a time there were three little sisters; and their names were'
+    # u'Elsie'
+    # u','
+    # u'Lacie'
+    # u'and'
+    # u'Tillie'
+    # u';\nand they lived at the bottom of a well.'
+    # u'...'
+
+Aqui, strings formadas inteiramente por espaços em branco são ignoradas,
+e os espaços em branco no início e no fim das strings são removidos.
+
+Subindo na Árvore
+-----------------
+
+Continuando a analogia da árvore como "família", toda tag e toda string possuem
+uma `mãe` (parent): a tag que as contém.
+
+.. _.parent:
+
+``.parent``
+^^^^^^^^^^^
+
+Você pode acessar o elemento mãe com o atributo ``.parent``. No
+exemplo "three sisters", a tag <head> é a mãe da tag <title>::
+
+    title_tag = soup.title
+    title_tag
+    # <title>The Dormouse's story</title>
+    title_tag.parent
+    # <head><title>The Dormouse's story</title></head>
+
+A string de title também tem uma mãe: a tag <title> que a contém::
+
+    title_tag.string.parent
+    # <title>The Dormouse's story</title>
+
+A mãe de uma tag de nível mais alto, como <html>, é o próprio objeto ``BeautifulSoup``::
+
+    html_tag = soup.html
+    type(html_tag.parent)
+    # <class 'bs4.BeautifulSoup'>
+
+E o ``.parent`` de um objeto ``BeautifulSoup`` é definido como None::
+
+    print(soup.parent)
+    # None
+
+.. _.parents:
+
+``.parents``
+^^^^^^^^^^^^
+
+Você pode iterar sobre todas as mães de um elemento com ``.parents``.
+Este exemplo usa ``.parents`` para viajar de uma tag <a> enterrada
+profundamente no documento até o topo da árvore do documento::
+
+    link = soup.a
+    link
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+    for parent in link.parents:
+        if parent is None:
+            print(parent)
+        else:
+            print(parent.name)
+    # p
+    # body
+    # html
+    # [document]
+    # None
+
+Navegando para os lados
+-----------------------
+
+Considere um documento simples como este::
+
+    sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>")
+    print(sibling_soup.prettify())
+    # <html>
+    #  <body>
+    #   <a>
+    #    <b>
+    #     text1
+    #    </b>
+    #    <c>
+    #     text2
+    #    </c>
+    #   </a>
+    #  </body>
+    # </html>
+
+A tag <b> e a tag <c> estão no mesmo nível: ambas são filhas diretas
+da mesma tag. Nós podemos chamá-las de irmãs (`siblings`).
+Quando um documento é exibido com pretty-print, as irmãs aparecem no mesmo nível de indentação.
+Você pode utilizar esta relação no código que você escrever.
+
+``.next_sibling`` e ``.previous_sibling``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Você pode usar ``.next_sibling`` e ``.previous_sibling`` para navegar
+entre os elementos da página que estão no mesmo nível da árvore::
+
+    sibling_soup.b.next_sibling
+    # <c>text2</c>
+
+    sibling_soup.c.previous_sibling
+    # <b>text1</b>
+
+A tag <b> possui ``.next_sibling``, mas não ``.previous_sibling``,
+porque não há nada antes da tag <b> `no mesmo nível da árvore`.
+Pela mesma razão, a tag <c> possui ``.previous_sibling``,
+mas não ``.next_sibling``::
+
+    print(sibling_soup.b.previous_sibling)
+    # None
+    print(sibling_soup.c.next_sibling)
+    # None
+
+As strings "text1" e "text2" `não` são irmãs, porque elas não têm a mesma tag mãe::
+
+    sibling_soup.b.string
+    # u'text1'
+
+    print(sibling_soup.b.string.next_sibling)
+    # None
+
+No mundo real, o ``.next_sibling`` ou o ``.previous_sibling`` de uma tag
+geralmente será uma string contendo espaços em branco. Voltando ao documento
+"three sisters"::
+
+    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
+    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
+    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
+
+Você pode pensar que o ``.next_sibling`` da primeira tag <a> será a segunda tag <a>.
+Mas, na verdade, é uma string: a vírgula e o caractere de nova linha (\n) que separam
+a primeira da segunda tag <a>::
+
+    link = soup.a
+    link
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+    link.next_sibling
+    # u',\n'
+
+A segunda tag <a> é, na verdade, a ``.next_sibling`` da vírgula::
+
+    link.next_sibling.next_sibling
+    # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+
+.. _sibling-generators:
+
+``.next_siblings`` e ``.previous_siblings``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Você pode iterar sobre as irmãs de uma tag com ``.next_siblings``
+ou ``.previous_siblings``::
+
+    for sibling in soup.a.next_siblings:
+        print(repr(sibling))
+    # u',\n'
+    # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+    # u' and\n'
+    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+    # u'; and they lived at the bottom of a well.'
+    # None
+
+    for sibling in soup.find(id="link3").previous_siblings:
+        print(repr(sibling))
+    # u' and\n'
+    # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+    # u',\n'
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+    # u'Once upon a time there were three little sisters; and their names were\n'
+    # None
+
+Indo e voltando
+----------------
+
+Dê uma olhada no início do documento "three sisters"::
+
+    <html><head><title>The Dormouse's story</title></head>
+    <p class="title"><b>The Dormouse's story</b></p>
+
+Um parser HTML transforma esta string em uma série de eventos: "abrir
+uma tag <html>", "abrir uma tag <head>", "abrir uma tag <title>",
+"adicionar uma string", "fechar a tag <title>",
+"abrir uma tag <p>" e daí por diante. O Beautiful Soup oferece ferramentas
+para reconstruir a análise inicial do documento.
+
+.. _element-generators:
+
+``.next_element`` e ``.previous_element``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+O atributo ``.next_element`` de uma string ou tag aponta para
+aquilo que foi interpretado imediatamente depois dela.
+Pode ser o mesmo que ``.next_sibling``, mas geralmente é
+drasticamente diferente.
+
+Aqui está a tag <a> final do "three sisters". Sua
+``.next_sibling`` é uma string: a conclusão da sentença
+que foi interrompida pelo início da tag <a>::
+
+    last_a_tag = soup.find("a", id="link3")
+    last_a_tag
+    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+    last_a_tag.next_sibling
+    # '; and they lived at the bottom of a well.'
+
+Mas o ``.next_element`` dessa tag <a>, aquilo que foi analisado imediatamente
+depois da tag <a>, `não` é o resto da sentença: é a palavra "Tillie"::
+
+    last_a_tag.next_element
+    # u'Tillie'
+
+Isso porque, na marcação original, a palavra "Tillie" apareceu
+antes do ponto e vírgula. O parser encontrou uma tag <a>, depois
+a palavra "Tillie", depois a tag de fechamento </a>, depois o ponto e vírgula e o
+resto da sentença. O ponto e vírgula está no mesmo nível que a tag <a>,
+mas a palavra "Tillie" foi encontrada primeiro.
+
+O atributo ``.previous_element`` é exatamente o oposto de
+``.next_element``. Ele aponta para o que quer que tenha sido
+analisado imediatamente antes do elemento em questão::
+
+    last_a_tag.previous_element
+    # u' and\n'
+    last_a_tag.previous_element.next_element
+    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+``.next_elements`` e ``.previous_elements``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Você já deve ter entendido a ideia. Você pode usar estes iteradores
+para andar para a frente e para trás no documento, na ordem em que ele foi analisado::
+
+    for element in last_a_tag.next_elements:
+        print(repr(element))
+    # u'Tillie'
+    # u';\nand they lived at the bottom of a well.'
+    # u'\n\n'
+    # <p class="story">...</p>
+    # u'...'
+    # u'\n'
+    # None
+
+Buscando na árvore
+==================
+
+O Beautiful Soup define vários métodos para buscar na árvore analisada,
+mas todos eles são muito similares. Usarei a maior parte do tempo para explicar
+os dois métodos mais populares: ``find()`` e ``find_all()``. Os outros métodos
+recebem exatamente os mesmos argumentos, portanto vou cobri-los apenas brevemente.
+
+
+Mais uma vez, utilizarei o documento "three sisters" como exemplo::
+
+    html_doc = """
+    <html><head><title>The Dormouse's story</title></head>
+    <body>
+    <p class="title"><b>The Dormouse's story</b></p>
+
+    <p class="story">Once upon a time there were three little sisters; and their names were
+    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+    and they lived at the bottom of a well.</p>
+
+    <p class="story">...</p>
+    """
+
+    from bs4 import BeautifulSoup
+    soup = BeautifulSoup(html_doc, 'html.parser')
+
+Passando um filtro para um argumento de um método como ``find_all()``, você pode
+"dar um zoom" nas partes do documento em que está interessado.
+
+Tipos de filtros
+----------------
+
+Antes de entrar em detalhes sobre o ``find_all()`` e métodos similares,
+quero mostrar exemplos dos diferentes filtros que você pode passar para
+estes métodos. Estes filtros aparecerão de novo e de novo por toda a API
+de busca. Você pode usá-los para filtrar com base nos nomes das tags,
+nos seus atributos, no texto de uma string ou em alguma combinação entre eles.
+
+.. _uma string:
+
+Uma string
+^^^^^^^^^^
+
+O filtro mais simples é uma string. Passando uma string para um método de busca,
+o Beautiful Soup irá buscar uma correspondência exata com esta string. O seguinte
+código encontra todas as tags <b> do documento::
+
+    soup.find_all('b')
+    # [<b>The Dormouse's story</b>]
+
+Se você passar uma byte string, o Beautiful Soup assumirá que a string
+está codificada como UTF-8. Você pode evitar isso passando, ao invés disso,
+uma string Unicode.
+
+.. _uma expressão regular:
+
+Uma expressão regular (regex)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Se você passar um objeto de expressão regular (`regex`), o Beautiful Soup irá
+filtrar com ela utilizando seu método ``search()``.
+O código seguinte busca todas as tags cujos nomes comecem com
+a letra "b"; neste caso, a tag <body> e a tag <b>::
+
+    import re
+    for tag in soup.find_all(re.compile("^b")):
+        print(tag.name)
+    # body
+    # b
+
+Este código busca todas as tags cujos nomes contenham a letra "t"::
+
+    for tag in soup.find_all(re.compile("t")):
+        print(tag.name)
+    # html
+    # title
+
+.. _uma lista:
+
+Uma lista
+^^^^^^^^^
+
+Se você passar uma lista, o Beautiful Soup irá buscar
+uma correspondência com qualquer item desta lista.
+O código seguinte busca todas as tags <a> e todas
+as tags <b>::
+
+    soup.find_all(["a", "b"])
+    # [<b>The Dormouse's story</b>,
+    #  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+.. _the value True:
+
+``True``
+^^^^^^^^
+
+O valor ``True`` corresponde a tudo.
+O código abaixo encontra `todas` as tags do documento,
+mas nenhuma das strings de texto::
+
+    for tag in soup.find_all(True):
+        print(tag.name)
+    # html
+    # head
+    # title
+    # body
+    # p
+    # b
+    # p
+    # a
+    # a
+    # a
+    # p
+
+.. _a function:
+
+Uma função
+^^^^^^^^^^
+
+Se nenhuma das opções anteriores funcionar para você, defina uma
+função que receba um elemento como seu único argumento. A função
+deverá retornar ``True`` se o argumento corresponder e ``False``
+caso contrário.
+
+Aqui está uma função que retorna ``True`` se uma tag definir
+o atributo "class", mas não definir o atributo "id"::
+
+    def has_class_but_no_id(tag):
+        return tag.has_attr('class') and not tag.has_attr('id')
+
+Passe esta função para ``find_all()`` e você obterá todas
+as tags <p>::
+
+    soup.find_all(has_class_but_no_id)
+    # [<p class="title"><b>The Dormouse's story</b></p>,
+    #  <p class="story">Once upon a time there were...</p>,
+    #  <p class="story">...</p>]
+
+Esta função encontra apenas as tags <p>. Ela não encontra as tags <a>,
+porque elas definem "class" e "id" ao mesmo tempo. E não encontra
+as tags <html> e <title>, porque estas tags não definem o atributo
+"class".
+
+Se você passar uma função para filtrar um atributo específico, como
+``href``, o argumento passado para a função será o `valor` do atributo, e
+não a tag inteira. Aqui vemos uma função que encontra todas as tags <a>
+cujo atributo ``href`` não corresponde a uma expressão regular::
+
+    def not_lacie(href):
+        return href and not re.compile("lacie").search(href)
+
+    soup.find_all(href=not_lacie)
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+A função pode ser tão complexa quanto você precisar.
+Aqui temos uma função que retorna ``True`` se uma tag está
+cercada por objetos string::
+
+    from bs4 import NavigableString
+    def surrounded_by_strings(tag):
+        return (isinstance(tag.next_element, NavigableString)
+                and isinstance(tag.previous_element, NavigableString))
+
+    for tag in soup.find_all(surrounded_by_strings):
+        print(tag.name)
+    # p
+    # a
+    # a
+    # a
+    # p
+
+Agora estamos prontos para olhar os métodos de busca em detalhes.
+
+``find_all()``
+--------------
+
+Assinatura: find_all(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive
+<recursive>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+O método ``find_all()`` busca entre os descendentes de uma tag e retorna todos os
+descendentes que correspondem a seus filtros. Dei diversos exemplos em `Tipos de filtros`_,
+mas aqui estão mais alguns::
+
+    soup.find_all("title")
+    # [<title>The Dormouse's story</title>]
+
+    soup.find_all("p", "title")
+    # [<p class="title"><b>The Dormouse's story</b></p>]
+
+    soup.find_all("a")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.find_all(id="link2")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+    import re
+    soup.find(string=re.compile("sisters"))
+    # u'Once upon a time there were three little sisters; and their names were\n'
+
+Alguns destes filtros podem parecer familiares, mas outros são novos.
+O que significa passar um valor para ``string`` ou ``id``? Por que
+``find_all("p", "title")`` encontra uma tag <p> com a classe CSS "title"?
+Vamos dar uma olhada nos argumentos de ``find_all()``.
+
+.. _name:
+
+O argumento ``name``
+^^^^^^^^^^^^^^^^^^^^
+
+Passe um valor para ``name`` e você dirá ao Beautiful Soup para
+considerar apenas as tags com certos nomes.
+Strings de texto serão ignoradas,
+assim como as tags cujos nomes não corresponderem ao argumento ``name``.
+
+Este é o uso mais simples::
+
+    soup.find_all("title")
+    # [<title>The Dormouse's story</title>]
+
+Lembre-se, de `Tipos de filtros`_, que o valor de ``name`` pode ser `uma
+string`_, `uma expressão regular`_, `uma lista`_, `uma função`_ ou
+:ref:`o valor True <the value True>`.
+
+.. _kwargs:
+
+Os argumentos "palavras-chave"
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Qualquer argumento que não for reconhecido se tornará um filtro sobre um
+dos atributos da tag. Se você passar um valor para um argumento
+chamado ``id``, o Beautiful Soup irá filtrar pelo atributo ``id`` de
+cada tag::
+
+    soup.find_all(id='link2')
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+Se você passar um valor para ``href``, o Beautiful Soup irá filtrar
+pelo atributo ``href`` de cada tag::
+
+    soup.find_all(href=re.compile("elsie"))
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+Você pode filtrar um atributo com base em `uma string`_,
+`uma expressão regular`_, `uma lista`_, `uma função`_ ou
+:ref:`o valor True <the value True>`.
+
+Este código encontra todas as tags cujo atributo ``id``
+possui um valor, independente de qual valor seja::
+
+    soup.find_all(id=True)
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Você pode filtrar múltiplos atributos de uma vez passando mais de um argumento
+palavra-chave::
+
+    soup.find_all(href=re.compile("elsie"), id='link1')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+Alguns atributos, como os atributos data-* do HTML 5, possuem nomes que não
+podem ser usados como argumentos palavra-chave::
+
+    data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
+    data_soup.find_all(data-foo="value")
+    # SyntaxError: keyword can't be an expression
+
+Você pode usar estes atributos em buscas colocando-os
+em um dicionário e passando o dicionário para ``find_all()`` como o argumento
+``attrs``::
+
+    data_soup.find_all(attrs={"data-foo": "value"})
+    # [<div data-foo="value">foo!</div>]
+
+Você não pode utilizar um argumento palavra-chave para buscar pelo atributo
+"name" do HTML, porque o Beautiful Soup utiliza o argumento ``name`` para
+conter o nome da própria tag. Ao invés disso, você pode passar o valor de
+"name" no argumento ``attrs``::
+
+    name_soup = BeautifulSoup('<input name="email"/>')
+    name_soup.find_all(name="email")
+    # []
+    name_soup.find_all(attrs={"name": "email"})
+    # [<input name="email"/>]
+
+.. _attrs:
+
+Buscando por uma classe CSS
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+É muito útil buscar por uma tag que tenha uma certa classe CSS, mas
+o nome do atributo CSS, "class", é uma palavra reservada no Python.
+Utilizar ``class`` como um argumento palavra-chave lhe trará um erro
+de sintaxe, como no exemplo abaixo.
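+
+Um esboço ilustrativo (a mensagem exata do erro varia conforme a
+versão do Python)::
+
+    soup.find_all("a", class="sister")
+    # SyntaxError: invalid syntax
+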
+A partir do Beautiful Soup 4.1.2, você pode buscar por uma
+classe CSS utilizando o argumento palavra-chave ``class_``::
+
+    soup.find_all("a", class_="sister")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Assim como com qualquer argumento palavra-chave, você pode passar para ``class_``
+uma string, uma expressão regular (regex), uma função ou ``True``::
+
+    soup.find_all(class_=re.compile("itl"))
+    # [<p class="title"><b>The Dormouse's story</b></p>]
+
+    def has_six_characters(css_class):
+        return css_class is not None and len(css_class) == 6
+
+    soup.find_all(class_=has_six_characters)
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+:ref:`Lembre-se <multivalue>` de que uma tag pode ter múltiplos valores
+para seu atributo "class". Quando você busca por uma tag que tenha
+uma certa classe CSS, você está buscando correspondência com `qualquer uma`
+de suas classes CSS::
+
+    css_soup = BeautifulSoup('<p class="body strikeout"></p>')
+    css_soup.find_all("p", class_="strikeout")
+    # [<p class="body strikeout"></p>]
+
+    css_soup.find_all("p", class_="body")
+    # [<p class="body strikeout"></p>]
+
+Você pode também buscar pela string exata do valor de ``class``::
+
+    css_soup.find_all("p", class_="body strikeout")
+    # [<p class="body strikeout"></p>]
+
+Mas buscar por variações desta string não irá funcionar::
+
+    css_soup.find_all("p", class_="strikeout body")
+    # []
+
+Se você quiser buscar por tags que correspondam a duas ou mais classes CSS,
+você deverá utilizar um seletor CSS::
+
+    css_soup.select("p.strikeout.body")
+    # [<p class="body strikeout"></p>]
+
+Em versões mais antigas do Beautiful Soup, que não possuem o atalho ``class_``,
+você pode utilizar o truque ``attrs`` mencionado acima: crie um dicionário
+cujo valor para "class" seja a string (ou expressão regular, ou qualquer
+outra coisa) que você queira procurar::
+
+    soup.find_all("a", attrs={"class": "sister"})
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+.. _string:
+
+O argumento ``string``
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Com ``string`` você pode buscar por strings ao invés de tags. Assim como
+com ``name`` e os argumentos palavra-chave, você pode passar `uma string`_, `uma
+expressão regular`_, `uma lista`_, `uma função`_ ou
+:ref:`o valor True <the value True>`.
+Aqui estão alguns exemplos::
+
+    soup.find_all(string="Elsie")
+    # [u'Elsie']
+
+    soup.find_all(string=["Tillie", "Elsie", "Lacie"])
+    # [u'Elsie', u'Lacie', u'Tillie']
+
+    soup.find_all(string=re.compile("Dormouse"))
+    # [u"The Dormouse's story", u"The Dormouse's story"]
+
+    def is_the_only_string_within_a_tag(s):
+        """Return True if this string is the only child of its parent tag."""
+        return (s == s.parent.string)
+
+    soup.find_all(string=is_the_only_string_within_a_tag)
+    # [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...']
+
+Embora ``string`` sirva para encontrar strings, você pode combiná-lo com argumentos
+que encontram tags: o Beautiful Soup encontrará todas as tags cuja
+``.string`` corresponda ao seu valor para ``string``. O código seguinte encontra
+as tags <a> cuja ``.string`` é "Elsie"::
+
+    soup.find_all("a", string="Elsie")
+    # [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>]
+
+O argumento ``string`` é novo no Beautiful Soup 4.4.0. Em versões anteriores
+ele era chamado de ``text``::
+
+    soup.find_all("a", text="Elsie")
+    # [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>]
+
+.. _limit:
+
+O argumento ``limit``
+^^^^^^^^^^^^^^^^^^^^^^
+
+``find_all()`` retorna todas as tags e strings que correspondem aos seus
+filtros. Isso pode levar algum tempo se o documento for extenso. Se você
+não precisar de `todos` os resultados, você pode passar um número no argumento
+``limit``. Ele funciona assim como o parâmetro LIMIT utilizado em SQL:
+diz ao Beautiful Soup para parar de coletar resultados depois que atingir
+um certo número.
+
+Existem três links no documento "three sisters", mas este código encontra somente
+os dois primeiros::
+
+    soup.find_all("a", limit=2)
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+.. _recursive:
+
+O argumento ``recursive``
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Se você chamar ``mytag.find_all()``, o Beautiful Soup irá examinar todos os descendentes
+de ``mytag``: suas filhas, as filhas de suas filhas e assim por diante. Se você quiser que
+o Beautiful Soup considere apenas as filhas diretas, você pode passar ``recursive=False``.
+Veja a diferença aqui::
+
+    soup.html.find_all("title")
+    # [<title>The Dormouse's story</title>]
+
+    soup.html.find_all("title", recursive=False)
+    # []
+
+Aqui está o trecho correspondente do documento::
+
+    <html>
+     <head>
+      <title>
+       The Dormouse's story
+      </title>
+     </head>
+    ...
+
+A tag <title> está abaixo da tag <html>, mas não está `diretamente`
+abaixo dela: a tag <head> está no caminho entre as duas. O Beautiful Soup encontra a tag
+<title> quando pode olhar todos os descendentes de <html>, mas,
+quando ``recursive=False`` o restringe às filhas imediatas de <html>, ele não encontra nada.
+
+O Beautiful Soup oferece diversos métodos de busca na árvore (como vimos acima), e a maioria
+deles recebe os mesmos argumentos que ``find_all()``: ``name``,
+``attrs``, ``string``, ``limit`` e os argumentos palavra-chave. Mas o
+argumento ``recursive`` é diferente: ``find_all()`` e ``find()`` são
+os únicos métodos que o suportam. Passar ``recursive=False`` para um método
+como ``find_parents()`` não seria muito útil.
+
+Chamar uma tag é como chamar ``find_all()``
+--------------------------------------------
+
+Por ``find_all()`` ser o método mais popular da API de busca do
+Beautiful Soup, você pode usar um atalho para ele.
+Se você tratar
+o objeto ``BeautifulSoup`` ou um objeto ``Tag`` como se fosse uma
+função, é o mesmo que chamar ``find_all()`` naquele objeto.
+Estas duas linhas de código são equivalentes::
+
+    soup.find_all("a")
+    soup("a")
+
+Estas duas linhas também são equivalentes::
+
+    soup.title.find_all(string=True)
+    soup.title(string=True)
+
+``find()``
+----------
+
+Assinatura: find(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive
+<recursive>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+O método ``find_all()`` varre todo o documento em busca de resultados,
+mas algumas vezes você quer apenas um resultado. Se você sabe que
+o documento possui apenas uma tag <body>, é perda de tempo varrer todo
+o documento procurando por outras. Ao invés de passar ``limit=1``
+toda vez que chamar ``find_all()``, você pode usar o método ``find()``.
+Estas duas linhas de código são `quase` equivalentes::
+
+    soup.find_all('title', limit=1)
+    # [<title>The Dormouse's story</title>]
+
+    soup.find('title')
+    # <title>The Dormouse's story</title>
+
+A única diferença é que ``find_all()`` retorna uma lista contendo apenas
+um resultado, enquanto ``find()`` retorna diretamente o resultado.
+
+Se ``find_all()`` não encontrar nada, ele retornará uma lista vazia. Se
+``find()`` não encontrar nada, ele retornará ``None``::
+
+    print(soup.find("nosuchtag"))
+    # None
+
+Lembre-se do truque ``soup.head.title`` de `Navegar usando os nomes das tags`_?
+Aquele truque funciona chamando repetidamente ``find()``::
+
+    soup.head.title
+    # <title>The Dormouse's story</title>
+
+    soup.find("head").find("title")
+    # <title>The Dormouse's story</title>
+
+``find_parents()`` e ``find_parent()``
+----------------------------------------
+
+Assinatura: find_parents(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Assinatura: find_parent(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Levei muito tempo cobrindo ``find_all()`` e ``find()`` acima. A
+API do Beautiful Soup define dez outros métodos
+para buscas na árvore, mas não tenha medo: cinco destes métodos são
+basicamente o mesmo que ``find_all()``, e os outros cinco são basicamente
+o mesmo que ``find()``. A única diferença está em qual parte da árvore
+eles procuram.
+
+Primeiro, vamos considerar ``find_parents()`` e
+``find_parent()``. Lembre-se de que ``find_all()`` e ``find()`` descem
+pela árvore, procurando pelos descendentes de uma tag. Estes métodos
+fazem o contrário: eles `sobem` a árvore, procurando pelas `mães` de
+uma tag (ou string). Vamos experimentá-los, começando por uma string
+"enterrada" no documento "three daughters"::
+
+    a_string = soup.find(string="Lacie")
+    a_string
+    # u'Lacie'
+
+    a_string.find_parents("a")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+    a_string.find_parent("p")
+    # <p class="story">Once upon a time there were three little sisters; and their names were
+    #  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
+    #  and they lived at the bottom of a well.</p>
+
+    a_string.find_parents("p", class_="title")
+    # []
+
+Uma das três tags <a> é a mãe direta da string em
+questão, então nossa busca a encontra.
+Uma das três tags <p> é uma mãe
+indireta da string, e nossa busca também a encontra. Há uma tag <p> com
+a classe CSS "title" em algum lugar do documento, mas ela não é nenhuma das mães
+da string, portanto não podemos encontrá-la com ``find_parents()``.
+
+Você já deve ter feito a conexão entre ``find_parent()`` e
+``find_parents()`` e os atributos `.parent`_ e `.parents`_ mencionados
+anteriormente. A conexão é muito forte. Estes métodos de busca utilizam ``.parents``
+para iterar sobre todas as mães e comparam cada uma com o filtro passado,
+para verificar se ela corresponde.
+
+``find_next_siblings()`` e ``find_next_sibling()``
+----------------------------------------------------
+
+Assinatura: find_next_siblings(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Assinatura: find_next_sibling(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Estes métodos utilizam :ref:`.next_siblings <sibling-generators>` para
+iterar sobre o resto das irmãs de um elemento na árvore. O método
+``find_next_siblings()`` retorna todas as irmãs que correspondem ao filtro, e
+``find_next_sibling()`` retorna apenas a primeira::
+
+    first_link = soup.a
+    first_link
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+    first_link.find_next_siblings("a")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    first_story_paragraph = soup.find("p", "story")
+    first_story_paragraph.find_next_sibling("p")
+    # <p class="story">...</p>
+
+``find_previous_siblings()`` e ``find_previous_sibling()``
+------------------------------------------------------------
+
+Assinatura: find_previous_siblings(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Assinatura: find_previous_sibling(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Estes métodos utilizam :ref:`.previous_siblings <sibling-generators>` para iterar
+sobre as irmãs que precedem um elemento na árvore. O método ``find_previous_siblings()``
+retorna todas as irmãs que correspondem ao filtro, e
+``find_previous_sibling()`` retorna apenas a primeira::
+
+    last_link = soup.find("a", id="link3")
+    last_link
+    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+    last_link.find_previous_siblings("a")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+    first_story_paragraph = soup.find("p", "story")
+    first_story_paragraph.find_previous_sibling("p")
+    # <p class="title"><b>The Dormouse's story</b></p>
+
+
+``find_all_next()`` e ``find_next()``
+---------------------------------------
+
+Assinatura: find_all_next(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Assinatura: find_next(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Estes métodos utilizam :ref:`.next_elements <element-generators>` para
+iterar sobre as tags e strings que aparecem depois do elemento atual no documento.
+
+O método ``find_all_next()`` retorna todas as correspondências, e
+``find_next()`` retorna somente a primeira::
+
+    first_link = soup.a
+    first_link
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+    first_link.find_all_next(string=True)
+    # [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
+    #  u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n']
+
+    first_link.find_next("p")
+    # <p class="story">...</p>
+
+No primeiro exemplo, a string "Elsie" foi encontrada, mesmo estando contida
+dentro da tag <a> de onde começamos. No segundo exemplo, a última tag <p> do
+documento foi encontrada, mesmo não estando na mesma parte da árvore que a tag <a>
+de onde começamos. Para estes métodos, tudo o que importa é que um elemento
+corresponda ao filtro e esteja depois do elemento de início no documento.
+
+``find_all_previous()`` e ``find_previous()``
+-----------------------------------------------
+
+Assinatura: find_all_previous(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Assinatura: find_previous(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Estes métodos utilizam :ref:`.previous_elements <element-generators>` para
+iterar sobre as tags e strings que aparecem antes do elemento atual no documento.
+O método ``find_all_previous()`` retorna todas as correspondências, e
+``find_previous()`` retorna apenas a primeira::
+
+    first_link = soup.a
+    first_link
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+    first_link.find_all_previous("p")
+    # [<p class="story">Once upon a time there were three little sisters; ...</p>,
+    #  <p class="title"><b>The Dormouse's story</b></p>]
+
+    first_link.find_previous("title")
+    # <title>The Dormouse's story</title>
+
+A chamada a ``find_all_previous("p")`` encontrou não só o
+primeiro parágrafo do documento (o que possui class="title"), mas também o
+segundo parágrafo, a tag <p> que contém a tag <a> de onde começamos.
+Isso não deveria ser tão surpreendente: estamos olhando para todas as tags
+que apareceram anteriormente no documento, incluindo aquela de onde começamos. Uma
+tag <p> que contém uma tag <a> tem que aparecer antes da tag <a> que ela contém.
+
+Seletores CSS
+-------------
+
+A partir da versão 4.7.0, o Beautiful Soup suporta a maior parte dos seletores CSS4
+através do projeto `SoupSieve <https://facelessuser.github.io/soupsieve/>`_. Se você
+instalou o Beautiful Soup através do ``pip``, o SoupSieve foi instalado ao mesmo tempo,
+portanto você não precisa realizar nenhuma etapa adicional.
+
+``BeautifulSoup`` possui um método ``.select()``, o qual utiliza o SoupSieve para
+executar um seletor CSS sobre o documento analisado e retornar todos os
+elementos correspondentes. ``Tag`` possui um método similar, que executa um
+seletor CSS sobre o conteúdo de uma única tag.
+
+(Versões anteriores do Beautiful Soup também possuem o método ``.select()``,
+mas somente os seletores CSS mais populares são suportados.)
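+
+Um esboço rápido (usando o documento "three sisters" acima) do ``select()``
+chamado sobre uma única tag, e não sobre o documento inteiro::
+
+    soup.p.select("b")   # o seletor é executado apenas dentro da primeira tag <p>
+    # [<b>The Dormouse's story</b>]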
+
+A `documentação <https://facelessuser.github.io/soupsieve/>`_ do SoupSieve
+lista todos os seletores suportados atualmente, mas aqui estão alguns dos
+básicos:
+
+Você pode encontrar tags::
+
+    soup.select("title")
+    # [<title>The Dormouse's story</title>]
+
+    soup.select("p:nth-of-type(3)")
+    # [<p class="story">...</p>]
+
+Encontrar tags aninhadas dentro de outras::
+
+    soup.select("body a")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.select("html head title")
+    # [<title>The Dormouse's story</title>]
+
+Encontrar tags `diretamente` abaixo de outras tags no aninhamento::
+
+    soup.select("head > title")
+    # [<title>The Dormouse's story</title>]
+
+    soup.select("p > a")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.select("p > a:nth-of-type(2)")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+    soup.select("p > #link1")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+    soup.select("body > a")
+    # []
+
+Encontrar as irmãs de alguma tag::
+
+    soup.select("#link1 ~ .sister")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.select("#link1 + .sister")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+Encontrar tags pela classe CSS::
+
+    soup.select(".sister")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.select("[class~=sister]")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Encontrar tags pelo ID::
+
+    soup.select("#link1")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+    soup.select("a#link2")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+Encontrar tags que correspondam a qualquer seletor de uma lista de seletores::
+
+    soup.select("#link1,#link2")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+Testar a existência de um atributo::
+
+    soup.select('a[href]')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Encontrar tags pelo valor de um atributo::
+
+    soup.select('a[href="http://example.com/elsie"]')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+    soup.select('a[href^="http://example.com/"]')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.select('a[href$="tillie"]')
+    # [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.select('a[href*=".com/el"]')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+Há outro método, chamado ``select_one()``, o qual encontra somente
+a primeira tag que corresponde a um seletor::
+
+    soup.select_one(".sister")
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+Se você analisou um XML que define namespaces, você pode utilizá-los nos
+seletores CSS::
+
+    from bs4 import BeautifulSoup
+    xml = """<tag xmlns:ns1="http://namespace1/" xmlns:ns2="http://namespace2/">
+     <ns1:child>I'm in namespace 1</ns1:child>
+     <ns2:child>I'm in namespace 2</ns2:child>
+    </tag> """
+    soup = BeautifulSoup(xml, "xml")
+
+    soup.select("child")
+    # [<ns1:child>I'm in namespace 1</ns1:child>, <ns2:child>I'm in namespace 2</ns2:child>]
+
+    soup.select("ns1|child")
+    # [<ns1:child>I'm in namespace 1</ns1:child>]
+
+Ao manipular um seletor CSS que utiliza namespaces, o Beautiful Soup
+utiliza as abreviações de namespace que encontrou enquanto analisava
+o documento. Você pode substituí-las passando um dicionário com suas
+próprias abreviações::
+
+    namespaces = dict(first="http://namespace1/", second="http://namespace2/")
+    soup.select("second|child", namespaces=namespaces)
+    # [<ns2:child>I'm in namespace 2</ns2:child>]
+
+Todo este negócio de seletores CSS é uma conveniência para quem
+já conhece a sintaxe de seletores CSS.
+Você pode fazer tudo isso com a API do Beautiful Soup.
+E, se seletores CSS forem tudo o que você precisa, você deveria
+analisar o documento com o lxml: é muito mais rápido. Mas isto permite
+`combinar` seletores CSS com a API do Beautiful Soup.
+
+Modificando a árvore
+====================
+
+O principal poder do Beautiful Soup está na busca pela árvore, mas você
+pode também modificar a árvore e escrever suas modificações como um novo
+documento HTML ou XML.
+
+Alterando nomes de tags e atributos
+-----------------------------------
+
+Cobri este assunto anteriormente em `Atributos`_, mas vale a pena repetir. Você
+pode renomear uma tag, alterar os valores de seus atributos, adicionar novos
+atributos e deletar qualquer um deles::
+
+    soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
+    tag = soup.b
+
+    tag.name = "blockquote"
+    tag['class'] = 'verybold'
+    tag['id'] = 1
+    tag
+    # <blockquote class="verybold" id="1">Extremely bold</blockquote>
+
+    del tag['class']
+    del tag['id']
+    tag
+    # <blockquote>Extremely bold</blockquote>
+
+Modificando ``.string``
+-----------------------
+
+Se você definir o atributo ``.string`` de uma tag, o conteúdo da
+tag será substituído pela string que você passou::
+
+    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+    soup = BeautifulSoup(markup)
+
+    tag = soup.a
+    tag.string = "New link text."
+    tag
+    # <a href="http://example.com/">New link text.</a>
+
+Cuidado: se a tag contiver outras tags, elas e todo o seu conteúdo
+serão destruídos.
+
+``append()``
+------------
+
+Você pode adicionar algo ao conteúdo de uma tag com ``Tag.append()``.
+Funciona
+da mesma maneira que o ``.append()`` de uma lista Python::
+
+    soup = BeautifulSoup("<a>Foo</a>")
+    soup.a.append("Bar")
+
+    soup
+    # <html><head></head><body><a>FooBar</a></body></html>
+    soup.a.contents
+    # [u'Foo', u'Bar']
+
+``extend()``
+------------
+
+A partir do Beautiful Soup 4.7.0, ``Tag`` também suporta um método chamado
+``.extend()``, o qual funciona da mesma maneira que chamar ``.extend()`` em
+uma lista::
+
+    soup = BeautifulSoup("<a>Soup</a>")
+    soup.a.extend(["'s", " ", "on"])
+
+    soup
+    # <html><head></head><body><a>Soup's on</a></body></html>
+    soup.a.contents
+    # [u'Soup', u"'s", u' ', u'on']
+
+``NavigableString()`` e ``.new_tag()``
+-------------------------------------------------
+
+Se você precisar adicionar uma string a um documento, sem problema -- você
+pode passar uma string Python para ``append()``, ou pode chamar
+o construtor ``NavigableString``::
+
+    from bs4 import NavigableString
+
+    soup = BeautifulSoup("<b></b>")
+    tag = soup.b
+    tag.append("Hello")
+    new_string = NavigableString(" there")
+    tag.append(new_string)
+    tag
+    # <b>Hello there</b>
+    tag.contents
+    # [u'Hello', u' there']
+
+Se você quiser criar um comentário ou alguma outra subclasse de
+``NavigableString``, apenas chame o construtor::
+
+    from bs4 import Comment
+    new_comment = Comment("Nice to see you.")
+    tag.append(new_comment)
+    tag
+    # <b>Hello there<!--Nice to see you.--></b>
+    tag.contents
+    # [u'Hello', u' there', u'Nice to see you.']
+
+(Esta é uma funcionalidade nova no Beautiful Soup 4.4.0.)
+
+E se você precisar criar uma nova tag? A melhor solução
+é chamar o método ``BeautifulSoup.new_tag()``::
+
+    soup = BeautifulSoup("<b></b>")
+    original_tag = soup.b
+
+    new_tag = soup.new_tag("a", href="http://www.example.com")
+    original_tag.append(new_tag)
+    original_tag
+    # <b><a href="http://www.example.com"></a></b>
+
+    new_tag.string = "Link text."
+    original_tag
+    # <b><a href="http://www.example.com">Link text.</a></b>
+
+Somente o primeiro argumento (o nome da tag) é obrigatório.
+
+``insert()``
+------------
+
+``Tag.insert()`` funciona assim como ``Tag.append()``, exceto que o novo elemento
+não vai necessariamente para o final do ``.contents`` de sua tag mãe. Ele será
+inserido na posição numérica que você informar.
+Funciona assim como o ``.insert()`` de uma lista Python::
+
+    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+    soup = BeautifulSoup(markup)
+    tag = soup.a
+
+    tag.insert(1, "but did not endorse ")
+    tag
+    # <a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>
+    tag.contents
+    # [u'I linked to ', u'but did not endorse ', <i>example.com</i>]
+
+``insert_before()`` e ``insert_after()``
+------------------------------------------
+
+O método ``insert_before()`` insere tags ou strings imediatamente antes de algo
+na árvore::
+
+    soup = BeautifulSoup("<b>stop</b>")
+    tag = soup.new_tag("i")
+    tag.string = "Don't"
+    soup.b.string.insert_before(tag)
+    soup.b
+    # <b><i>Don't</i>stop</b>
+
+O método ``insert_after()`` insere tags ou strings imediatamente após algo
+na árvore::
+
+    div = soup.new_tag('div')
+    div.string = 'ever'
+    soup.b.i.insert_after(" you ", div)
+    soup.b
+    # <b><i>Don't</i> you <div>ever</div> stop</b>
+    soup.b.contents
+    # [<i>Don't</i>, u' you ', <div>ever</div>, u'stop']
+
+``clear()``
+-----------
+
+``Tag.clear()`` remove o conteúdo de uma tag::
+
+    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+    soup = BeautifulSoup(markup)
+    tag = soup.a
+
+    tag.clear()
+    tag
+    # <a href="http://example.com/"></a>
+
+``extract()``
+-------------
+
+``PageElement.extract()`` remove uma tag ou string da árvore. Ele retorna
+a tag ou string que foi extraída::
+
+    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+    soup = BeautifulSoup(markup)
+    a_tag = soup.a
+
+    i_tag = soup.i.extract()
+
+    a_tag
+    # <a href="http://example.com/">I linked to</a>
+
+    i_tag
+    # <i>example.com</i>
+
+    print(i_tag.parent)
+    # None
+
+Neste ponto você efetivamente tem duas árvores de análise: uma baseada no objeto
+``BeautifulSoup`` que você usou para analisar o documento, e outra baseada na tag que foi
+extraída. Você pode também chamar ``extract()`` em um filho do elemento que você extraiu::
+
+    my_string = i_tag.string.extract()
+    my_string
+    # u'example.com'
+
+    print(my_string.parent)
+    # None
+    i_tag
+    # <i></i>
+
+
+``decompose()``
+---------------
+
+``Tag.decompose()`` remove uma tag da árvore e a destrói `completamente`,
+junto com seu conteúdo::
+
+    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+    soup = BeautifulSoup(markup)
+    a_tag = soup.a
+
+    soup.i.decompose()
+
+    a_tag
+    # <a href="http://example.com/">I linked to</a>
+
+
+.. _replace_with():
+
+``replace_with()``
+------------------
+
+``PageElement.replace_with()`` remove uma tag ou string da árvore e
+a substitui pela tag ou string que você escolher::
+
+    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+    soup = BeautifulSoup(markup)
+    a_tag = soup.a
+
+    new_tag = soup.new_tag("b")
+    new_tag.string = "example.net"
+    a_tag.i.replace_with(new_tag)
+
+    a_tag
+    # <a href="http://example.com/">I linked to <b>example.net</b></a>
+
+``replace_with()`` retorna a tag ou string que foi substituída, para que você possa
+examiná-la ou adicioná-la novamente em outra parte da árvore.
+
+``wrap()``
+----------
+
+``PageElement.wrap()`` envolve um elemento na tag que você especificar. Ele
+retorna o novo invólucro::
+
+    soup = BeautifulSoup("<p>I wish I was bold.</p>")
+    soup.p.string.wrap(soup.new_tag("b"))
+    # <b>I wish I was bold.</b>
+
+    soup.p.wrap(soup.new_tag("div"))
+    # <div><p><b>I wish I was bold.</b></p></div>
+
+Este método é novo no Beautiful Soup 4.0.5.
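+
+Um esboço de uso (supondo o parser ``html.parser``) que combina ``wrap()``
+com o gerador ``.strings`` visto anteriormente, para envolver cada string do
+documento em uma nova tag <b>. Note a cópia para uma lista antes do laço,
+já que ``wrap()`` modifica a árvore durante a iteração::
+
+    soup = BeautifulSoup("<p>One</p><p>Two</p>", 'html.parser')
+    for s in list(soup.strings):
+        s.wrap(soup.new_tag("b"))
+    soup
+    # <p><b>One</b></p><p><b>Two</b></p>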
+
+``unwrap()``
+---------------------------
+
+``Tag.unwrap()`` is the opposite of ``wrap()``. It replaces a tag with
+whatever's inside that tag. It's good for stripping out markup::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ a_tag.i.unwrap()
+ a_tag
+ # <a href="http://example.com/">I linked to example.com</a>
+
+Like ``replace_with()``, ``unwrap()`` returns the tag that was
+replaced.
+
+``smooth()``
+---------------------------
+
+After calling a bunch of methods that modify the parse tree, you may end up with two or more ``NavigableString`` objects right next to each other. Beautiful Soup has no problem with this, but since it can't happen in a freshly parsed document, you might not expect behavior like the following::
+
+ soup = BeautifulSoup("<p>A one</p>")
+ soup.p.append(", a two")
+
+ soup.p.contents
+ # [u'A one', u', a two']
+
+ print(soup.p.encode())
+ # <p>A one, a two</p>
+
+ print(soup.p.prettify())
+ # <p>
+ #  A one
+ #  , a two
+ # </p>
+
+You can call ``Tag.smooth()`` to clean up the parse tree by consolidating adjacent strings::
+
+ soup.smooth()
+
+ soup.p.contents
+ # [u'A one, a two']
+
+ print(soup.p.prettify())
+ # <p>
+ #  A one, a two
+ # </p>
+
+The ``smooth()`` method is new in Beautiful Soup 4.8.0.
+
+Output
+======
+
+.. _.prettyprinting:
+
+Pretty-printing
+---------------
+
+The ``prettify()`` method will turn a Beautiful Soup parse tree into a
+nicely formatted Unicode string, with a separate line for each tag and
+each string::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ soup.prettify()
+ # '<html>\n <head>\n </head>\n <body>\n  <a href="http://example.com/">\n...'
+
+ print(soup.prettify())
+ # <html>
+ #  <head>
+ #  </head>
+ #  <body>
+ #   <a href="http://example.com/">
+ #    I linked to
+ #    <i>
+ #     example.com
+ #    </i>
+ #   </a>
+ #  </body>
+ # </html>
+
+You can call ``prettify()`` on the top-level ``BeautifulSoup`` object,
+or on any of its ``Tag`` objects::
+
+ print(soup.a.prettify())
+ # <a href="http://example.com/">
+ #  I linked to
+ #  <i>
+ #   example.com
+ #  </i>
+ # </a>
+
+Non-pretty printing
+-------------------
+
+If you just want a string, with no fancy formatting, you can call
+``unicode()`` or ``str()`` on the ``BeautifulSoup`` object, or on a
+``Tag`` within it::
+
+ str(soup)
+ # '<html><head></head><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>'
+
+ unicode(soup.a)
+ # u'<a href="http://example.com/">I linked to <i>example.com</i></a>'
+
+The ``str()`` function returns a string encoded in UTF-8. See
+`Encoding`_ for other options.
+
+You can also call ``encode()`` to get a bytestring, and ``decode()``
+to get Unicode.
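+
+To make that last point concrete, here is a small sketch of the round
+trip, under the same setup as the examples above (the exact output
+shown assumes the default UTF-8 encoding)::
+
+ # encode() gives you bytes; decode() gives you a Unicode string.
+ soup.a.encode()
+ # '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+
+ soup.a.decode()
+ # u'<a href="http://example.com/">I linked to <i>example.com</i></a>'
+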
.. _output_formatters:
+
+Output formatters
+-----------------
+
+If you give Beautiful Soup a document that contains HTML entities like
+"&ldquo;", they'll be converted to Unicode characters::
+
+ soup = BeautifulSoup("&ldquo;Dammit!&rdquo; he said.")
+ unicode(soup)
+ # u'<html><head></head><body>\u201cDammit!\u201d he said.</body></html>'
+
+If you then convert the document to a string, the Unicode characters
+will be encoded as UTF-8. You won't get the HTML entities back::
+
+ str(soup)
+ # '<html><head></head><body>\xe2\x80\x9cDammit!\xe2\x80\x9d he said.</body></html>'
+
+By default, the only characters that are escaped upon output are bare
+ampersands and angle brackets. These get turned into "&amp;", "&lt;",
+and "&gt;", so that Beautiful Soup doesn't inadvertently generate
+invalid HTML or XML::
+
+ soup = BeautifulSoup("<p>The law firm of Dewey, Cheatem, & Howe</p>")
+ soup.p
+ # <p>The law firm of Dewey, Cheatem, &amp; Howe</p>
+
+ soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
+ soup.a
+ # <a href="http://example.com/?foo=val1&amp;bar=val2">A link</a>
+
+You can change this behavior by providing a value for the
+``formatter`` argument to ``prettify()``, ``encode()``, or
+``decode()``. Beautiful Soup recognizes five possible values for
+``formatter``.
+
+The default is ``formatter="minimal"``. Strings will only be processed
+enough to ensure that Beautiful Soup generates valid HTML/XML::
+
+ french = "<p>Il a dit <<Sacré bleu!>></p>"
+ soup = BeautifulSoup(french)
+ print(soup.prettify(formatter="minimal"))
+ # <html>
+ #  <body>
+ #   <p>
+ #    Il a dit &lt;&lt;Sacré bleu!&gt;&gt;
+ #   </p>
+ #  </body>
+ # </html>
+
+If you pass in ``formatter="html"``, Beautiful Soup will convert
+Unicode characters to HTML entities whenever possible::
+
+ print(soup.prettify(formatter="html"))
+ # <html>
+ #  <body>
+ #   <p>
+ #    Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
+ #   </p>
+ #  </body>
+ # </html>
+
+If you pass in ``formatter="html5"``, it's the same as
+``formatter="html"``, but Beautiful Soup will omit the closing slash
+in HTML void tags like "br"::
+
+ soup = BeautifulSoup("<br>")
+
+ print(soup.encode(formatter="html"))
+ # <html><body><br/></body></html>
+
+ print(soup.encode(formatter="html5"))
+ # <html><body><br></body></html>
+
+If you pass in ``formatter=None``, Beautiful Soup will not modify
+strings at all on output. This is the fastest option, but it may lead
+to Beautiful Soup generating invalid HTML/XML, as in these examples::
+
+ print(soup.prettify(formatter=None))
+ # <html>
+ #  <body>
+ #   <p>
+ #    Il a dit <<Sacré bleu!>>
+ #   </p>
+ #  </body>
+ # </html>
+
+ link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
+ print(link_soup.a.encode(formatter=None))
+ # <a href="http://example.com/?foo=val1&bar=val2">A link</a>
+
+If you need more sophisticated control over your output, you can use
+Beautiful Soup's ``Formatter`` class. Here's a formatter that converts
+strings to uppercase, whether they occur in a text node or in an
+attribute value::
+
+ from bs4.formatter import HTMLFormatter
+ def uppercase(str):
+     return str.upper()
+ formatter = HTMLFormatter(uppercase)
+
+ print(soup.prettify(formatter=formatter))
+ # <html>
+ #  <body>
+ #   <p>
+ #    IL A DIT <<SACRÉ BLEU!>>
+ #   </p>
+ #  </body>
+ # </html>
+
+ print(link_soup.a.prettify(formatter=formatter))
+ # <a href="HTTP://EXAMPLE.COM/?FOO=VAL1&BAR=VAL2">
+ #  A LINK
+ # </a>
+
+Subclassing ``HTMLFormatter`` or ``XMLFormatter`` will give you even
+more control over the output. For example, Beautiful Soup sorts the
+attributes in every tag by default::
+
+ attr_soup = BeautifulSoup(b'<p z="1" m="2" a="3"></p>')
+ print(attr_soup.p.encode())
+ # <p a="3" m="2" z="1"></p>
+
+To turn this off, you can subclass the ``Formatter.attributes()``
+method, which controls which attributes are output and in what
+order.
This
+implementation also filters out the attribute called "m" whenever it
+appears::
+
+ class UnsortedAttributes(HTMLFormatter):
+     def attributes(self, tag):
+         for k, v in tag.attrs.items():
+             if k == 'm':
+                 continue
+             yield k, v
+ print(attr_soup.p.encode(formatter=UnsortedAttributes()))
+ # <p z="1" a="3"></p>
+
+One last caveat: if you create a ``CData`` object, the text inside
+that object is always presented `exactly as it appears, with no
+formatting`. Beautiful Soup will call your entity substitution
+function, just in case you've written a custom function that counts
+all the strings in the document or something, but it will ignore the
+return value::
+
+ from bs4.element import CData
+ soup = BeautifulSoup("<a></a>")
+ soup.a.string = CData("one < three")
+ print(soup.a.prettify(formatter="xml"))
+ # <a>
+ #  <![CDATA[one < three]]>
+ # </a>
+
+
+``get_text()``
+--------------
+
+If you only want the text contained in a document or tag, you can use
+the ``get_text()`` method. It returns all the text in a document or
+beneath a tag, as a single Unicode string::
+
+ markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
+ soup = BeautifulSoup(markup)
+
+ soup.get_text()
+ # u'\nI linked to example.com\n'
+ soup.i.get_text()
+ # u'example.com'
+
+You can specify a string to be used to join the bits of text
+together::
+
+ soup.get_text("|")
+ # u'\nI linked to |example.com|\n'
+
+You can tell Beautiful Soup to strip whitespace from the beginning and
+end of each bit of text::
+
+ soup.get_text("|", strip=True)
+ # u'I linked to|example.com'
+
+But at that point you might want to use the
+:ref:`.stripped_strings <string-generators>` generator instead, and
+process the text yourself::
+
+ [text for text in soup.stripped_strings]
+ # [u'I linked to', u'example.com']
+
+Specifying the parser to use
+================================================
+
+If you just need to parse some HTML, you can dump the markup into the
+``BeautifulSoup`` constructor, and it'll probably be fine. Beautiful
+Soup will pick a parser for you and parse the data. But there are a
+few additional arguments you can pass in to the constructor to change
+which parser is used.
+
+The first argument to the ``BeautifulSoup`` constructor is a string or
+a variable holding the content you want parsed. The second argument is
+`how` you'd like that content parsed.
+
+If you don't specify anything, you'll get the best HTML parser that's
+installed. Beautiful Soup ranks lxml's parser as being the best, then
+html5lib's, then Python's built-in parser. You can override this by
+specifying one of the following:
+
+* What type of markup you want to parse. Currently supported are
+  "html", "xml", and "html5".
+* The name of the parser library you want to use. Currently supported
+  options are "lxml", "html5lib", and "html.parser" (Python's built-in
+  parser).
+
+The section :ref:`Installing a parser <parser-installation>` contrasts
+the supported parsers.
+
+If you don't have an appropriate parser installed, Beautiful Soup will
+ignore your request and pick a different parser. Right now, the only
+supported XML parser is lxml. If you don't have lxml installed, asking
+for an XML parser won't give you one, and asking for "lxml" won't work
+either.
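+
+As a quick sketch of the two styles (this assumes the optional parsers
+happen to be installed; with only the standard library you'd get
+Python's built-in parser either way)::
+
+ markup = "<a><b /></a>"
+
+ # Ask for a type of markup; Beautiful Soup picks the best installed
+ # parser that can handle it.
+ soup = BeautifulSoup(markup, "html")
+
+ # Ask for a specific parser library by name.
+ soup = BeautifulSoup(markup, "html.parser")
+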
.. _differences-between-parsers:
+
+Differences between parsers
+---------------------------------------------
+
+Beautiful Soup presents the same interface to a number of different
+parsers, but each parser is different. Different parsers will create
+different parse trees from the same document. The biggest differences
+are between the HTML parsers and the XML parsers. Here's a short
+document, parsed as HTML::
+
+ BeautifulSoup("<a><b /></a>")
+ # <html><head></head><body><a><b></b></a></body></html>
+
+Since an empty <b /> tag is not valid HTML, the parser turns it into a
+<b></b> tag pair.
+
+Here's the same document parsed as XML (running this requires that you
+have lxml installed). Note that the empty <b /> tag is left alone, and
+that the document is given an XML declaration instead of being put
+into an <html> tag::
+
+ BeautifulSoup("<a><b /></a>", "xml")
+ # <?xml version="1.0" encoding="utf-8"?>
+ # <a><b/></a>
+
+There are also differences between HTML parsers. If you give Beautiful
+Soup a perfectly-formed HTML document, these differences won't
+matter. One parser will be faster than another, but they'll all give
+you a data structure that looks exactly like the original HTML
+document.
+
+But if the document is not perfectly-formed, different parsers will
+give different results. Here's a short, invalid document parsed using
+lxml's HTML parser. Note that the dangling </p> tag is simply
+ignored::
+
+ BeautifulSoup("<a></p>", "lxml")
+ # <html><body><a></a></body></html>
+
+Here's the same document parsed using html5lib::
+
+ BeautifulSoup("<a></p>", "html5lib")
+ # <html><head></head><body><a><p></p></a></body></html>
+
+Instead of ignoring the dangling </p> tag, html5lib pairs it with an
+opening <p> tag. This parser also adds an empty <head> tag to the
+document.
+
+Here's the same document parsed with Python's built-in HTML parser::
+
+ BeautifulSoup("<a></p>", "html.parser")
+ # <a></a>
+
+Like lxml, this parser ignores the closing </p> tag. Unlike html5lib,
+it makes no attempt to create a well-formed HTML document by adding a
+<body> tag; unlike lxml, it doesn't even bother to add an <html> tag.
+
+Since the document "<a></p>" is invalid, none of these techniques is
+the "correct" way to handle it. The html5lib parser uses techniques
+that are part of the HTML5 standard, so it has the best claim on being
+the "correct" way, but all three techniques are legitimate.
+
+Differences between parsers can affect your script. If you're planning
+on distributing your script to other people, or running it on multiple
+machines, you should specify a parser in the ``BeautifulSoup``
+constructor. That will reduce the chances that your users parse a
+document differently from the way you parse it.
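+
+A quick way to see how the parsers you have installed disagree on a
+given snippet is to run it through each of them in a loop. This is
+just a sketch, and it assumes lxml and html5lib are both installed;
+the output shown is what the examples above lead me to expect::
+
+ for features in ["lxml", "html5lib", "html.parser"]:
+     # Each parser builds its own tree from the same invalid markup.
+     print(features, BeautifulSoup("<a></p>", features))
+ # lxml <html><body><a></a></body></html>
+ # html5lib <html><head></head><body><a><p></p></a></body></html>
+ # html.parser <a></a>
+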
Encoding
+========
+
+Any HTML or XML document is written in a specific encoding like ASCII
+or UTF-8. But when you load that document into Beautiful Soup, you'll
+discover it's been converted to Unicode::
+
+ markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
+ soup = BeautifulSoup(markup)
+ soup.h1
+ # <h1>Sacré bleu!</h1>
+ soup.h1.string
+ # u'Sacr\xe9 bleu!'
+
+It's not magic. (It sure would be nice.) Beautiful Soup uses a
+sub-library called `Unicode, Dammit`_ to detect a document's encoding
+and convert it to Unicode. The autodetected encoding is available as
+the ``.original_encoding`` attribute of the ``BeautifulSoup`` object::
+
+ soup.original_encoding
+ # 'utf-8'
+
+`Unicode, Dammit` guesses correctly most of the time, but sometimes it
+makes mistakes. Sometimes it guesses correctly, but only after a
+byte-by-byte search of the document that takes a very long time. If
+you happen to know a document's encoding ahead of time, you can avoid
+mistakes and delays by passing it to the ``BeautifulSoup`` constructor
+as ``from_encoding``.
+
+Here's a document written in ISO-8859-8. The document is so short that
+`Unicode, Dammit` can't get a good lock on it, and misidentifies it as
+ISO-8859-7::
+
+ markup = b"<h1>\xed\xe5\xec\xf9</h1>"
+ soup = BeautifulSoup(markup)
+ soup.h1
+ # <h1>νεμω</h1>
+ soup.original_encoding
+ # 'ISO-8859-7'
+
+We can fix this by passing in the correct encoding with
+``from_encoding``::
+
+ soup = BeautifulSoup(markup, from_encoding="iso-8859-8")
+ soup.h1
+ # <h1>םולש</h1>
+ soup.original_encoding
+ # 'iso8859-8'
+
+If you don't know the correct encoding, but you know that `Unicode,
+Dammit` is guessing wrong, you can pass the wrong guesses in as
+``exclude_encodings``::
+
+ soup = BeautifulSoup(markup, exclude_encodings=["ISO-8859-7"])
+ soup.h1
+ # <h1>םולש</h1>
+ soup.original_encoding
+ # 'WINDOWS-1255'
+
+Windows-1255 isn't 100% correct, but it's a compatible superset of
+ISO-8859-8, so it's close enough. (``exclude_encodings`` is a new
+feature in Beautiful Soup 4.4.0.)
+
+In rare cases (usually when a UTF-8 document contains text written in
+a completely different encoding), the only way to get Unicode may be
+to replace some characters with the special Unicode character
+"REPLACEMENT CHARACTER" (U+FFFD, �). If `Unicode, Dammit` needs to do
+this, it will set the ``.contains_replacement_characters`` attribute
+to ``True`` on the ``UnicodeDammit`` or ``BeautifulSoup`` object. This
+lets you know that the Unicode representation is not an exact
+representation of the original -- some data was lost. If a document
+contains �, but ``.contains_replacement_characters`` is ``False``,
+you'll know that the � was there originally and doesn't represent
+missing data.
+
+Output encoding
+--------------------
+
+When you write out a document from Beautiful Soup, you get a UTF-8
+document, even if the document wasn't in UTF-8 to begin with. Here's a
+document written in the Latin-1 encoding::
+
+ markup = b'''
+  <html>
+   <head>
+    <meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
+   </head>
+   <body>
+    <p>Sacr\xe9 bleu!</p>
+   </body>
+  </html>
+ '''
+
+ soup = BeautifulSoup(markup)
+ print(soup.prettify())
+ # <html>
+ #  <head>
+ #   <meta content="text/html; charset=utf-8" http-equiv="Content-type" />
+ #  </head>
+ #  <body>
+ #   <p>
+ #    Sacré bleu!
+ #   </p>
+ #  </body>
+ # </html>
+
+Note that the <meta> tag has been rewritten to reflect the fact that
+the document is now in UTF-8.
+
+If you don't want UTF-8, you can pass the encoding you do want into
+``prettify()``::
+
+ print(soup.prettify("latin-1"))
+ # <html>
+ #  <head>
+ #   <meta content="text/html; charset=latin-1" http-equiv="Content-type" />
+ # ...
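+
+If you want to put the re-encoded document on disk, keep in mind that
+``prettify()`` returns a bytestring once you pass it an encoding, so
+the file should be opened in binary mode. A minimal sketch (the
+filename is just an illustration)::
+
+ # prettify("latin-1") returns bytes, hence the "wb" mode.
+ with open("example.html", "wb") as fp:
+     fp.write(soup.prettify("latin-1"))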
+
+You can also call ``encode()`` on the ``BeautifulSoup`` object, or on
+any element in the soup, just as if it were a Python string::
+
+ soup.p.encode("latin-1")
+ # '<p>Sacr\xe9 bleu!</p>'
+
+ soup.p.encode("utf-8")
+ # '<p>Sacr\xc3\xa9 bleu!</p>'
+
+Any characters that can't be represented in your chosen encoding will
+be converted to numeric XML entity references. Here's a document that
+includes the Unicode character SNOWMAN::
+
+ markup = u"<b>\N{SNOWMAN}</b>"
+ snowman_soup = BeautifulSoup(markup)
+ tag = snowman_soup.b
+
+The SNOWMAN character can be part of a UTF-8 document (it looks like
+☃), but there's no representation for that character in ISO-Latin-1 or
+ASCII, so it's converted to "&#9731;" for those encodings::
+
+ print(tag.encode("utf-8"))
+ # <b>☃</b>
+
+ print(tag.encode("latin-1"))
+ # <b>&#9731;</b>
+
+ print(tag.encode("ascii"))
+ # <b>&#9731;</b>
+
+Unicode, Dammit
+---------------
+
+You can use `Unicode, Dammit` without using Beautiful Soup. It's
+useful whenever you have data in an unknown encoding and you just want
+it to become Unicode::
+
+ from bs4 import UnicodeDammit
+ dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
+ print(dammit.unicode_markup)
+ # Sacré bleu!
+ dammit.original_encoding
+ # 'utf-8'
+
+
+`Unicode, Dammit`'s guesses will get a lot more accurate if you
+install the ``chardet`` or ``cchardet`` libraries. The more data you
+give `Unicode, Dammit`, the more accurately it will guess. If you have
+your own suspicions as to what the original encoding might be, you can
+pass them in as a list::
+
+ dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
+ print(dammit.unicode_markup)
+ # Sacré bleu!
+ dammit.original_encoding
+ # 'latin-1'
+
+`Unicode, Dammit` has two special features that Beautiful Soup doesn't
+use.
+
+Smart quotes
+^^^^^^^^^^^^
+
+You can use `Unicode, Dammit` to convert Microsoft smart quotes to
+HTML or XML entities::
+
+ markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
+ # u'<p>I just &ldquo;love&rdquo; Microsoft Word&rsquo;s smart quotes</p>'
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup
+ # u'<p>I just &#x201C;love&#x201D; Microsoft Word&#x2019;s smart quotes</p>'
+
+You can also convert Microsoft smart quotes to ASCII quotes::
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii").unicode_markup
+ # u'<p>I just "love" Microsoft Word\'s smart quotes</p>'
+
+Hopefully you'll find these features useful, but Beautiful Soup
+doesn't use them. Beautiful Soup prefers the default behavior, which
+is to convert Microsoft smart quotes to Unicode characters along with
+everything else::
+
+ UnicodeDammit(markup, ["windows-1252"]).unicode_markup
+ # u'<p>I just \u201clove\u201d Microsoft Word\u2019s smart quotes</p>'
+
+Inconsistent encodings
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sometimes a document is mostly in UTF-8, but contains Windows-1252
+characters such as (again) Microsoft smart quotes. This can happen
+when a website includes data from multiple sources. You can use
+``UnicodeDammit.detwingle()`` to turn such a document into pure
+UTF-8. Here's a simple example::
+
+ snowmen = (u"\N{SNOWMAN}" * 3)
+ quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
+ doc = snowmen.encode("utf8") + quote.encode("windows_1252")
+
+This document is a mess.
The
+snowmen are in UTF-8 and the quotes are in Windows-1252. You can
+display the snowmen or the quotes, but not both::
+
+ print(doc)
+ # ☃☃☃�I like snowmen!�
+
+ print(doc.decode("windows-1252"))
+ # ☃☃☃“I like snowmen!”
+
+Decoding the document as UTF-8 raises a ``UnicodeDecodeError``, and
+decoding it as Windows-1252 gives you gibberish. Fortunately,
+``UnicodeDammit.detwingle()`` will convert the string to pure UTF-8,
+allowing you to decode it to Unicode and display the snowmen and the
+quotes simultaneously::
+
+ new_doc = UnicodeDammit.detwingle(doc)
+ print(new_doc.decode("utf8"))
+ # ☃☃☃“I like snowmen!”
+
+``UnicodeDammit.detwingle()`` only knows how to handle Windows-1252
+embedded in UTF-8 (or vice versa, I suppose), but this is the most
+common case.
+
+Note that you must call ``UnicodeDammit.detwingle()`` on your data
+before passing it into ``BeautifulSoup`` or the ``UnicodeDammit``
+constructor. Beautiful Soup assumes that a document has a single
+encoding, whatever it might be. If you pass it a document that
+contains both UTF-8 and Windows-1252, it's likely to think the whole
+document is Windows-1252, and the document will come out looking like
+``☃☃☃“I like snowmen!”``.
+
+``UnicodeDammit.detwingle()`` is new in Beautiful Soup 4.1.0.
+
+Line numbers
+================
+
+The ``html.parser`` and ``html5lib`` parsers can keep track of where
+in the original document each tag was found. You can access this
+information as ``Tag.sourceline`` (line number) and ``Tag.sourcepos``
+(position of the start tag within a line)::
+
+ markup = "<p\n>Paragraph 1</p>\n <p>Paragraph 2</p>"
+ soup = BeautifulSoup(markup, 'html.parser')
+ for tag in soup.find_all('p'):
+     print(tag.sourceline, tag.sourcepos, tag.string)
+ # (1, 0, u'Paragraph 1')
+ # (2, 3, u'Paragraph 2')
+
+Note that the two parsers mean slightly different things by
+``sourceline`` and ``sourcepos``. For html.parser, these numbers
+represent the position of the initial less-than sign. For html5lib,
+they represent the position of the final greater-than sign::
+
+ soup = BeautifulSoup(markup, 'html5lib')
+ for tag in soup.find_all('p'):
+     print(tag.sourceline, tag.sourcepos, tag.string)
+ # (2, 1, u'Paragraph 1')
+ # (3, 7, u'Paragraph 2')
+
+You can turn this feature off by passing ``store_line_numbers=False``
+into the ``BeautifulSoup`` constructor::
+
+ markup = "<p\n>Paragraph 1</p>\n <p>Paragraph 2</p>"
+ soup = BeautifulSoup(markup, 'html.parser', store_line_numbers=False)
+ soup.p.sourceline
+ # None
+
+This feature is new in 4.8.1, and the parsers based on lxml don't
+support it.
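+
+One practical use for this information is pointing users at the markup
+a problem came from. As a sketch, here's a loop that reports every tag
+in the document along with the line it started on, using the same
+markup and the html.parser numbering shown above::
+
+ soup = BeautifulSoup(markup, "html.parser")
+ # find_all(True) matches every tag in the document.
+ for tag in soup.find_all(True):
+     print("line %s: <%s>" % (tag.sourceline, tag.name))
+ # line 1: <p>
+ # line 2: <p>
+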
Comparing objects for equality
+================================
+
+Beautiful Soup says that two ``NavigableString`` or ``Tag`` objects
+are equal when they represent the same HTML or XML markup. In the
+example below, the two <b> tags are treated as equal, even though they
+live in different parts of the object tree, because they both look
+like "<b>pizza</b>"::
+
+ markup = "<p>I want <b>pizza</b> and more <b>pizza</b>!</p>"
+ soup = BeautifulSoup(markup, 'html.parser')
+ first_b, second_b = soup.find_all('b')
+ print(first_b == second_b)
+ # True
+
+ print(first_b.previous_element == second_b.previous_element)
+ # False
+
+If you want to see whether two variables refer to exactly the same
+object, use ``is``::
+
+ print(first_b is second_b)
+ # False
+
+Copying Beautiful Soup objects
+===============================
+
+You can use ``copy.copy()`` to create a copy of any ``Tag`` or
+``NavigableString``::
+
+ import copy
+ p_copy = copy.copy(soup.p)
+ print(p_copy)
+ # <p>I want <b>pizza</b> and more <b>pizza</b>!</p>
+
+
+The copy is considered equal to the original, since it represents the
+same markup as the original, but it's not the same object::
+
+ print(soup.p == p_copy)
+ # True
+
+ print(soup.p is p_copy)
+ # False
+
+The only real difference is that the copy is completely detached from
+the original Beautiful Soup object tree, just as if ``extract()`` had
+been called on it::
+
+ print(p_copy.parent)
+ # None
+
+This is because two different ``Tag`` objects can't occupy the same
+space at the same time.
+
+
+Parsing only part of a document
+=======================================
+
+Let's say you want to use Beautiful Soup to look at a document's <a>
+tags. It's a waste of time and memory to parse the entire document and
+then go over it again looking for <a> tags. It would be much faster to
+ignore everything that isn't an <a> tag in the first place. The
+``SoupStrainer`` class allows you to choose which parts of an incoming
+document get parsed. You just create a ``SoupStrainer`` and pass it in
+to the ``BeautifulSoup`` constructor as the ``parse_only`` argument.
+
+(Note that *this feature won't work if you're using the html5lib
+parser*. If you use html5lib, the whole document will be parsed no
+matter what. This is because html5lib constantly rearranges the parse
+tree as it works, and if some part of the document didn't actually
+make it into the tree, it'll crash. To avoid confusion, in the
+examples below I'll be forcing Beautiful Soup to use Python's built-in
+parser.)
+
+``SoupStrainer``
+----------------
+
+The ``SoupStrainer`` class takes the same arguments as a typical
+method from `Buscando na árvore`_ (Searching the tree): :ref:`name <name>`, :ref:`attrs
+<attrs>`, :ref:`string <string>`, and :ref:`**kwargs <kwargs>`.
Here are three ``SoupStrainer`` objects::
+
+ from bs4 import SoupStrainer
+
+ only_a_tags = SoupStrainer("a")
+
+ only_tags_with_id_link2 = SoupStrainer(id="link2")
+
+ def is_short_string(string):
+     return len(string) < 10
+
+ only_short_strings = SoupStrainer(string=is_short_string)
+
+I'm going to bring back the "three sisters" document one more time,
+and we'll see what the document looks like when it's parsed with these
+three different ``SoupStrainer`` objects::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+ <body>
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())
+ # <a class="sister" href="http://example.com/elsie" id="link1">
+ #  Elsie
+ # </a>
+ # <a class="sister" href="http://example.com/lacie" id="link2">
+ #  Lacie
+ # </a>
+ # <a class="sister" href="http://example.com/tillie" id="link3">
+ #  Tillie
+ # </a>
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())
+ # <a class="sister" href="http://example.com/lacie" id="link2">
+ #  Lacie
+ # </a>
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
+ # Elsie
+ # ,
+ # Lacie
+ # and
+ # Tillie
+ # ...
+ #
+
+You can also pass a ``SoupStrainer`` into any of the methods covered
+in `Buscando na árvore`_. This probably isn't terribly useful, but I
+thought I'd mention it::
+
+ soup = BeautifulSoup(html_doc)
+ soup.find_all(only_short_strings)
+ # [u'\n\n', u'\n\n', u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
+ #  u'\n\n', u'...', u'\n']
+
+Troubleshooting
+======================
+
+.. _diagnose:
+
+``diagnose()``
+--------------
+
+If you're having trouble understanding what Beautiful Soup is doing to
+a document, pass the document into the ``diagnose()`` function. (New
+in Beautiful Soup 4.2.0.) Beautiful Soup will print out a report
+showing you how the different parsers handle the document, and tell
+you if you're missing a parser that Beautiful Soup could be using::
+
+ from bs4.diagnose import diagnose
+ with open("bad.html") as fp:
+     data = fp.read()
+ diagnose(data)
+
+ # Diagnostic running on Beautiful Soup 4.2.0
+ # Python version 2.7.3 (default, Aug 1 2012, 05:16:07)
+ # I noticed that html5lib is not installed. Installing it may help.
+ # Found lxml version 2.3.2.0
+ #
+ # Trying to parse your data with html.parser
+ # Here's what html.parser did with the document:
+ # ...
+
+Just looking at the output of ``diagnose()`` may show you how to solve
+the problem. Even if it doesn't, you can paste the output of
+``diagnose()`` when asking for help.
+
+Errors when parsing a document
+--------------------------------------
+
+There are two different kinds of parse errors. There are crashes,
+where you feed a document to Beautiful Soup and it raises an
+exception, usually an ``HTMLParser.HTMLParseError``. And there is
+unexpected behavior, where a Beautiful Soup parse tree looks a lot
+different than the document used to create it.
+
+Almost none of these problems turn out to be problems with Beautiful
+Soup.
This is not
+because Beautiful Soup is an amazingly well-written piece of
+software. It's because Beautiful Soup doesn't include any parsing
+code. Instead, it relies on external parsers. If one parser isn't
+working on a certain document, the best solution is to try a different
+parser. See :ref:`Installing a parser <parser-installation>` for
+details and a comparison of the parsers.
+
+The most common parse errors are ``HTMLParser.HTMLParseError:
+malformed start tag`` and ``HTMLParser.HTMLParseError: bad end
+tag``. Both of these are generated by Python's built-in HTML parser
+library, and the solution is to :ref:`install lxml or html5lib
+<parser-installation>`.
+
+The most common type of unexpected behavior is that you can't find a
+tag that you know is in the document. You saw the search going in, but
+``find_all()`` returns ``[]`` or ``find()`` returns ``None``. This is
+a common problem with Python's built-in HTML parser, which sometimes
+skips tags it doesn't understand. Again, the solution is to
+:ref:`install lxml or html5lib <parser-installation>`.
+
+Version mismatch problems
+-----------------------------------------
+
+* ``SyntaxError: Invalid syntax`` (on the line ``ROOT_TAG_NAME =
+  u'[document]'``): Caused by running the Python 2 version of
+  Beautiful Soup under Python 3, without converting the code.
+
+* ``ImportError: No module named HTMLParser`` - Caused by running the
+  Python 2 version of Beautiful Soup under Python 3.
+
+* ``ImportError: No module named html.parser`` - Caused by running the
+  Python 3 version of Beautiful Soup under Python 2.
+
+* ``ImportError: No module named BeautifulSoup`` - Caused by running
+  Beautiful Soup 3 code on a system that doesn't have BS3
+  installed. Or by writing Beautiful Soup 4 code without knowing that
+  the package name is different under ``bs4``.
+
+* ``ImportError: No module named bs4`` - Caused by running Beautiful
+  Soup 4 code on a system that doesn't have BS4 installed.
+
+.. _parsing-xml:
+
+Parsing XML
+-----------------
+
+By default, Beautiful Soup parses documents as HTML. To parse a
+document as XML, pass in "xml" as the second argument to the
+``BeautifulSoup`` constructor::
+
+ soup = BeautifulSoup(markup, "xml")
+
+You'll need to have :ref:`lxml installed <parser-installation>`.
+
+Other parser problems
+---------------------------------
+
+* If your script works on one computer but not another, or in one
+  virtual environment but not another, or outside the virtual
+  environment but not inside, it's probably because the two
+  environments have different parser libraries available. For
+  example, you might have developed the script on a computer that has
+  lxml installed, and then tried to run it on a computer that only
+  has html5lib installed. See :ref:`Differences between parsers
+  <differences-between-parsers>` for why this matters, and fix the
+  problem by mentioning a specific parser library in the
+  ``BeautifulSoup`` constructor.
+
+* Because `HTML tags and attributes are case-insensitive
+  <http://www.w3.org/TR/html5/syntax.html#syntax>`_, all three HTML
+  parsers convert tag and attribute names to lowercase. That is, the
+  markup <TAG></TAG> is converted to <tag></tag>. If you want to
+  preserve the original case of tags and attributes, you'll need to
+  :ref:`parse the document as XML <parsing-xml>`, as in the sketch
+  after this list.
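+
+A sketch of that last point (this requires lxml, since it's the only
+supported XML parser)::
+
+ # The XML parser keeps the original case of tags and attributes.
+ soup = BeautifulSoup('<TagName Attr="1"></TagName>', "xml")
+ print(soup)
+ # <?xml version="1.0" encoding="utf-8"?>
+ # <TagName Attr="1"/>
+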
.. _misc:
+
+Miscellaneous
+-------------
+
+* ``UnicodeEncodeError: 'charmap' codec can't encode character
+  u'\xfoo' in position bar`` (or just about any other
+  ``UnicodeEncodeError``) - This is not a problem with Beautiful Soup.
+  It can come up in two situations: first, when you try to print a
+  Unicode character that your console doesn't know how to display
+  (see `this page on the Python wiki
+  <http://wiki.python.org/moin/PrintFails>`_ for help); second, when
+  you're writing to a file and you pass in a Unicode character that's
+  not supported by your default encoding. In this case, the simplest
+  solution is to explicitly encode the Unicode string into UTF-8 with
+  ``u.encode("utf8")``.
+
+* ``KeyError: [attr]`` - Caused by accessing ``tag['attr']`` when the
+  tag in question doesn't define the ``attr`` attribute. The most
+  common errors are ``KeyError: 'href'`` and ``KeyError:
+  'class'``. Use ``tag.get('attr')`` if you're not sure ``attr`` is
+  defined, just as you would with a Python dictionary.
+
+* ``AttributeError: 'ResultSet' object has no attribute 'foo'`` - This
+  usually happens because you expected ``find_all()`` to return a
+  single tag or string. But ``find_all()`` returns a _list_ of tags
+  and strings--a ``ResultSet`` object. You need to iterate over the
+  list and look at the ``.foo`` of each one. Or, if you really only
+  want one result, you need to use ``find()`` instead of
+  ``find_all()``.
+
+* ``AttributeError: 'NoneType' object has no attribute 'foo'`` - This
+  usually happens because you called ``find()`` and then tried to
+  access the ``.foo`` attribute of the result. But in your case,
+  ``find()`` didn't find anything, so it returned ``None`` instead of
+  a tag or a string. You need to figure out why your ``find()`` call
+  isn't returning anything.
+
+Improving performance
+------------------------
+
+Beautiful Soup will never be as fast as the parsers it sits on top
+of. If response time is critical, if you're paying for computer time
+by the hour, or if there's any other reason why computer time is more
+valuable than programmer time, you should forget about Beautiful Soup
+and work directly atop `lxml <http://lxml.de/>`_.
+
+That said, there are things you can do to speed up Beautiful Soup. If
+you're not using lxml as the underlying parser, my advice is to
+:ref:`start <parser-installation>`. Beautiful Soup parses documents
+significantly faster using lxml than using html.parser or html5lib.
+
+You can speed up encoding detection significantly by installing the
+`cchardet <http://pypi.python.org/pypi/cchardet/>`_ library.
+
+`Parsing only part of a document`_ won't save you much time parsing
+the document, but it can save a lot of memory, and it'll make
+`searching` the document much faster.
+
+Beautiful Soup 3
+================
+
+Beautiful Soup 3 is the previous release series, and is no longer
+being actively developed. It's currently packaged with most major
+Linux distributions:
+
+:kbd:`$ apt-get install python-beautifulsoup`
+
+It's also published through PyPi as ``BeautifulSoup``:
+
+:kbd:`$ easy_install BeautifulSoup`
+
+:kbd:`$ pip install BeautifulSoup`
+
+You can also `download a tarball of Beautiful Soup 3.2.0
+<http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz>`_.
+
+If you ran ``easy_install beautifulsoup`` or ``easy_install
+BeautifulSoup``, but your code doesn't work, you installed Beautiful
+Soup 3 by mistake. You need to run ``easy_install beautifulsoup4``.
+
+`The documentation for Beautiful Soup 3 is archived online
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_.
+
+Porting code to BS4
+--------------------------------
+
+Most code written against Beautiful Soup 3 will work against Beautiful
+Soup 4 with one simple change. All you should have to do is change the
+package name from ``BeautifulSoup`` to ``bs4``. So this::
+
+ from BeautifulSoup import BeautifulSoup
+
+becomes this::
+
+ from bs4 import BeautifulSoup
+
+* If you get the ``ImportError`` "No module named BeautifulSoup", your
+  problem is that you're trying to run Beautiful Soup 3 code, but you
+  only have Beautiful Soup 4 installed.
+
+* If you get the ``ImportError`` "No module named bs4", your problem
+  is that you're trying to run Beautiful Soup 4 code, but you only
+  have Beautiful Soup 3 installed.
+
+Although BS4 is mostly backwards-compatible with BS3, most of its
+methods were deprecated and renamed to comply with `PEP 8
+<http://www.python.org/dev/peps/pep-0008/>`_. There are numerous other
+renames and changes, and a few of them break backwards compatibility.
+
+Here's what you'll need to know to convert your BS3 code to BS4:
+
+You need a parser
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Beautiful Soup 3 used Python's ``SGMLParser``, a module that was
+deprecated and removed in Python 3.0. Beautiful Soup 4 uses
+``html.parser`` by default, but you can plug in lxml or html5lib and
+use those instead. See :ref:`Installing a parser
+<parser-installation>` for a comparison.
+
+Since ``html.parser`` is not the same parser as ``SGMLParser``, it's
+possible that Beautiful Soup 4 will give you a different parse tree
+than Beautiful Soup 3 gives you for the same markup. And if you swap
+out ``html.parser`` for lxml or html5lib, you may find that the tree
+changes yet again. If that happens, you'll need to update your code to
+deal with the new tree.
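+
+One way to keep a port predictable is to name the parser explicitly,
+so the tree shape doesn't silently change when a machine happens to
+have lxml installed. A minimal sketch::
+
+ # Beautiful Soup 3 picked its parser for you:
+ # from BeautifulSoup import BeautifulSoup
+ # soup = BeautifulSoup(markup)
+
+ # In Beautiful Soup 4, name the parser you tested against:
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(markup, "html.parser")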
+
+Method names
+^^^^^^^^^^^^^^^^^
+
+* ``renderContents`` -> ``encode_contents``
+* ``replaceWith`` -> ``replace_with``
+* ``replaceWithChildren`` -> ``unwrap``
+* ``findAll`` -> ``find_all``
+* ``findAllNext`` -> ``find_all_next``
+* ``findAllPrevious`` -> ``find_all_previous``
+* ``findNext`` -> ``find_next``
+* ``findNextSibling`` -> ``find_next_sibling``
+* ``findNextSiblings`` -> ``find_next_siblings``
+* ``findParent`` -> ``find_parent``
+* ``findParents`` -> ``find_parents``
+* ``findPrevious`` -> ``find_previous``
+* ``findPreviousSibling`` -> ``find_previous_sibling``
+* ``findPreviousSiblings`` -> ``find_previous_siblings``
+* ``getText`` -> ``get_text``
+* ``nextSibling`` -> ``next_sibling``
+* ``previousSibling`` -> ``previous_sibling``
+
+Some arguments to the Beautiful Soup constructor were renamed for the
+same reasons:
+
+* ``BeautifulSoup(parseOnlyThese=...)`` -> ``BeautifulSoup(parse_only=...)``
+* ``BeautifulSoup(fromEncoding=...)`` -> ``BeautifulSoup(from_encoding=...)``
+
+I renamed one method for compatibility with Python 3:
+
+* ``Tag.has_key()`` -> ``Tag.has_attr()``
+
+I renamed one attribute to use more accurate terminology:
+
+* ``Tag.isSelfClosing`` -> ``Tag.is_empty_element``
+
+I renamed three attributes to avoid using words that have special
+meaning to Python. Unlike the others, these changes are *not
+backwards compatible.* If you used these attributes in BS3, your code
+will break on BS4 until you change them.
+
+* ``UnicodeDammit.unicode`` -> ``UnicodeDammit.unicode_markup``
+* ``Tag.next`` -> ``Tag.next_element``
+* ``Tag.previous`` -> ``Tag.previous_element``
+
+Generators
+^^^^^^^^^^
+
+I gave the generators PEP 8-compliant names, and transformed them into
+properties:
+
+* ``childGenerator()`` -> ``children``
+* ``nextGenerator()`` -> ``next_elements``
+* ``nextSiblingGenerator()`` -> ``next_siblings``
+* ``previousGenerator()`` -> ``previous_elements``
+* ``previousSiblingGenerator()`` -> ``previous_siblings``
+* ``recursiveChildGenerator()`` -> ``descendants``
+* ``parentGenerator()`` -> ``parents``
+
+So instead of this::
+
+ for parent in tag.parentGenerator():
+     ...
+
+You can write this::
+
+ for parent in tag.parents:
+     ...
+
+(But the old version will still work.)
+
+Some of the generators used to yield ``None`` after they were done,
+and then stop. That was a bug. Now the generators just stop.
+
+There are two new generators, :ref:`.strings and
+.stripped_strings <string-generators>`. ``.strings`` yields
+NavigableString objects, and ``.stripped_strings`` yields Python
+strings with their whitespace stripped.
+
+XML
+^^^
+There is no longer a ``BeautifulStoneSoup`` class for parsing XML. To
+parse XML you pass in "xml" as the second argument to the
+``BeautifulSoup`` constructor. For the same reason, the
+``BeautifulSoup`` constructor no longer recognizes the ``isHTML``
+argument.
+
+Beautiful Soup's handling of empty-element XML tags has been
+improved. Previously, when you parsed XML you had to explicitly say
+which tags were considered empty-element tags. The
+``selfClosingTags`` argument is no longer recognized. Instead,
+Beautiful Soup considers any empty tag to be an empty-element tag. If
+you add a child to an empty-element tag, it stops being an
+empty-element tag.
+
+Entities
+^^^^^^^^^
+
+An incoming HTML or XML entity is always converted into the
+corresponding Unicode character.
Beautiful Soup
+3 had a number of overlapping ways of dealing with entities, which
+have been removed. The ``BeautifulSoup`` constructor no longer
+recognizes the ``smartQuotesTo`` or ``convertEntities``
+arguments. (`Unicode, Dammit`_ still has ``smart_quotes_to``, but its
+default is now to convert smart quotes to Unicode.) The constants
+``HTML_ENTITIES``, ``XML_ENTITIES``, and ``XHTML_ENTITIES`` have been
+removed, since they configured a feature (transforming some but not
+all entities into Unicode characters) that no longer exists.
+If you want to turn Unicode characters back into HTML entities on
+output, rather than turning them into UTF-8 characters, you need to
+use an :ref:`output formatter <output_formatters>`.
+
+Miscellaneous
+^^^^^^^^^^^^^
+
+:ref:`Tag.string <.string>` now operates recursively. If tag A
+contains a single tag B and nothing else, then A.string is the same as
+B.string. (Previously, it was None.)
+
+`Atributos com múltiplos valores`_ (multi-valued attributes) like
+``class`` have lists of strings as their values, not strings. This may
+affect the way you search by CSS class.
+
+If you pass one of the ``find*`` methods both :ref:`string <string>`
+`and` a tag-specific argument like :ref:`name <name>`, Beautiful Soup
+will search for tags that match your tag-specific criteria and whose
+:ref:`Tag.string <.string>` matches your value for
+:ref:`string <string>`. It will `not` find the strings
+themselves. Previously, Beautiful Soup ignored the tag-specific
+arguments and looked only at the strings.
+
+The ``BeautifulSoup`` constructor no longer recognizes the
+`markupMassage` argument. It's now the parser's responsibility to
+handle markup correctly.
+
+The rarely-used alternate parser classes like
+``ICantBelieveItsBeautifulSoup`` and ``BeautifulSOAP`` have been
+removed. It's now the parser's decision how to handle ambiguous
+markup.
+
+The ``prettify()`` method now returns a Unicode string, not a
+bytestring.
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/Makefile b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/Makefile
new file mode 100644
index 00000000000..8c833d2cedb
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/Makefile
@@ -0,0 +1,130 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." 
+	make -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/6.1.jpg b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/6.1.jpg
new file mode 100644
index 00000000000..97014f0ec04
Binary files differ
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/6.1.jpg
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/bs4ru.rst b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/bs4ru.rst
new file mode 100644
index 00000000000..f39a6e82986
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/bs4ru.rst
@@ -0,0 +1,3360 @@
+Beautiful Soup Documentation
+============================
+
+.. image:: 6.1.jpg
+   :align: right
+   :alt: "The Fish-Footman began by producing from under his arm a great letter, nearly as large as himself."
+
+`Beautiful Soup <http://www.crummy.com/software/BeautifulSoup/>`_ is a
+Python library for pulling data out of HTML and XML files. It works
+with your favorite parser to provide natural ways of navigating,
+searching, and modifying the parse tree. It commonly saves programmers
+hours or days of work.
+
+These instructions illustrate all major features of Beautiful Soup 4,
+with examples. I show you what the library is good for, how it works,
+how to use it, how to make it do what you want, and what to do when it
+violates your expectations.
+
+The examples in this documentation work the same way in Python 2.7
+and Python 3.2.
+
+You might be looking for the documentation for `Beautiful Soup 3
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_.
+If so, you should know that Beautiful Soup 3 is no longer being
+developed and that support for it will be dropped on or after
+December 31, 2020. If you want to learn about the differences between
+Beautiful Soup 3 and Beautiful Soup 4, see `Перенос кода на BS4`_
+(Porting code to BS4).
+
+This documentation has been translated into other languages by
+Beautiful Soup users:
+
+* `这篇文档当然还有中文版. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/>`_
+* このページは日本語で利用できます(`外部リンク <http://kondou.com/BS4/>`_)
+* `이 문서는 한국어 번역도 가능합니다. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.ko/>`_
+* `Este documento também está disponível em Português do Brasil.
<https://www.crummy.com/software/BeautifulSoup/bs4/doc.ptbr/>`_
+
+
+Getting help
+---------------------
+
+If you have questions about Beautiful Soup, or run into problems,
+`send mail to the discussion group
+<https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup>`_. If
+your problem involves parsing an HTML document, be sure to mention
+:ref:`what the diagnose() function says <diagnose>` about that
+document.
+
+Quick Start
+=============
+
+Here's an HTML document I'll be using as an example throughout this
+documentation. It's part of a story from `Alice in Wonderland`::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+ <body>
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+Running the document through Beautiful Soup gives us a
+``BeautifulSoup`` object, which represents the document as a nested
+data structure::
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(html_doc, 'html.parser')
+
+ print(soup.prettify())
+ # <html>
+ #  <head>
+ #   <title>
+ #    The Dormouse's story
+ #   </title>
+ #  </head>
+ #  <body>
+ #   <p class="title">
+ #    <b>
+ #     The Dormouse's story
+ #    </b>
+ #   </p>
+ #   <p class="story">
+ #    Once upon a time there were three little sisters; and their names were
+ #    <a class="sister" href="http://example.com/elsie" id="link1">
+ #     Elsie
+ #    </a>
+ #    ,
+ #    <a class="sister" href="http://example.com/lacie" id="link2">
+ #     Lacie
+ #    </a>
+ #    and
+ #    <a class="sister" href="http://example.com/tillie" id="link3">
+ #     Tillie
+ #    </a>
+ #    ; and they lived at the bottom of a well.
+ #   </p>
+ #   <p class="story">
+ #    ...
+ #   </p>
+ #  </body>
+ # </html>
+
+Here are some simple ways to navigate that data structure::
+
+ soup.title
+ # <title>The Dormouse's story</title>
+
+ soup.title.name
+ # u'title'
+
+ soup.title.string
+ # u'The Dormouse's story'
+
+ soup.title.parent.name
+ # u'head'
+
+ soup.p
+ # <p class="title"><b>The Dormouse's story</b></p>
+
+ soup.p['class']
+ # u'title'
+
+ soup.a
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ soup.find_all('a')
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ soup.find(id="link3")
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+One common task is extracting all the URLs found within a page's <a>
+tags::
+
+ for link in soup.find_all('a'):
+     print(link.get('href'))
+ # http://example.com/elsie
+ # http://example.com/lacie
+ # http://example.com/tillie
+
+Another common task is extracting all the text from a page::
+
+ print(soup.get_text())
+ # The Dormouse's story
+ #
+ # The Dormouse's story
+ #
+ # Once upon a time there were three little sisters; and their names were
+ # Elsie,
+ # Lacie and
+ # Tillie;
+ # and they lived at the bottom of a well.
+ #
+ # ...
+
+Does this look like what you need? If so, read on.
+
+Installing Beautiful Soup
+=========================
+
+If you're using a recent version of Debian or Ubuntu Linux, you can
+install Beautiful Soup with the system package manager:
+
+:kbd:`$ apt-get install python-bs4` (for Python 2)
+
+:kbd:`$ apt-get install python3-bs4` (for Python 3)
+
+Beautiful Soup 4 is published through PyPi, so if you can't install it
+with the system packager, you can install it with ``easy_install`` or
+``pip``. The package name is ``beautifulsoup4``, and the same package
+works on Python 2 and Python 3. Make sure you use the version of
+``pip`` or ``easy_install`` meant for your Python version (they may be
+named ``pip3`` and ``easy_install3`` respectively if you're using
+Python 3).
+
+:kbd:`$ easy_install beautifulsoup4`
+
+:kbd:`$ pip install beautifulsoup4`
+
+(The ``BeautifulSoup`` package is most likely `not` what you want.
+That's the previous major release, `Beautiful Soup 3`_. Lots of
+software uses BS3, so it's still available, but if you're writing new
+code you should install ``beautifulsoup4``.)
+
+If you don't have ``easy_install`` or ``pip`` installed, you can
+`download the Beautiful Soup 4 source tarball
+<http://www.crummy.com/software/BeautifulSoup/download/4.x/>`_ and
+install it with ``setup.py``.
+
+:kbd:`$ python setup.py install`
+
+If all else fails, the license for Beautiful Soup allows you to
+package the entire library with your application. You can download the
+tarball, copy its ``bs4`` directory into your application's codebase,
+and use Beautiful Soup without installing it at all.
+
+I use Python 2.7 and Python 3.2 to develop Beautiful Soup, but the
+library should work with more recent versions of Python as well.
+
+Problems after installation
+---------------------------
+
+Beautiful Soup is packaged as Python 2 code. When you install it for
+use with Python 3, it's automatically converted to Python 3 code. If
+you don't install the library as a package, the code won't be
+converted. There have also been reports on Windows machines of the
+wrong version being installed.
+
+If you get the ``ImportError`` "No module named HTMLParser", your
+problem is that you're running the Python 2 version of the code under
+Python 3.
+
+If you get the ``ImportError`` "No module named html.parser", your
+problem is that you're running the Python 3 version of the code under
+Python 2.
+
+In both cases, your best bet is to completely remove the Beautiful
+Soup installation from your system (including any directory created
+when you unpacked the tarball) and try the installation again.
+
+If you get the ``SyntaxError`` "Invalid syntax" on the line
+``ROOT_TAG_NAME = u'[document]'``, you need to convert the Python 2
+code to Python 3. You can do this either by installing the package:
+
+:kbd:`$ python3 setup.py install`
+
+or by manually running Python's ``2to3`` conversion script on the
+``bs4`` directory:
+
+:kbd:`$ 2to3-3.2 -w bs4`
+
+.. _parser-installation:
+
+
+Installing a parser
+-------------------
+
+Beautiful Soup supports the HTML parser included in Python's standard
+library, but it also supports a number of third-party Python parsers.
+One is the `lxml parser <http://lxml.de/>`_.
В зависимости от ваших настроек, +вы можете установить lxml с помощью одной из следующих команд: + +:kbd:`$ apt-get install python-lxml` + +:kbd:`$ easy_install lxml` + +:kbd:`$ pip install lxml` + +Другая альтернатива — написанный исключительно на Python `парсер html5lib +<http://code.google.com/p/html5lib/>`_, который разбирает HTML таким же образом, +как это делает веб-браузер. В зависимости от ваших настроек, вы можете установить html5lib +с помощью одной из этих команд: + +:kbd:`$ apt-get install python-html5lib` + +:kbd:`$ easy_install html5lib` + +:kbd:`$ pip install html5lib` + +Эта таблица суммирует преимущества и недостатки каждого парсера: + ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ +| Парсер | Типичное использование | Преимущества | Недостатки | ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ +| html.parser от Python| ``BeautifulSoup(markup, "html.parser")`` | * Входит в комплект | * Не такой быстрый, как | +| | | * Приличная скорость | lxml, более строгий, | +| | | * Нестрогий (по крайней мере, | чем html5lib. | +| | | в Python 2.7.3 и 3.2.) | | ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ +| HTML-парсер в lxml | ``BeautifulSoup(markup, "lxml")`` | * Очень быстрый | * Внешняя зависимость | +| | | * Нестрогий | от C | ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ +| XML-парсер в lxml | ``BeautifulSoup(markup, "lxml-xml")`` | * Очень быстрый | * Внешняя зависимость | +| | ``BeautifulSoup(markup, "xml")`` | * Единственный XML-парсер, | от C | +| | | который сейчас поддерживается| | ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ +| html5lib | ``BeautifulSoup(markup, "html5lib")`` | * Очень нестрогий | * Очень медленный | +| | | * Разбирает страницы так же, | * Внешняя зависимость | +| | | как это делает браузер | от Python | +| | | * Создает валидный HTML5 | | ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ + +Я рекомендую по возможности установить и использовать lxml для быстродействия. Если вы +используете версию Python 2 более раннюю, чем 2.7.3, или версию Python 3 +более раннюю, чем 3.2.2, `необходимо` установить lxml или +html5lib, потому что встроенный в Python парсер HTML просто недостаточно хорош в старых +версиях. + +Обратите внимание, что если документ невалиден, различные парсеры будут генерировать +дерево Beautiful Soup для этого документа по-разному. Ищите подробности в разделе `Различия +между парсерами`_. + +Приготовление супа +================== + +Чтобы разобрать документ, передайте его в +конструктор ``BeautifulSoup``. Вы можете передать строку или открытый дескриптор файла:: + + from bs4 import BeautifulSoup + + with open("index.html") as fp: + soup = BeautifulSoup(fp) + + soup = BeautifulSoup("<html>data</html>") + +Первым делом документ конвертируется в Unicode, а HTML-мнемоники +конвертируются в символы Unicode:: + + BeautifulSoup("Sacré bleu!") + <html><head></head><body>Sacré bleu!</body></html> + +Затем Beautiful Soup анализирует документ, используя лучший из доступных +парсеров. 
Библиотека будет использовать HTML-парсер, если вы явно не укажете, +что нужно использовать XML-парсер. (См. `Разбор XML`_.) + +Виды объектов +============= + +Beautiful Soup превращает сложный HTML-документ в сложное дерево +объектов Python. Однако вам придется иметь дело только с четырьмя +`видами` объектов: ``Tag``, ``NavigableString``, ``BeautifulSoup`` +и ``Comment``. + +.. _Tag: + +``Tag`` +------- + +Объект ``Tag`` соответствует тегу XML или HTML в исходном документе:: + + soup = BeautifulSoup('<b class="boldest">Extremely bold</b>') + tag = soup.b + type(tag) + # <class 'bs4.element.Tag'> + +У объекта Tag (далее «тег») много атрибутов и методов, и я расскажу о большинстве из них +в разделах `Навигация по дереву`_ и `Поиск по дереву`_. На данный момент наиболее +важными особенностями тега являются его имя и атрибуты. + +Имя +^^^ + +У каждого тега есть имя, доступное как ``.name``:: + + tag.name + # u'b' + +Если вы измените имя тега, это изменение будет отражено в любой HTML- +разметке, созданной Beautiful Soup:: + + tag.name = "blockquote" + tag + # <blockquote class="boldest">Extremely bold</blockquote> + +Атрибуты +^^^^^^^^ + +У тега может быть любое количество атрибутов. Тег ``<b +id = "boldest">`` имеет атрибут "id", значение которого равно +"boldest". Вы можете получить доступ к атрибутам тега, обращаясь с тегом как +со словарем:: + + tag['id'] + # u'boldest' + +Вы можете получить доступ к этому словарю напрямую как к ``.attrs``:: + + tag.attrs + # {u'id': 'boldest'} + +Вы можете добавлять, удалять и изменять атрибуты тега. Опять же, это +делается путем обращения с тегом как со словарем:: + + tag['id'] = 'verybold' + tag['another-attribute'] = 1 + tag + # <b another-attribute="1" id="verybold"></b> + + del tag['id'] + del tag['another-attribute'] + tag + # <b></b> + + tag['id'] + # KeyError: 'id' + print(tag.get('id')) + # None + +.. _multivalue: + +Многозначные атрибуты +&&&&&&&&&&&&&&&&&&&&& + +В HTML 4 определено несколько атрибутов, которые могут иметь множество значений. В HTML 5 +пара таких атрибутов удалена, но определено еще несколько. Самый распространённый из +многозначных атрибутов — это ``class`` (т. е. тег может иметь более +одного класса CSS). Среди прочих ``rel``, ``rev``, ``accept-charset``, +``headers`` и ``accesskey``. 
Beautiful Soup представляет значение(я)
+многозначного атрибута в виде списка::
+
+ css_soup = BeautifulSoup('<p class="body"></p>')
+ css_soup.p['class']
+ # ["body"]
+
+ css_soup = BeautifulSoup('<p class="body strikeout"></p>')
+ css_soup.p['class']
+ # ["body", "strikeout"]
+
+Если атрибут `выглядит` так, будто он имеет более одного значения, но это не
+многозначный атрибут, определенный какой-либо версией
+HTML-стандарта, Beautiful Soup оставит атрибут как есть::
+
+ id_soup = BeautifulSoup('<p id="my id"></p>')
+ id_soup.p['id']
+ # 'my id'
+
+Когда вы преобразовываете тег обратно в строку, несколько значений атрибута
+объединяются::
+
+ rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
+ rel_soup.a['rel']
+ # ['index']
+ rel_soup.a['rel'] = ['index', 'contents']
+ print(rel_soup.p)
+ # <p>Back to the <a rel="index contents">homepage</a></p>
+
+Вы можете отключить объединение, передав ``multi_valued_attributes=None`` в качестве
+именованного аргумента в конструктор ``BeautifulSoup``::
+
+ no_list_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html', multi_valued_attributes=None)
+ no_list_soup.p['class']
+ # u'body strikeout'
+
+Вы можете использовать ``get_attribute_list``, для того чтобы получить значение в виде списка,
+независимо от того, является ли атрибут многозначным или нет::
+
+ id_soup.p.get_attribute_list('id')
+ # ["my id"]
+
+Если вы разбираете документ как XML, многозначных атрибутов не будет::
+
+ xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
+ xml_soup.p['class']
+ # u'body strikeout'
+
+Опять же, вы можете поменять настройку, используя аргумент ``multi_valued_attributes``::
+
+ class_is_multi = {'*': 'class'}
+ xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml', multi_valued_attributes=class_is_multi)
+ xml_soup.p['class']
+ # [u'body', u'strikeout']
+
+Вряд ли вам это пригодится, но если все-таки будет нужно, руководствуйтесь значениями
+по умолчанию. Они реализуют правила, описанные в спецификации HTML::
+
+ from bs4.builder import builder_registry
+ builder_registry.lookup('html').DEFAULT_CDATA_LIST_ATTRIBUTES
+
+
+``NavigableString``
+-------------------
+
+Строка соответствует фрагменту текста в теге. Beautiful Soup
+использует класс ``NavigableString`` для хранения этих фрагментов текста::
+
+ tag.string
+ # u'Extremely bold'
+ type(tag.string)
+ # <class 'bs4.element.NavigableString'>
+
+``NavigableString`` похожа на строку Unicode в Python, не считая того,
+что она также поддерживает некоторые функции, описанные в
+разделах `Навигация по дереву`_ и `Поиск по дереву`_. Вы можете конвертировать
+``NavigableString`` в строку Unicode с помощью ``unicode()``::
+
+ unicode_string = unicode(tag.string)
+ unicode_string
+ # u'Extremely bold'
+ type(unicode_string)
+ # <type 'unicode'>
+
+Вы не можете редактировать строку непосредственно, но вы можете заменить одну строку
+другой, используя :ref:`replace_with()`::
+
+ tag.string.replace_with("No longer bold")
+ tag
+ # <blockquote>No longer bold</blockquote>
+
+``NavigableString`` поддерживает большинство функций, описанных в
+разделах `Навигация по дереву`_ и `Поиск по дереву`_, но
+не все. В частности, поскольку строка не может ничего содержать (в том смысле,
+в котором тег может содержать строку или другой тег), строки не поддерживают
+атрибуты ``.contents`` и ``.string`` или метод ``find()``.
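+
+Поэтому при обходе смешанного содержимого бывает удобно проверять тип
+элемента через ``isinstance``. Небольшой набросок (пример и имена здесь
+условные; классы ``Tag`` и ``NavigableString`` импортируются из пакета ``bs4``)::
+
+ from bs4 import BeautifulSoup, NavigableString, Tag
+
+ mixed = BeautifulSoup('<p>Hello <b>world</b>!</p>', 'html.parser')
+ for element in mixed.p.contents:
+     if isinstance(element, Tag):
+         print("тег: %s" % element.name)
+     elif isinstance(element, NavigableString):
+         print("строка: %r" % element)
+ # строка: u'Hello '
+ # тег: b
+ # строка: u'!'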
+
+Если вы хотите использовать ``NavigableString`` вне Beautiful Soup,
+вам нужно вызвать метод ``unicode()``, чтобы превратить ее в обычную для Python
+строку Unicode. Если вы этого не сделаете, ваша строка будет тащить за собой
+ссылку на все дерево разбора Beautiful Soup, даже когда вы
+закончите использовать Beautiful Soup. Это большой расход памяти.
+
+``BeautifulSoup``
+-----------------
+
+Объект ``BeautifulSoup`` представляет разобранный документ как единое
+целое. В большинстве случаев вы можете рассматривать его как объект
+:ref:`Tag`. Это означает, что он поддерживает большинство методов, описанных
+в разделах `Навигация по дереву`_ и `Поиск по дереву`_.
+
+Вы также можете передать объект ``BeautifulSoup`` в один из методов,
+перечисленных в разделе `Изменение дерева`_, по аналогии с передачей объекта :ref:`Tag`. Это
+позволяет вам делать такие вещи, как объединение двух разобранных документов::
+
+ doc = BeautifulSoup("<document><content/>INSERT FOOTER HERE</document>", "xml")
+ footer = BeautifulSoup("<footer>Here's the footer</footer>", "xml")
+ doc.find(text="INSERT FOOTER HERE").replace_with(footer)
+ # u'INSERT FOOTER HERE'
+ print(doc)
+ # <?xml version="1.0" encoding="utf-8"?>
+ # <document><content/><footer>Here's the footer</footer></document>
+
+Поскольку объект ``BeautifulSoup`` не соответствует действительному
+HTML- или XML-тегу, у него нет имени и атрибутов. Однако иногда
+бывает полезно взглянуть на ``.name`` объекта ``BeautifulSoup``, поэтому ему
+присвоено специальное значение ``.name``, равное "[document]"::
+
+ soup.name
+ # u'[document]'
+
+Комментарии и другие специфичные строки
+---------------------------------------
+
+``Tag``, ``NavigableString`` и ``BeautifulSoup`` охватывают почти
+все, с чем вы столкнётесь в файле HTML или XML, но осталось
+ещё немного. Пожалуй, единственное, о чем стоит волноваться,
+это комментарий::
+
+ markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
+ soup = BeautifulSoup(markup)
+ comment = soup.b.string
+ type(comment)
+ # <class 'bs4.element.Comment'>
+
+Объект ``Comment`` — это просто особый тип ``NavigableString``::
+
+ comment
+ # u'Hey, buddy. Want to buy a used parser'
+
+Но когда он появляется как часть HTML-документа, ``Comment``
+отображается со специальным форматированием::
+
+ print(soup.b.prettify())
+ # <b>
+ #  <!--Hey, buddy. Want to buy a used parser?-->
+ # </b>
+
+Beautiful Soup определяет классы для всего, что может появиться в
+XML-документе: ``CData``, ``ProcessingInstruction``,
+``Declaration`` и ``Doctype``. Как и ``Comment``, эти классы
+являются подклассами ``NavigableString``, которые добавляют что-то еще к
+строке.
Вот пример, который заменяет комментарий блоком
+CDATA::
+
+ from bs4 import CData
+ cdata = CData("A CDATA block")
+ comment.replace_with(cdata)
+
+ print(soup.b.prettify())
+ # <b>
+ #  <![CDATA[A CDATA block]]>
+ # </b>
+
+
+Навигация по дереву
+===================
+
+Вернемся к HTML-документу с фрагментом из «Алисы в стране чудес»::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+ <body>
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(html_doc, 'html.parser')
+
+Я буду использовать его в качестве примера, чтобы показать, как перейти от одной части
+документа к другой.
+
+Проход сверху вниз
+------------------
+
+Теги могут содержать строки и другие теги. Эти элементы являются
+дочерними (`children`) для тега. Beautiful Soup предоставляет множество различных атрибутов для
+навигации и перебора дочерних элементов.
+
+Обратите внимание, что строки Beautiful Soup не поддерживают ни один из этих
+атрибутов, потому что строка не может иметь дочерних элементов.
+
+Навигация с использованием имен тегов
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Самый простой способ навигации по дереву разбора — это указать имя
+тега, который вам нужен. Если вы хотите получить тег <head>, просто напишите ``soup.head``::
+
+ soup.head
+ # <head><title>The Dormouse's story</title></head>
+
+ soup.title
+ # <title>The Dormouse's story</title>
+
+Вы можете повторять этот трюк многократно, чтобы подробнее рассмотреть определенную часть
+дерева разбора. Следующий код извлекает первый тег <b> внутри тега <body>::
+
+ soup.body.b
+ # <b>The Dormouse's story</b>
+
+Использование имени тега в качестве атрибута даст вам только `первый` тег с таким
+именем::
+
+ soup.a
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+Если вам нужно получить `все` теги <a> или что-нибудь более сложное,
+чем первый тег с определенным именем, вам нужно использовать один из
+методов, описанных в разделе `Поиск по дереву`_, такой как `find_all()`::
+
+ soup.find_all('a')
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+``.contents`` и ``.children``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Дочерние элементы доступны в списке под названием ``.contents``::
+
+ head_tag = soup.head
+ head_tag
+ # <head><title>The Dormouse's story</title></head>
+
+ head_tag.contents
+ # [<title>The Dormouse's story</title>]
+
+ title_tag = head_tag.contents[0]
+ title_tag
+ # <title>The Dormouse's story</title>
+ title_tag.contents
+ # [u'The Dormouse's story']
+
+Сам объект ``BeautifulSoup`` имеет дочерние элементы.
В этом случае
+тег <html> является дочерним для объекта ``BeautifulSoup``::
+
+ len(soup.contents)
+ # 1
+ soup.contents[0].name
+ # u'html'
+
+У строки нет ``.contents``, потому что она не может содержать
+ничего::
+
+ text = title_tag.contents[0]
+ text.contents
+ # AttributeError: У объекта 'NavigableString' нет атрибута 'contents'
+
+Вместо того, чтобы получать дочерние элементы в виде списка, вы можете перебирать их
+с помощью генератора ``.children``::
+
+ for child in title_tag.children:
+     print(child)
+ # The Dormouse's story
+
+``.descendants``
+^^^^^^^^^^^^^^^^
+
+Атрибуты ``.contents`` и ``.children`` применяются только в отношении
+`непосредственных` дочерних элементов тега. Например, тег <head> имеет только один непосредственный
+дочерний тег <title>::
+
+ head_tag.contents
+ # [<title>The Dormouse's story</title>]
+
+Но у самого тега <title> есть дочерний элемент: строка "The Dormouse's
+story". В некотором смысле эта строка также является дочерним элементом
+тега <head>. Атрибут ``.descendants`` позволяет перебирать `все`
+дочерние элементы тега рекурсивно: его непосредственные дочерние элементы, дочерние элементы
+дочерних элементов и так далее::
+
+ for child in head_tag.descendants:
+     print(child)
+ # <title>The Dormouse's story</title>
+ # The Dormouse's story
+
+У тега <head> есть только один дочерний элемент, но при этом у него два потомка:
+тег <title> и его дочерний элемент. У объекта ``BeautifulSoup``
+только один прямой дочерний элемент (тег <html>), зато множество
+потомков::
+
+ len(list(soup.children))
+ # 1
+ len(list(soup.descendants))
+ # 25
+
+.. _.string:
+
+``.string``
+^^^^^^^^^^^
+
+Если у тега есть только один дочерний элемент, и это ``NavigableString``,
+его можно получить через ``.string``::
+
+ title_tag.string
+ # u'The Dormouse's story'
+
+Если единственным дочерним элементом тега является другой тег, и у этого `другого` тега есть строка
+``.string``, то считается, что родительский тег содержит ту же строку
+``.string``, что и дочерний тег::
+
+ head_tag.contents
+ # [<title>The Dormouse's story</title>]
+
+ head_tag.string
+ # u'The Dormouse's story'
+
+Если тег содержит больше чем один элемент, то неясно, на что должна
+указывать ``.string``, поэтому ``.string`` родительского тега имеет значение
+``None``::
+
+ print(soup.html.string)
+ # None
+
+.. _string-generators:
+
+``.strings`` и ``.stripped_strings``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Если внутри тега есть более одного элемента, вы все равно можете посмотреть только на
+строки. Используйте генератор ``.strings``::
+
+ for string in soup.strings:
+     print(repr(string))
+ # u"The Dormouse's story"
+ # u'\n\n'
+ # u"The Dormouse's story"
+ # u'\n\n'
+ # u'Once upon a time there were three little sisters; and their names were\n'
+ # u'Elsie'
+ # u',\n'
+ # u'Lacie'
+ # u' and\n'
+ # u'Tillie'
+ # u';\nand they lived at the bottom of a well.'
+ # u'\n\n'
+ # u'...'
+ # u'\n'
+
+В этих строках много лишних пробелов, которые вы можете
+удалить, используя генератор ``.stripped_strings``::
+
+ for string in soup.stripped_strings:
+     print(repr(string))
+ # u"The Dormouse's story"
+ # u"The Dormouse's story"
+ # u'Once upon a time there were three little sisters; and their names were'
+ # u'Elsie'
+ # u','
+ # u'Lacie'
+ # u'and'
+ # u'Tillie'
+ # u';\nand they lived at the bottom of a well.'
+ # u'...'
+
+Здесь строки, состоящие исключительно из пробелов, игнорируются, а
+пробелы в начале и конце строк удаляются.
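+
+Эти генераторы хорошо сочетаются со стандартными средствами Python:
+например, все «очищенные» строки документа можно склеить в один текст.
+Небольшой набросок::
+
+ text = " ".join(soup.stripped_strings)
+ text.startswith(u"The Dormouse's story")
+ # True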
+
+Проход снизу вверх
+------------------
+
+В продолжение аналогии с «семейным деревом», каждый тег и каждая строка имеет
+родителя (`parent`): тег, который его содержит.
+
+.. _.parent:
+
+``.parent``
+^^^^^^^^^^^
+
+Вы можете получить доступ к родительскому элементу с помощью атрибута ``.parent``. В
+примере документа с фрагментом из «Алисы в стране чудес» тег <head> является родительским
+для тега <title>::
+
+ title_tag = soup.title
+ title_tag
+ # <title>The Dormouse's story</title>
+ title_tag.parent
+ # <head><title>The Dormouse's story</title></head>
+
+Строка заголовка сама имеет родителя: тег <title>, содержащий
+ее::
+
+ title_tag.string.parent
+ # <title>The Dormouse's story</title>
+
+Родительским элементом тега верхнего уровня, такого как <html>, является сам объект
+``BeautifulSoup``::
+
+ html_tag = soup.html
+ type(html_tag.parent)
+ # <class 'bs4.BeautifulSoup'>
+
+И ``.parent`` объекта ``BeautifulSoup`` определяется как None::
+
+ print(soup.parent)
+ # None
+
+.. _.parents:
+
+``.parents``
+^^^^^^^^^^^^
+
+Вы можете перебрать всех родителей элемента с помощью
+``.parents``. В следующем примере ``.parents`` используется для перемещения от тега <a>,
+закопанного глубоко внутри документа, до самого верха документа::
+
+ link = soup.a
+ link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+ for parent in link.parents:
+     if parent is None:
+         print(parent)
+     else:
+         print(parent.name)
+ # p
+ # body
+ # html
+ # [document]
+ # None
+
+Перемещение вбок
+----------------
+
+Рассмотрим простой документ::
+
+ sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>")
+ print(sibling_soup.prettify())
+ # <html>
+ #  <body>
+ #   <a>
+ #    <b>
+ #     text1
+ #    </b>
+ #    <c>
+ #     text2
+ #    </c>
+ #   </a>
+ #  </body>
+ # </html>
+
+Тег <b> и тег <c> находятся на одном уровне: они оба непосредственные
+дочерние элементы одного и того же тега. Мы называем их `одноуровневыми`. Когда документ
+красиво отформатирован, одноуровневые элементы выводятся с одинаковым отступом. Вы
+также можете использовать это отношение в написанном вами коде.
+
+``.next_sibling`` и ``.previous_sibling``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Вы можете использовать ``.next_sibling`` и ``.previous_sibling`` для навигации
+между элементами страницы, которые находятся на одном уровне дерева разбора::
+
+ sibling_soup.b.next_sibling
+ # <c>text2</c>
+
+ sibling_soup.c.previous_sibling
+ # <b>text1</b>
+
+У тега <b> есть ``.next_sibling``, но нет ``.previous_sibling``,
+потому что нет ничего до тега <b> `на том же уровне
+дерева`. По той же причине у тега <c> есть ``.previous_sibling``,
+но нет ``.next_sibling``::
+
+ print(sibling_soup.b.previous_sibling)
+ # None
+ print(sibling_soup.c.next_sibling)
+ # None
+
+Строки "text1" и "text2" `не являются` одноуровневыми, потому что они не
+имеют общего родителя::
+
+ sibling_soup.b.string
+ # u'text1'
+
+ print(sibling_soup.b.string.next_sibling)
+ # None
+
+В реальных документах ``.next_sibling`` или ``.previous_sibling``
+тега обычно будет строкой, содержащей пробелы. Возвращаясь к
+фрагменту из «Алисы в стране чудес»::
+
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
+
+Вы можете подумать, что ``.next_sibling`` первого тега <a>
+должен быть второй тег <a>.
Но на самом деле это строка: запятая и
+перевод строки, отделяющие первый тег <a> от второго::
+
+ link = soup.a
+ link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ link.next_sibling
+ # u',\n'
+
+Второй тег <a> на самом деле является ``.next_sibling`` запятой::
+
+ link.next_sibling.next_sibling
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+
+.. _sibling-generators:
+
+``.next_siblings`` и ``.previous_siblings``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Вы можете перебрать одноуровневые элементы данного тега с помощью ``.next_siblings`` или
+``.previous_siblings``::
+
+ for sibling in soup.a.next_siblings:
+     print(repr(sibling))
+ # u',\n'
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+ # u' and\n'
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+ # u'; and they lived at the bottom of a well.'
+ # None
+
+ for sibling in soup.find(id="link3").previous_siblings:
+     print(repr(sibling))
+ # ' and\n'
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+ # u',\n'
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+ # u'Once upon a time there were three little sisters; and their names were\n'
+ # None
+
+Проход вперед и назад
+---------------------
+
+Взгляните на начало фрагмента из «Алисы в стране чудес»::
+
+ <html><head><title>The Dormouse's story</title></head>
+ <p class="title"><b>The Dormouse's story</b></p>
+
+HTML-парсер берет эту строку символов и превращает ее в
+серию событий: "открыть тег <html>", "открыть тег <head>", "открыть
+тег <title>", "добавить строку", "закрыть тег <title>", "открыть
+тег <p>" и так далее. Beautiful Soup предлагает инструменты для реконструирования
+первоначального разбора документа.
+
+.. _element-generators:
+
+``.next_element`` и ``.previous_element``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Атрибут ``.next_element`` строки или тега указывает на то,
+что было разобрано непосредственно после него. Это могло бы быть тем же, что и
+``.next_sibling``, но обычно результат резко отличается.
+
+Возьмем последний тег <a> в фрагменте из «Алисы в стране чудес». Его
+``.next_sibling`` является строкой: конец предложения, которое было
+прервано началом тега <a>::
+
+ last_a_tag = soup.find("a", id="link3")
+ last_a_tag
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+ last_a_tag.next_sibling
+ # '; and they lived at the bottom of a well.'
+
+Но ``.next_element`` этого тега <a> — это то, что было разобрано
+сразу после тега <a>, `не` остальная часть этого предложения:
+это слово "Tillie"::
+
+ last_a_tag.next_element
+ # u'Tillie'
+
+Это потому, что в оригинальной разметке слово «Tillie» появилось
+перед точкой с запятой. Парсер обнаружил тег <a>, затем
+слово «Tillie», затем закрывающий тег </a>, затем точку с запятой и оставшуюся
+часть предложения. Точка с запятой находится на том же уровне, что и тег <a>, но
+слово «Tillie» встретилось первым.
+
+Атрибут ``.previous_element`` является полной противоположностью
+``.next_element``. Он указывает на элемент, который был встречен при разборе
+непосредственно перед текущим::
+
+ last_a_tag.previous_element
+ # u' and\n'
+ last_a_tag.previous_element.next_element
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+``.next_elements`` и ``.previous_elements``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Вы уже должны были уловить идею.
Вы можете использовать их для перемещения
+вперед или назад по документу, в том порядке, в каком он был разобран парсером::
+
+ for element in last_a_tag.next_elements:
+     print(repr(element))
+ # u'Tillie'
+ # u';\nand they lived at the bottom of a well.'
+ # u'\n\n'
+ # <p class="story">...</p>
+ # u'...'
+ # u'\n'
+ # None
+
+Поиск по дереву
+===============
+
+Beautiful Soup определяет множество методов поиска по дереву разбора,
+но они все очень похожи. Я буду долго объяснять, как работают
+два самых популярных метода: ``find()`` и ``find_all()``. Прочие
+методы принимают практически те же самые аргументы, поэтому я расскажу
+о них вкратце.
+
+И опять, я буду использовать фрагмент из «Алисы в стране чудес» в качестве примера::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+ <body>
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(html_doc, 'html.parser')
+
+Передав фильтр в такой метод, как ``find_all()``, вы можете
+углубиться в интересующие вас части документа.
+
+Виды фильтров
+-------------
+
+Прежде чем подробно рассказывать о ``find_all()`` и подобных методах, я
+хочу показать примеры различных фильтров, которые вы можете передать в эти
+методы. Эти фильтры появляются снова и снова в
+поисковом API. Вы можете использовать их для фильтрации по имени тега,
+по его атрибутам, по тексту строки или по некоторой их
+комбинации.
+
+.. _a string:
+
+Строка
+^^^^^^
+
+Самый простой фильтр — это строка. Передайте строку в метод поиска, и
+Beautiful Soup выполнит поиск соответствия этой строке. Следующий
+код находит все теги <b> в документе::
+
+ soup.find_all('b')
+ # [<b>The Dormouse's story</b>]
+
+Если вы передадите байтовую строку, Beautiful Soup будет считать, что строка
+кодируется в UTF-8. Вы можете избежать этого, передав вместо нее строку Unicode.
+
+.. _a regular expression:
+
+Регулярное выражение
+^^^^^^^^^^^^^^^^^^^^
+
+Если вы передадите объект с регулярным выражением, Beautiful Soup отфильтрует результаты
+в соответствии с этим регулярным выражением, используя его метод ``search()``. Следующий код
+находит все теги, имена которых начинаются с буквы "b"; в нашем
+случае это теги <body> и <b>::
+
+ import re
+ for tag in soup.find_all(re.compile("^b")):
+     print(tag.name)
+ # body
+ # b
+
+Этот код находит все теги, имена которых содержат букву "t"::
+
+ for tag in soup.find_all(re.compile("t")):
+     print(tag.name)
+ # html
+ # title
+
+.. _a list:
+
+Список
+^^^^^^
+
+Если вы передадите список, Beautiful Soup разрешит совпадение строк
+с `любым` элементом из этого списка. Следующий код находит все теги <a>
+`и` все теги <b>::
+
+ soup.find_all(["a", "b"])
+ # [<b>The Dormouse's story</b>,
+ #  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+.. _the value True:
+
+``True``
+^^^^^^^^
+
+Значение ``True`` подходит везде, где возможно.
Следующий код находит `все`
+теги в документе, но не текстовые строки::
+
+ for tag in soup.find_all(True):
+     print(tag.name)
+ # html
+ # head
+ # title
+ # body
+ # p
+ # b
+ # p
+ # a
+ # a
+ # a
+ # p
+
+.. _a function:
+
+Функция
+^^^^^^^
+
+Если ничто из перечисленного вам не подходит, определите функцию, которая
+принимает элемент в качестве единственного аргумента. Функция должна вернуть
+``True``, если аргумент подходит, и ``False``, если нет.
+
+Вот функция, которая возвращает ``True``, если в теге определен атрибут "class",
+но не определен атрибут "id"::
+
+ def has_class_but_no_id(tag):
+     return tag.has_attr('class') and not tag.has_attr('id')
+
+Передайте эту функцию в ``find_all()``, и вы получите все
+теги <p>::
+
+ soup.find_all(has_class_but_no_id)
+ # [<p class="title"><b>The Dormouse's story</b></p>,
+ #  <p class="story">Once upon a time there were...</p>,
+ #  <p class="story">...</p>]
+
+Эта функция выбирает только теги <p>. Она не выбирает теги <a>,
+поскольку в них определены и атрибут "class", и атрибут "id". Она не выбирает
+теги вроде <html> и <title>, потому что в них не определен атрибут
+"class".
+
+Если вы передаете функцию для фильтрации по определенному атрибуту, такому как
+``href``, аргументом, переданным в функцию, будет
+значение атрибута, а не весь тег. Вот функция, которая находит все теги ``a``,
+у которых атрибут ``href`` *не* соответствует регулярному выражению::
+
+ def not_lacie(href):
+     return href and not re.compile("lacie").search(href)
+ soup.find_all(href=not_lacie)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Функция может быть настолько сложной, насколько вам нужно. Вот
+функция, которая возвращает ``True``, если тег окружен строковыми
+объектами::
+
+ from bs4 import NavigableString
+ def surrounded_by_strings(tag):
+     return (isinstance(tag.next_element, NavigableString)
+             and isinstance(tag.previous_element, NavigableString))
+
+ for tag in soup.find_all(surrounded_by_strings):
+     print(tag.name)
+ # p
+ # a
+ # a
+ # a
+ # p
+
+Теперь мы готовы подробно рассмотреть методы поиска.
+
+``find_all()``
+--------------
+
+Сигнатура: find_all(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive
+<recursive>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Метод ``find_all()`` просматривает потомков тега и
+извлекает `всех` потомков, которые соответствуют вашим фильтрам. Я привел несколько
+примеров в разделе `Виды фильтров`_, а вот еще несколько::
+
+ soup.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+ soup.find_all("p", "title")
+ # [<p class="title"><b>The Dormouse's story</b></p>]
+
+ soup.find_all("a")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ soup.find_all(id="link2")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+ import re
+ soup.find(string=re.compile("sisters"))
+ # u'Once upon a time there were three little sisters; and their names were\n'
+
+Кое-что из этого нам уже знакомо, но есть и новое. Что означает
+передача значения для ``string`` или ``id``? Почему
+``find_all("p", "title")`` находит тег <p> с CSS-классом "title"?
+Давайте посмотрим на аргументы ``find_all()``.
+
+..
_name:
+
+Аргумент ``name``
+^^^^^^^^^^^^^^^^^
+
+Передайте значение для аргумента ``name``, и вы скажете Beautiful Soup
+рассматривать только теги с определенными именами. Текстовые строки будут игнорироваться, так же как и
+теги, имена которых не соответствуют заданным.
+
+Вот простейший пример использования::
+
+ soup.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+В разделе `Виды фильтров`_ говорилось, что значением ``name`` может быть
+`строка`_, `регулярное выражение`_, `список`_, `функция`_ или
+`True`_.
+
+.. _kwargs:
+
+Именованные аргументы
+^^^^^^^^^^^^^^^^^^^^^
+
+Любой нераспознанный аргумент будет превращен в фильтр
+по атрибуту тега. Если вы передаете значение для аргумента с именем ``id``,
+Beautiful Soup будет фильтровать по атрибуту "id" каждого тега::
+
+ soup.find_all(id='link2')
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+Если вы передадите значение для ``href``, Beautiful Soup отфильтрует
+по атрибуту "href" каждого тега::
+
+ soup.find_all(href=re.compile("elsie"))
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+Для фильтрации по атрибуту может использоваться `строка`_, `регулярное
+выражение`_, `список`_, `функция`_ или значение `True`_.
+
+Следующий код находит все теги, атрибут ``id`` которых имеет значение,
+независимо от того, что это за значение::
+
+ soup.find_all(id=True)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Вы можете отфильтровать несколько атрибутов одновременно, передав более одного
+именованного аргумента::
+
+ soup.find_all(href=re.compile("elsie"), id='link1')
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+Некоторые атрибуты, такие как атрибуты data-* в HTML 5, имеют имена, которые
+нельзя использовать в качестве имен именованных аргументов::
+
+ data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
+ data_soup.find_all(data-foo="value")
+ # SyntaxError: keyword can't be an expression
+
+Вы можете использовать эти атрибуты в поиске, поместив их в
+словарь и передав словарь в ``find_all()`` как
+аргумент ``attrs``::
+
+ data_soup.find_all(attrs={"data-foo": "value"})
+ # [<div data-foo="value">foo!</div>]
+
+Нельзя использовать именованный аргумент для поиска в HTML по элементу "name",
+потому что Beautiful Soup использует аргумент ``name`` для имени
+самого тега. Вместо этого вы можете передать элемент "name" вместе с его значением в
+составе аргумента ``attrs``::
+
+ name_soup = BeautifulSoup('<input name="email"/>')
+ name_soup.find_all(name="email")
+ # []
+ name_soup.find_all(attrs={"name": "email"})
+ # [<input name="email"/>]
+
+.. _attrs:
+
+Поиск по классу CSS
+^^^^^^^^^^^^^^^^^^^
+
+Очень удобно искать тег с определенным классом CSS, но
+имя атрибута CSS, "class", является зарезервированным словом в
+Python. Использование ``class`` в качестве именованного аргумента приведет к синтаксической
+ошибке.
Начиная с Beautiful Soup 4.1.2, вы можете выполнять поиск по классу CSS, используя
+именованный аргумент ``class_``::
+
+ soup.find_all("a", class_="sister")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Как и с любым именованным аргументом, вы можете передать в качестве значения ``class_`` строку, регулярное
+выражение, функцию или ``True``::
+
+ soup.find_all(class_=re.compile("itl"))
+ # [<p class="title"><b>The Dormouse's story</b></p>]
+
+ def has_six_characters(css_class):
+     return css_class is not None and len(css_class) == 6
+
+ soup.find_all(class_=has_six_characters)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Помните, что один тег может иметь :ref:`несколько значений <multivalue>`
+для атрибута "class". Когда вы ищете тег, который
+соответствует определенному классу CSS, вы ищете соответствие `любому` из его
+классов CSS::
+
+ css_soup = BeautifulSoup('<p class="body strikeout"></p>')
+ css_soup.find_all("p", class_="strikeout")
+ # [<p class="body strikeout"></p>]
+
+ css_soup.find_all("p", class_="body")
+ # [<p class="body strikeout"></p>]
+
+Можно искать точное строковое значение атрибута ``class``::
+
+ css_soup.find_all("p", class_="body strikeout")
+ # [<p class="body strikeout"></p>]
+
+Но поиск вариантов строкового значения не сработает::
+
+ css_soup.find_all("p", class_="strikeout body")
+ # []
+
+Если вы хотите искать теги, которые соответствуют двум или более классам CSS,
+следует использовать селектор CSS::
+
+ css_soup.select("p.strikeout.body")
+ # [<p class="body strikeout"></p>]
+
+В старых версиях Beautiful Soup, в которых нет ярлыка ``class_``,
+можно использовать трюк с аргументом ``attrs``, упомянутый выше. Создайте
+словарь, значение которого для "class" является строкой (или регулярным
+выражением, или чем угодно еще), которую вы хотите найти::
+
+ soup.find_all("a", attrs={"class": "sister"})
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+.. _string:
+
+Аргумент ``string``
+^^^^^^^^^^^^^^^^^^^
+
+С помощью ``string`` вы можете искать строки вместо тегов. Как и в случае с
+``name`` и именованными аргументами, передаваться может `строка`_,
+`регулярное выражение`_, `список`_, `функция`_ или значение `True`_.
+Вот несколько примеров::
+
+ soup.find_all(string="Elsie")
+ # [u'Elsie']
+
+ soup.find_all(string=["Tillie", "Elsie", "Lacie"])
+ # [u'Elsie', u'Lacie', u'Tillie']
+
+ soup.find_all(string=re.compile("Dormouse"))
+ # [u"The Dormouse's story", u"The Dormouse's story"]
+
+ def is_the_only_string_within_a_tag(s):
+     """Return True if this string is the only child of its parent tag."""
+     return (s == s.parent.string)
+
+ soup.find_all(string=is_the_only_string_within_a_tag)
+ # [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...']
+
+Хотя значение типа ``string`` предназначено для поиска строк, вы можете комбинировать его с
+аргументами, которые находят теги: Beautiful Soup найдет все теги, в которых
+``.string`` соответствует вашему значению для ``string``.
Следующий код находит все теги <a>,
+у которых ``.string`` равно "Elsie"::
+
+ soup.find_all("a", string="Elsie")
+ # [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>]
+
+Аргумент ``string`` — это новое в Beautiful Soup 4.4.0. В ранних
+версиях он назывался ``text``::
+
+ soup.find_all("a", text="Elsie")
+ # [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>]
+
+.. _limit:
+
+Аргумент ``limit``
+^^^^^^^^^^^^^^^^^^
+
+``find_all()`` возвращает все теги и строки, которые соответствуют вашим
+фильтрам. Это может занять некоторое время, если документ большой. Если вам не
+нужны `все` результаты, вы можете указать их предельное число — ``limit``. Это
+работает так же, как ключевое слово LIMIT в SQL. Оно говорит Beautiful Soup
+прекратить собирать результаты после того, как их найдено определенное количество.
+
+В фрагменте из «Алисы в стране чудес» есть три ссылки, но следующий код
+находит только первые две::
+
+ soup.find_all("a", limit=2)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+.. _recursive:
+
+Аргумент ``recursive``
+^^^^^^^^^^^^^^^^^^^^^^
+
+Если вы вызовете ``mytag.find_all()``, Beautiful Soup проверит всех
+потомков ``mytag``: его дочерние элементы, дочерние элементы дочерних элементов, и
+так далее. Если вы хотите, чтобы Beautiful Soup рассматривал только непосредственных потомков (дочерние элементы),
+вы можете передать ``recursive=False``. Оцените разницу::
+
+ soup.html.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+ soup.html.find_all("title", recursive=False)
+ # []
+
+Вот эта часть документа::
+
+ <html>
+  <head>
+   <title>
+    The Dormouse's story
+   </title>
+  </head>
+ ...
+
+Тег <title> находится под тегом <html>, но не `непосредственно`
+под тегом <html>: на пути встречается тег <head>. Beautiful Soup
+находит тег <title>, когда разрешено просматривать всех потомков
+тега <html>, но когда ``recursive=False`` ограничивает поиск
+только непосредственно дочерними элементами, Beautiful Soup ничего не находит.
+
+Beautiful Soup предлагает множество методов поиска по дереву (они рассмотрены ниже),
+и они в основном принимают те же аргументы, что и ``find_all()``: ``name``,
+``attrs``, ``string``, ``limit`` и именованные аргументы. Но
+с аргументом ``recursive`` все иначе: ``find_all()`` и ``find()`` —
+это единственные методы, которые его поддерживают. От передачи ``recursive=False`` в
+метод типа ``find_parents()`` не очень много пользы.
+
+Вызов тега похож на вызов ``find_all()``
+----------------------------------------
+
+Поскольку ``find_all()`` является самым популярным методом в Beautiful
+Soup API, вы можете использовать сокращенную запись. Если относиться к
+объекту ``BeautifulSoup`` или объекту ``Tag`` так, будто это
+функция, то это похоже на вызов ``find_all()``
+с этим объектом. Эти две строки кода эквивалентны::
+
+ soup.find_all("a")
+ soup("a")
+
+Эти две строки также эквивалентны::
+
+ soup.title.find_all(string=True)
+ soup.title(string=True)
+
+``find()``
+----------
+
+Сигнатура: find(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive
+<recursive>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Метод ``find_all()`` сканирует весь документ в поиске
+всех результатов, но иногда вам нужен только один. Если вы знаете,
+что в документе есть только один тег <body>, нет смысла сканировать
+весь документ в поиске остальных.
Вместо того, чтобы передавать ``limit=1``
+каждый раз, когда вы вызываете ``find_all()``, используйте
+метод ``find()``. Эти две строки кода эквивалентны::
+
+ soup.find_all('title', limit=1)
+ # [<title>The Dormouse's story</title>]
+
+ soup.find('title')
+ # <title>The Dormouse's story</title>
+
+Разница лишь в том, что ``find_all()`` возвращает список, содержащий
+единственный результат, а ``find()`` возвращает только сам результат.
+
+Если ``find_all()`` не может ничего найти, он возвращает пустой список. Если
+``find()`` не может ничего найти, он возвращает ``None``::
+
+ print(soup.find("nosuchtag"))
+ # None
+
+Помните трюк с ``soup.head.title`` из раздела
+`Навигация с использованием имен тегов`_? Этот трюк работает на основе неоднократного вызова ``find()``::
+
+ soup.head.title
+ # <title>The Dormouse's story</title>
+
+ soup.find("head").find("title")
+ # <title>The Dormouse's story</title>
+
+``find_parents()`` и ``find_parent()``
+--------------------------------------
+
+Сигнатура: find_parents(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Сигнатура: find_parent(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Я долго объяснял, как работают ``find_all()`` и
+``find()``. Beautiful Soup API определяет десяток других методов для
+поиска по дереву, но пусть вас это не пугает. Пять из этих методов
+в целом похожи на ``find_all()``, а другие пять в целом
+похожи на ``find()``. Единственное различие в том, по каким частям
+дерева они ищут.
+
+Сначала давайте рассмотрим ``find_parents()`` и
+``find_parent()``. Помните, что ``find_all()`` и ``find()`` прорабатывают
+дерево сверху вниз, просматривая теги и их потомков. ``find_parents()`` и ``find_parent()``
+делают наоборот: они идут `снизу вверх`, рассматривая
+родительские элементы тега или строки. Давайте испытаем их, начав со строки,
+закопанной глубоко в фрагменте из «Алисы в стране чудес»::
+
+ a_string = soup.find(string="Lacie")
+ a_string
+ # u'Lacie'
+
+ a_string.find_parents("a")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+ a_string.find_parent("p")
+ # <p class="story">Once upon a time there were three little sisters; and their names were
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
+ # and they lived at the bottom of a well.</p>
+
+ a_string.find_parents("p", class_="title")
+ # []
+
+Один из трех тегов <a> является прямым родителем искомой строки,
+так что наш поиск находит его. Один из трех тегов <p> является
+непрямым родителем строки, и наш поиск тоже его
+находит. Где-то в документе есть тег <p> с классом CSS "title",
+но он не является родительским для строки, так что мы не можем найти
+его с помощью ``find_parents()``.
+
+Вы могли заметить связь между ``find_parent()``,
+``find_parents()`` и атрибутами `.parent`_ и `.parents`_,
+упомянутыми ранее. Связь очень сильная. Эти методы поиска
+на самом деле используют ``.parents``, чтобы перебрать все родительские элементы и проверить
+каждый из них на соответствие заданному фильтру.
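+
+Для наглядности: примерно так можно воспроизвести ``find_parent("p")`` вручную
+через генератор ``.parents`` (это лишь набросок, а не точная реализация
+библиотеки; имя ``my_find_parent`` условное)::
+
+ a_string = soup.find(string="Lacie")
+
+ def my_find_parent(element, tag_name):
+     # идем снизу вверх и возвращаем первого родителя с подходящим именем
+     for parent in element.parents:
+         if parent.name == tag_name:
+             return parent
+     return None
+
+ my_find_parent(a_string, "p") is a_string.find_parent("p")
+ # True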
+
+``find_next_siblings()`` и ``find_next_sibling()``
+--------------------------------------------------
+
+Сигнатура: find_next_siblings(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Сигнатура: find_next_sibling(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Эти методы используют :ref:`.next_siblings <sibling-generators>` для
+перебора одноуровневых элементов для данного элемента в дереве. Метод
+``find_next_siblings()`` возвращает все подходящие одноуровневые элементы,
+а ``find_next_sibling()`` возвращает только первый из них::
+
+ first_link = soup.a
+ first_link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ first_link.find_next_siblings("a")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ first_story_paragraph = soup.find("p", "story")
+ first_story_paragraph.find_next_sibling("p")
+ # <p class="story">...</p>
+
+``find_previous_siblings()`` и ``find_previous_sibling()``
+----------------------------------------------------------
+
+Сигнатура: find_previous_siblings(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Сигнатура: find_previous_sibling(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Эти методы используют :ref:`.previous_siblings <sibling-generators>` для перебора тех одноуровневых элементов,
+которые предшествуют данному элементу в дереве разбора. Метод ``find_previous_siblings()``
+возвращает все подходящие одноуровневые элементы, а
+``find_previous_sibling()`` возвращает только первый из них::
+
+ last_link = soup.find("a", id="link3")
+ last_link
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+ last_link.find_previous_siblings("a")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+ first_story_paragraph = soup.find("p", "story")
+ first_story_paragraph.find_previous_sibling("p")
+ # <p class="title"><b>The Dormouse's story</b></p>
+
+
+``find_all_next()`` и ``find_next()``
+-------------------------------------
+
+Сигнатура: find_all_next(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Сигнатура: find_next(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Эти методы используют :ref:`.next_elements <element-generators>` для
+перебора любых тегов и строк, которые встречаются в документе после
+элемента. Метод ``find_all_next()`` возвращает все совпадения, а
+``find_next()`` только первое::
+
+ first_link = soup.a
+ first_link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ first_link.find_all_next(string=True)
+ # [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
+ #  u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n']
+
+ first_link.find_next("p")
+ # <p class="story">...</p>
+
+В первом примере нашлась строка "Elsie", хотя она
+содержится в теге <a>, с которого мы начали. Во втором примере
+нашелся последний тег <p>, хотя он находится
+в другой части дерева, чем тег <a>, с которого мы начали.
Для этих
+методов имеет значение только то, что элемент соответствует фильтру и
+появляется в документе позже, чем тот элемент, с которого начали поиск.
+
+``find_all_previous()`` и ``find_previous()``
+---------------------------------------------
+
+Сигнатура: find_all_previous(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Сигнатура: find_previous(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+Эти методы используют :ref:`.previous_elements <element-generators>` для
+перебора любых тегов и строк, которые встречаются в документе до
+элемента. Метод ``find_all_previous()`` возвращает все совпадения, а
+``find_previous()`` только первое::
+
+ first_link = soup.a
+ first_link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ first_link.find_all_previous("p")
+ # [<p class="story">Once upon a time there were three little sisters; ...</p>,
+ #  <p class="title"><b>The Dormouse's story</b></p>]
+
+ first_link.find_previous("title")
+ # <title>The Dormouse's story</title>
+
+Вызов ``find_all_previous("p")`` нашел первый абзац в
+документе (тот, который с ``class="title"``), но он также находит
+второй абзац, а именно тег <p>, содержащий тег <a>, с которого мы
+начали. Это не так уж удивительно: мы смотрим на все теги,
+которые появляются в документе раньше, чем тот, с которого мы начали. Тег
+<p>, содержащий тег <a>, должен был появиться до тега <a>, который
+в нем содержится.
+
+Селекторы CSS
+-------------
+
+Начиная с версии 4.7.0, Beautiful Soup поддерживает большинство селекторов CSS4 благодаря
+проекту `SoupSieve
+<https://facelessuser.github.io/soupsieve/>`_. Если вы установили Beautiful Soup через ``pip``, одновременно должен был установиться SoupSieve,
+так что вам больше ничего не нужно делать.
+
+В ``BeautifulSoup`` есть метод ``.select()``, который использует SoupSieve, чтобы
+запустить селектор CSS и вернуть все
+подходящие элементы. ``Tag`` имеет похожий метод, который запускает селектор CSS
+в отношении содержимого одного тега.
+
+(В более ранних версиях Beautiful Soup тоже есть метод ``.select()``,
+но поддерживаются только наиболее часто используемые селекторы CSS.)
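+
+Во многих простых случаях ``select()`` и ``find_all()`` взаимозаменяемы.
+Небольшой набросок для сравнения (оба вызова возвращают одни и те же теги)::
+
+ soup.select("a.sister")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ soup.find_all("a", class_="sister")
+ # (тот же список тегов)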
+ +В `документации SoupSieve +<https://facelessuser.github.io/soupsieve/>`_ перечислены все +селекторы CSS, которые поддерживаются на данный момент, но вот некоторые из основных: + +Вы можете найти теги:: + + soup.select("title") + # [<title>The Dormouse's story</title>] + + soup.select("p:nth-of-type(3)") + # [<p class="story">...</p>] + +Найти теги под другими тегами:: + + soup.select("body a") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select("html head title") + # [<title>The Dormouse's story</title>] + +Найти теги `непосредственно` под другими тегами:: + + soup.select("head > title") + # [<title>The Dormouse's story</title>] + + soup.select("p > a") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select("p > a:nth-of-type(2)") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] + + soup.select("p > #link1") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] + + soup.select("body > a") + # [] + +Найти одноуровневые элементы тега:: + + soup.select("#link1 ~ .sister") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select("#link1 + .sister") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] + +Найти теги по классу CSS:: + + soup.select(".sister") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select("[class~=sister]") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +Найти теги по ID:: + + soup.select("#link1") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] + + soup.select("a#link2") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] + +Найти теги, которые соответствуют любому селектору из списка:: + + soup.select("#link1,#link2") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] + +Проверка на наличие атрибута:: + + soup.select('a[href]') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +Найти теги по значению атрибута:: + + soup.select('a[href="http://example.com/elsie"]') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] + + soup.select('a[href^="http://example.com/"]') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select('a[href$="tillie"]') + # [<a class="sister" 
href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select('a[href*=".com/el"]') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] + +Есть также метод ``select_one()``, который находит только +первый тег, соответствующий селектору:: + + soup.select_one(".sister") + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + +Если вы разобрали XML, в котором определены пространства имен, вы можете использовать их в +селекторах CSS:: + + from bs4 import BeautifulSoup + xml = """<tag xmlns:ns1="http://namespace1/" xmlns:ns2="http://namespace2/"> + <ns1:child>I'm in namespace 1</ns1:child> + <ns2:child>I'm in namespace 2</ns2:child> + </tag> """ + soup = BeautifulSoup(xml, "xml") + + soup.select("child") + # [<ns1:child>I'm in namespace 1</ns1:child>, <ns2:child>I'm in namespace 2</ns2:child>] + + soup.select("ns1|child", namespaces=namespaces) + # [<ns1:child>I'm in namespace 1</ns1:child>] + +При обработке селектора CSS, который использует пространства имен, Beautiful Soup +использует сокращения пространства имен, найденные при разборе +документа. Вы можете заменить сокращения своими собственными, передав словарь +сокращений:: + + namespaces = dict(first="http://namespace1/", second="http://namespace2/") + soup.select("second|child", namespaces=namespaces) + # [<ns1:child>I'm in namespace 2</ns1:child>] + +Все эти селекторы CSS удобны для тех, кто уже +знаком с синтаксисом селекторов CSS. Вы можете сделать все это с помощью +Beautiful Soup API. И если CSS селекторы — это все, что вам нужно, вам следует +использовать парсер lxml: так будет намного быстрее. Но вы можете +`комбинировать` селекторы CSS с Beautiful Soup API. + +Изменение дерева +================ + +Основная сила Beautiful Soup в поиске по дереву разбора, но вы +также можете изменить дерево и записать свои изменения в виде нового HTML или +XML-документа. + +Изменение имен тегов и атрибутов +-------------------------------- + +Я говорил об этом раньше, в разделе `Атрибуты`_, но это стоит повторить. Вы +можете переименовать тег, изменить значения его атрибутов, добавить новые +атрибуты и удалить атрибуты:: + + soup = BeautifulSoup('<b class="boldest">Extremely bold</b>') + tag = soup.b + + tag.name = "blockquote" + tag['class'] = 'verybold' + tag['id'] = 1 + tag + # <blockquote class="verybold" id="1">Extremely bold</blockquote> + + del tag['class'] + del tag['id'] + tag + # <blockquote>Extremely bold</blockquote> + +Изменение ``.string`` +--------------------- + +Если вы замените значение атрибута ``.string`` новой строкой, содержимое тега будет +заменено на эту строку:: + + markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' + soup = BeautifulSoup(markup) + + tag = soup.a + tag.string = "New link text." + tag + # <a href="http://example.com/">New link text.</a> + +Будьте осторожны: если тег содержит другие теги, они и все их +содержимое будет уничтожено. + +``append()`` +------------ + +Вы можете добавить содержимое тега с помощью ``Tag.append()``. 
``append()``
------------

You can add to a tag's contents with ``Tag.append()``. It works just
like calling ``.append()`` on a Python list::

    soup = BeautifulSoup("<a>Foo</a>")
    soup.a.append("Bar")

    soup
    # <html><head></head><body><a>FooBar</a></body></html>
    soup.a.contents
    # [u'Foo', u'Bar']

``extend()``
------------

Starting in Beautiful Soup 4.7.0, ``Tag`` also supports a method
called ``.extend()``, which works just like calling ``.extend()`` on a
Python list::

    soup = BeautifulSoup("<a>Soup</a>")
    soup.a.extend(["'s", " ", "on"])

    soup
    # <html><head></head><body><a>Soup's on</a></body></html>
    soup.a.contents
    # [u'Soup', u"'s", u' ', u'on']

``NavigableString()`` and ``.new_tag()``
----------------------------------------

If you need to add a string to a document, no problem: you can pass a
Python string in to ``append()``, or you can call the
``NavigableString`` constructor::

    from bs4 import BeautifulSoup, NavigableString
    soup = BeautifulSoup("<b></b>")
    tag = soup.b
    tag.append("Hello")
    new_string = NavigableString(" there")
    tag.append(new_string)
    tag
    # <b>Hello there</b>
    tag.contents
    # [u'Hello', u' there']

If you want to create a comment or some other subclass of
``NavigableString``, just call the constructor::

    from bs4 import Comment
    new_comment = Comment("Nice to see you.")
    tag.append(new_comment)
    tag
    # <b>Hello there<!--Nice to see you.--></b>
    tag.contents
    # [u'Hello', u' there', u'Nice to see you.']

(This is a new feature in Beautiful Soup 4.4.0.)

What if you need to create a whole new tag? The best solution is to
call the factory method ``BeautifulSoup.new_tag()``::

    soup = BeautifulSoup("<b></b>")
    original_tag = soup.b

    new_tag = soup.new_tag("a", href="http://www.example.com")
    original_tag.append(new_tag)
    original_tag
    # <b><a href="http://www.example.com"></a></b>

    new_tag.string = "Link text."
    original_tag
    # <b><a href="http://www.example.com">Link text.</a></b>

Only the first argument, the tag name, is required.
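One hedged note: keyword arguments only work for attribute names that
are valid Python identifiers. For names like ``class`` or ``data-*``,
recent versions of ``new_tag()`` also accept an ``attrs`` dictionary;
a sketch::

    soup = BeautifulSoup("<b></b>")
    new_tag = soup.new_tag("div", attrs={"class": "boldest", "data-id": "1"})
    soup.b.append(new_tag)
    soup.b
    # <b><div class="boldest" data-id="1"></div></b>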
``insert()``
------------

``Tag.insert()`` is just like ``Tag.append()``, except the new element
doesn't necessarily go at the end of its parent's ``.contents``. It
gets inserted at whatever numeric position you say, just like
``.insert()`` on a Python list::

    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    soup = BeautifulSoup(markup)
    tag = soup.a

    tag.insert(1, "but did not endorse ")
    tag
    # <a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>
    tag.contents
    # [u'I linked to ', u'but did not endorse ', <i>example.com</i>]

``insert_before()`` and ``insert_after()``
------------------------------------------

The ``insert_before()`` method inserts tags or strings immediately
before something else in the parse tree::

    soup = BeautifulSoup("<b>stop</b>")
    tag = soup.new_tag("i")
    tag.string = "Don't"
    soup.b.string.insert_before(tag)
    soup.b
    # <b><i>Don't</i>stop</b>

The ``insert_after()`` method inserts tags or strings immediately
after something else in the parse tree::

    div = soup.new_tag('div')
    div.string = 'ever'
    soup.b.i.insert_after(" you ", div)
    soup.b
    # <b><i>Don't</i> you <div>ever</div> stop</b>
    soup.b.contents
    # [<i>Don't</i>, u' you ', <div>ever</div>, u'stop']

``clear()``
-----------

``Tag.clear()`` removes the contents of a tag::

    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    soup = BeautifulSoup(markup)
    tag = soup.a

    tag.clear()
    tag
    # <a href="http://example.com/"></a>

``extract()``
-------------

``PageElement.extract()`` removes a tag or string from the tree. It
returns the tag or string that was extracted::

    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    soup = BeautifulSoup(markup)
    a_tag = soup.a

    i_tag = soup.i.extract()

    a_tag
    # <a href="http://example.com/">I linked to</a>

    i_tag
    # <i>example.com</i>

    print(i_tag.parent)
    # None

At this point you effectively have two parse trees: one rooted at the
``BeautifulSoup`` object you used to parse the document, and one
rooted at the tag that was extracted. You can go on to call
``extract`` on a child of the element you extracted::

    my_string = i_tag.string.extract()
    my_string
    # u'example.com'

    print(my_string.parent)
    # None
    i_tag
    # <i></i>


``decompose()``
---------------

``Tag.decompose()`` removes a tag from the tree, then `completely
destroys it and its contents`::

    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    soup = BeautifulSoup(markup)
    a_tag = soup.a

    soup.i.decompose()

    a_tag
    # <a href="http://example.com/">I linked to</a>


.. _replace_with():

``replace_with()``
------------------

``PageElement.replace_with()`` removes a tag or string from the tree,
and replaces it with the tag or string of your choice::

    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    soup = BeautifulSoup(markup)
    a_tag = soup.a

    new_tag = soup.new_tag("b")
    new_tag.string = "example.net"
    a_tag.i.replace_with(new_tag)

    a_tag
    # <a href="http://example.com/">I linked to <b>example.net</b></a>

``replace_with()`` returns the tag or string that was replaced, so
that you can examine it or add it back to another part of the tree.

``wrap()``
----------

``PageElement.wrap()`` wraps an element in the tag you specify. It
returns the new wrapper::

    soup = BeautifulSoup("<p>I wish I was bold.</p>")
    soup.p.string.wrap(soup.new_tag("b"))
    # <b>I wish I was bold.</b>

    soup.p.wrap(soup.new_tag("div"))
    # <div><p><b>I wish I was bold.</b></p></div>

This method is new in Beautiful Soup 4.0.5.
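Putting a couple of these methods together, a common clean-up pattern
(a sketch, not one of the original examples) is to find every tag of
an unwanted type and ``decompose()`` it::

    soup = BeautifulSoup("<p>Keep me</p><script>alert('drop me')</script>", "html.parser")
    for script in soup.find_all("script"):
        script.decompose()
    print(soup)
    # <p>Keep me</p>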
``unwrap()``
------------

``Tag.unwrap()`` is the opposite of ``wrap()``. It replaces a tag with
whatever's inside that tag. It's good for stripping out markup::

    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    soup = BeautifulSoup(markup)
    a_tag = soup.a

    a_tag.i.unwrap()
    a_tag
    # <a href="http://example.com/">I linked to example.com</a>

Like ``replace_with()``, ``unwrap()`` returns the tag that was
replaced.

``smooth()``
------------

After calling a bunch of methods that modify the parse tree, you may
end up with two or more ``NavigableString`` objects next to each
other. Beautiful Soup doesn't have any problems with this, but since
it can't happen in a freshly parsed document, you might not expect
behavior like the following::

    soup = BeautifulSoup("<p>A one</p>")
    soup.p.append(", a two")

    soup.p.contents
    # [u'A one', u', a two']

    print(soup.p.encode())
    # <p>A one, a two</p>

    print(soup.p.prettify())
    # <p>
    #  A one
    #  , a two
    # </p>

You can call ``Tag.smooth()`` to clean up the parse tree by
consolidating adjacent strings::

    soup.smooth()

    soup.p.contents
    # [u'A one, a two']

    print(soup.p.prettify())
    # <p>
    #  A one, a two
    # </p>

``smooth()`` is a new method in Beautiful Soup 4.8.0.

Output
======

.. _.prettyprinting:

Pretty-printing
---------------

The ``prettify()`` method will turn a Beautiful Soup parse tree into a
nicely formatted Unicode string, with a separate line for each tag and
each string::

    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    soup = BeautifulSoup(markup)
    soup.prettify()
    # '<html>\n <head>\n </head>\n <body>\n  <a href="http://example.com/">\n...'

    print(soup.prettify())
    # <html>
    #  <head>
    #  </head>
    #  <body>
    #   <a href="http://example.com/">
    #    I linked to
    #    <i>
    #     example.com
    #    </i>
    #   </a>
    #  </body>
    # </html>

You can call ``prettify()`` on the top-level ``BeautifulSoup`` object,
or on any of its ``Tag`` objects::

    print(soup.a.prettify())
    # <a href="http://example.com/">
    #  I linked to
    #  <i>
    #   example.com
    #  </i>
    # </a>

Non-pretty printing
-------------------

If you just want a string, with no fancy formatting, you can call
``unicode()`` or ``str()`` on a ``BeautifulSoup`` object, or on a
``Tag`` within it::

    str(soup)
    # '<html><head></head><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>'

    unicode(soup.a)
    # u'<a href="http://example.com/">I linked to <i>example.com</i></a>'

The ``str()`` function returns a string encoded in UTF-8. See
`Encodings`_ for more details.

You can also call ``encode()`` to get a bytestring, and ``decode()``
to get Unicode.
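For example, a quick sketch with the soup from above::

    soup.a.encode()
    # '<a href="http://example.com/">I linked to <i>example.com</i></a>'

    soup.a.decode()
    # u'<a href="http://example.com/">I linked to <i>example.com</i></a>'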
.. _output_formatters:

Output formatters
-----------------

If you give Beautiful Soup a document that contains HTML entities like
"&ldquo;", they'll be converted to Unicode characters::

    soup = BeautifulSoup("&ldquo;Dammit!&rdquo; he said.")
    unicode(soup)
    # u'<html><head></head><body>\u201cDammit!\u201d he said.</body></html>'

If you then convert the document to a string, the Unicode characters
will be encoded as UTF-8. You won't get the HTML entities back::

    str(soup)
    # '<html><head></head><body>\xe2\x80\x9cDammit!\xe2\x80\x9d he said.</body></html>'

By default, the only characters that are escaped upon output are bare
ampersands and angle brackets. These get turned into "&amp;", "&lt;",
and "&gt;", so that Beautiful Soup doesn't inadvertently generate
invalid HTML or XML::

    soup = BeautifulSoup("<p>The law firm of Dewey, Cheatem, & Howe</p>")
    soup.p
    # <p>The law firm of Dewey, Cheatem, &amp; Howe</p>

    soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
    soup.a
    # <a href="http://example.com/?foo=val1&amp;bar=val2">A link</a>

You can change this behavior by providing a value for the
``formatter`` argument to ``prettify()``, ``encode()``, or
``decode()``. Beautiful Soup recognizes five possible values for
``formatter``.

The default is ``formatter="minimal"``. Strings will only be processed
enough to ensure that Beautiful Soup generates valid HTML/XML::

    french = "<p>Il a dit &lt;&lt;Sacré bleu!&gt;&gt;</p>"
    soup = BeautifulSoup(french)
    print(soup.prettify(formatter="minimal"))
    # <html>
    #  <body>
    #   <p>
    #    Il a dit &lt;&lt;Sacré bleu!&gt;&gt;
    #   </p>
    #  </body>
    # </html>

If you pass in ``formatter="html"``, Beautiful Soup will convert
Unicode characters to HTML entities whenever possible::

    print(soup.prettify(formatter="html"))
    # <html>
    #  <body>
    #   <p>
    #    Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
    #   </p>
    #  </body>
    # </html>

If you pass in ``formatter="html5"``, it's the same as
``formatter="html"``, but Beautiful Soup will omit the closing slash
in HTML void tags like "br"::

    br_soup = BeautifulSoup("<br>")

    print(br_soup.encode(formatter="html"))
    # <html><body><br/></body></html>

    print(br_soup.encode(formatter="html5"))
    # <html><body><br></body></html>

If you pass in ``formatter=None``, Beautiful Soup will not modify
strings at all on output. This is the fastest option, but it may lead
to Beautiful Soup generating invalid HTML/XML::

    print(soup.prettify(formatter=None))
    # <html>
    #  <body>
    #   <p>
    #    Il a dit <<Sacré bleu!>>
    #   </p>
    #  </body>
    # </html>

    link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
    print(link_soup.a.encode(formatter=None))
    # <a href="http://example.com/?foo=val1&bar=val2">A link</a>

If you need more sophisticated control over your output, you can use
Beautiful Soup's ``Formatter`` class. Here's how you might convert
strings to uppercase, whether they occur in a text node or in an
attribute value::

    from bs4.formatter import HTMLFormatter
    def uppercase(str):
        return str.upper()
    formatter = HTMLFormatter(uppercase)

    print(soup.prettify(formatter=formatter))
    # <html>
    #  <body>
    #   <p>
    #    IL A DIT <<SACRÉ BLEU!>>
    #   </p>
    #  </body>
    # </html>

    print(link_soup.a.prettify(formatter=formatter))
    # <a href="HTTP://EXAMPLE.COM/?FOO=VAL1&BAR=VAL2">
    #  A LINK
    # </a>

Subclassing ``HTMLFormatter`` or ``XMLFormatter`` gives you even more
control over the output. For example, Beautiful Soup sorts the
attributes in every tag by default::

    attr_soup = BeautifulSoup(b'<p z="1" m="2" a="3"></p>')
    print(attr_soup.p.encode())
    # <p a="3" m="2" z="1"></p>
To turn this off, you can subclass the ``Formatter.attributes()``
method, which controls which attributes are output and in what
order. This implementation also filters out the attribute called "m"
wherever it appears::

    class UnsortedAttributes(HTMLFormatter):
        def attributes(self, tag):
            for k, v in tag.attrs.items():
                if k == 'm':
                    continue
                yield k, v
    print(attr_soup.p.encode(formatter=UnsortedAttributes()))
    # <p z="1" a="3"></p>

One last caveat: if you create a ``CData`` object, the text inside
that object is always presented `exactly as it appears, with no
formatting`. Beautiful Soup will still call your entity substitution
function, just in case you've written a custom function that counts
all the strings in the document or something, but it will ignore the
return value::

    from bs4.element import CData
    soup = BeautifulSoup("<a></a>")
    soup.a.string = CData("one < three")
    print(soup.a.prettify(formatter="xml"))
    # <a>
    #  <![CDATA[one < three]]>
    # </a>


``get_text()``
--------------

If you only want the human-readable text inside a document or tag, you
can use the ``get_text()`` method. It returns all the text in a
document or beneath a tag, as a single Unicode string::

    markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
    soup = BeautifulSoup(markup)

    soup.get_text()
    u'\nI linked to example.com\n'
    soup.i.get_text()
    u'example.com'

You can specify a string to be used to join the bits of text together::

    soup.get_text("|")
    u'\nI linked to |example.com|\n'

You can tell Beautiful Soup to strip whitespace from the beginning and
end of each bit of text::

    soup.get_text("|", strip=True)
    u'I linked to|example.com'

But at that point you might want to use the
:ref:`.stripped_strings <string-generators>` generator instead, and
process the text yourself::

    [text for text in soup.stripped_strings]
    # [u'I linked to', u'example.com']

Specifying the parser to use
============================

If you just need to parse some HTML, you can dump the markup into the
``BeautifulSoup`` constructor, and it'll probably be fine. Beautiful
Soup will pick a parser for you and parse the data. But there are a
few additional arguments you can pass in to the constructor to change
which parser is used.

The first argument to the ``BeautifulSoup`` constructor is a string or
an open filehandle: the markup you want parsed. The second argument is
`how` you'd like the markup parsed.

If you don't specify anything, you'll get the best HTML parser that's
installed. Beautiful Soup ranks lxml's parser as being the best, then
html5lib's, then Python's built-in parser. You can override this by
specifying one of the following:

* What type of markup you want to parse. Currently supported values
  are "html", "xml", and "html5".

* The name of the parser library you want to use. Currently supported
  options are "lxml", "html5lib", and "html.parser" (Python's built-in
  HTML parser).

The section `Установка парсера`_ (Installing a parser) contrasts the
supported parsers.

If you don't have an appropriate parser installed, Beautiful Soup will
ignore your request and pick a different parser. Right now, the only
supported XML parser is lxml. If you don't have lxml installed, asking
for an XML parser won't give you one, and asking for "lxml" won't work
either.
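To make the two styles concrete, a hedged sketch (it assumes lxml is
installed for the "xml" line)::

    markup = "<a><b /></a>"

    BeautifulSoup(markup, "html.parser")   # name a specific parser library
    BeautifulSoup(markup, "xml")           # or name the type of markup instead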
Differences between parsers
---------------------------

Beautiful Soup presents the same interface to a number of different
parsers, but each parser is different. Different parsers will create
different parse trees from the same document. The biggest differences
are between the HTML parsers and the XML parsers. Here's a short
document, parsed as HTML::

    BeautifulSoup("<a><b /></a>")
    # <html><head></head><body><a><b></b></a></body></html>

Since an empty <b /> tag is not valid HTML, the parser turns it into a
<b></b> tag pair.

Here's the same document parsed as XML (running this requires that you
have lxml installed). Note that the empty <b /> tag is left alone, and
that the document is given an XML declaration instead of being put
into an <html> tag::

    BeautifulSoup("<a><b /></a>", "xml")
    # <?xml version="1.0" encoding="utf-8"?>
    # <a><b/></a>

There are also differences between HTML parsers. If you give Beautiful
Soup a perfectly-formed HTML document, these differences won't
matter. One parser will be faster than another, but they'll all give
you a data structure that looks exactly like the original HTML
document.

But if the document is not perfectly-formed, different parsers will
give different results. Here's a short, invalid document parsed using
lxml's HTML parser. Note that the dangling </p> tag is simply
ignored::

    BeautifulSoup("<a></p>", "lxml")
    # <html><body><a></a></body></html>

Here's the same document parsed using html5lib::

    BeautifulSoup("<a></p>", "html5lib")
    # <html><head></head><body><a><p></p></a></body></html>

Instead of ignoring the dangling </p> tag, html5lib pairs it with an
opening <p> tag. This parser also adds an empty <head> tag to the
document.

Here's the same document parsed with Python's built-in HTML parser::

    BeautifulSoup("<a></p>", "html.parser")
    # <a></a>

Like lxml, this parser ignores the closing </p> tag. Unlike html5lib,
this parser makes no attempt to create a well-formed HTML document by
adding a <body> tag. Unlike lxml, it doesn't even bother to add an
<html> tag.

Since the document "<a></p>" is invalid, none of these techniques is
the "correct" way to handle it. The html5lib parser uses techniques
that are part of the HTML5 standard, so it has the best claim on being
the "correct" way, but all three techniques are legitimate.

Differences between parsers can affect your script. If you're planning
on distributing your script to other people, or running it on multiple
machines, you should specify a parser in the ``BeautifulSoup``
constructor. That will reduce the chances that your users parse a
document differently from the way you parse it.
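If you're not sure which parsers are installed on a given machine, a
little sketch like this one (assuming all three libraries are
available) makes the differences easy to see::

    for parser in ["lxml", "html5lib", "html.parser"]:
        print(parser, BeautifulSoup("<a></p>", parser))
    # lxml <html><body><a></a></body></html>
    # html5lib <html><head></head><body><a><p></p></a></body></html>
    # html.parser <a></a>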
Encodings
=========

Any HTML or XML document is written in a specific encoding like ASCII
or UTF-8. But when you load that document into Beautiful Soup, you'll
discover it's been converted to Unicode::

    markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
    soup = BeautifulSoup(markup)
    soup.h1
    # <h1>Sacré bleu!</h1>
    soup.h1.string
    # u'Sacr\xe9 bleu!'

It's not magic. (Though that sure would be nice.) Beautiful Soup uses
a sub-library called `Unicode, Dammit`_ to detect a document's
encoding and convert it to Unicode. The autodetected encoding is
available as the ``.original_encoding`` attribute of the
``BeautifulSoup`` object::

    soup.original_encoding
    'utf-8'

Unicode, Dammit guesses correctly most of the time, but sometimes it
makes mistakes. Sometimes it guesses correctly, but only after a
byte-by-byte search of the document that takes a very long time. If
you happen to know a document's encoding ahead of time, you can avoid
mistakes and delays by passing it to the ``BeautifulSoup`` constructor
as the ``from_encoding`` argument.

Here's a document written in ISO-8859-8. The document is so short that
Unicode, Dammit can't get a lock on it, and misidentifies the encoding
as ISO-8859-7::

    markup = b"<h1>\xed\xe5\xec\xf9</h1>"
    soup = BeautifulSoup(markup)
    soup.h1
    <h1>νεμω</h1>
    soup.original_encoding
    'ISO-8859-7'

We can fix this by passing in the correct ``from_encoding``::

    soup = BeautifulSoup(markup, from_encoding="iso-8859-8")
    soup.h1
    <h1>םולש</h1>
    soup.original_encoding
    'iso8859-8'

If you don't know what the correct encoding is, but you can see that
Unicode, Dammit is guessing wrong, you can pass the wrong guesses in
as ``exclude_encodings``::

    soup = BeautifulSoup(markup, exclude_encodings=["ISO-8859-7"])
    soup.h1
    <h1>םולש</h1>
    soup.original_encoding
    'WINDOWS-1255'

Windows-1255 isn't 100% correct, but that encoding is a compatible
superset of ISO-8859-8, so the guess is close enough.
(``exclude_encodings`` is a new feature in Beautiful Soup 4.4.0.)

In rare cases (usually when a UTF-8 document contains text written in
a completely different encoding), the only way to get Unicode may be
to replace some characters with the special Unicode character
"REPLACEMENT CHARACTER" (U+FFFD, �). If Unicode, Dammit needs to do
this, it will set the ``.contains_replacement_characters`` attribute
to ``True`` on the ``UnicodeDammit`` or ``BeautifulSoup`` object. This
lets you know that the Unicode representation is not an exact
representation of the original, and that some data was lost. If a
document contains �, but ``.contains_replacement_characters`` is
``False``, you'll know that the � was there originally (as it is in
this paragraph) and doesn't stand in for missing data.

Output encoding
---------------

When you write out a document from Beautiful Soup, you get a UTF-8
document, even if the document wasn't in UTF-8 to begin with. Here's a
document written in the Latin-1 encoding::

    markup = b'''
     <html>
      <head>
       <meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
      </head>
      <body>
       <p>Sacr\xe9 bleu!</p>
      </body>
     </html>
    '''

    soup = BeautifulSoup(markup)
    print(soup.prettify())
    # <html>
    #  <head>
    #   <meta content="text/html; charset=utf-8" http-equiv="Content-type" />
    #  </head>
    #  <body>
    #   <p>
    #    Sacré bleu!
    #   </p>
    #  </body>
    # </html>

Note that the <meta> tag has been rewritten to reflect the fact that
the document is now in UTF-8.

If you don't want UTF-8, you can pass another encoding into
``prettify()``::

    print(soup.prettify("latin-1"))
    # <html>
    #  <head>
    #   <meta content="text/html; charset=latin-1" http-equiv="Content-type" />
    # ...

You can also call ``encode()`` on the ``BeautifulSoup`` object, or any
element in the soup, just as if it were a Python string::

    soup.p.encode("latin-1")
    # '<p>Sacr\xe9 bleu!</p>'

    soup.p.encode("utf-8")
    # '<p>Sacr\xc3\xa9 bleu!</p>'

Any characters that can't be represented in your chosen encoding will
be converted into numeric XML entity references.
Here's a document that includes the Unicode character SNOWMAN::

    markup = u"<b>\N{SNOWMAN}</b>"
    snowman_soup = BeautifulSoup(markup)
    tag = snowman_soup.b

The SNOWMAN character can be part of a UTF-8 document (it looks like
☃), but there's no representation for that character in ISO-Latin-1 or
ASCII, so it's converted into "&#9731;" for those encodings::

    print(tag.encode("utf-8"))
    # <b>☃</b>

    print(tag.encode("latin-1"))
    # <b>&#9731;</b>

    print(tag.encode("ascii"))
    # <b>&#9731;</b>

Unicode, Dammit
---------------

You can use Unicode, Dammit without using Beautiful Soup. It's useful
whenever you have data in an unknown encoding and you just want it to
become Unicode::

    from bs4 import UnicodeDammit
    dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
    print(dammit.unicode_markup)
    # Sacré bleu!
    dammit.original_encoding
    # 'utf-8'

Unicode, Dammit's guesses will get a lot more accurate if you install
the ``chardet`` or ``cchardet`` Python libraries. The more data you
give Unicode, Dammit, the more accurately it will guess. If you have
your own suspicions as to what the encoding might be, you can pass
them in as a list::

    dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
    print(dammit.unicode_markup)
    # Sacré bleu!
    dammit.original_encoding
    # 'latin-1'

Unicode, Dammit has two special features that Beautiful Soup doesn't
use.

Smart quotes
^^^^^^^^^^^^

You can use Unicode, Dammit to convert Microsoft smart quotes to HTML
or XML entities::

    markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"

    UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
    # u'<p>I just &ldquo;love&rdquo; Microsoft Word&rsquo;s smart quotes</p>'

    UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup
    # u'<p>I just &#x201C;love&#x201D; Microsoft Word&#x2019;s smart quotes</p>'

You can also convert Microsoft smart quotes to plain ASCII quotes::

    UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii").unicode_markup
    # u'<p>I just "love" Microsoft Word\'s smart quotes</p>'

Hopefully you'll find this feature useful, but Beautiful Soup doesn't
use it. Beautiful Soup prefers the default behavior, which is to
convert Microsoft smart quotes to Unicode characters along with
everything else::

    UnicodeDammit(markup, ["windows-1252"]).unicode_markup
    # u'<p>I just \u201clove\u201d Microsoft Word\u2019s smart quotes</p>'

Inconsistent encodings
^^^^^^^^^^^^^^^^^^^^^^

Sometimes a document is mostly in UTF-8, but contains Windows-1252
characters such as (again) Microsoft smart quotes. This can happen
when a website includes data from multiple sources. You can use
``UnicodeDammit.detwingle()`` to turn such a document into pure
UTF-8. Here's a simple example::

    snowmen = (u"\N{SNOWMAN}" * 3)
    quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
    doc = snowmen.encode("utf8") + quote.encode("windows_1252")

This document is a mess. The snowmen are in UTF-8 and the quotes are
in Windows-1252. You can display the snowmen or the quotes, but not
both::

    print(doc)
    # ☃☃☃�I like snowmen!�

    print(doc.decode("windows-1252"))
    # â˜ƒâ˜ƒâ˜ƒ“I like snowmen!”

Decoding the document as UTF-8 raises a ``UnicodeDecodeError``, and
decoding it as Windows-1252 gives you gibberish.
Fortunately, ``UnicodeDammit.detwingle()`` will convert the string to
pure UTF-8, allowing you to decode it to Unicode and display the
snowmen and the quotes simultaneously::

    new_doc = UnicodeDammit.detwingle(doc)
    print(new_doc.decode("utf8"))
    # ☃☃☃“I like snowmen!”

``UnicodeDammit.detwingle()`` only knows how to handle Windows-1252
embedded in UTF-8 (and vice versa, I suppose), but this is the most
common case.

Note that you must call ``UnicodeDammit.detwingle()`` on your data
before passing it into the ``BeautifulSoup`` or ``UnicodeDammit``
constructor. Beautiful Soup assumes that a document has a single
encoding, whatever it might be. If you pass it a document that
contains both UTF-8 and Windows-1252, it's likely to think the whole
document is Windows-1252, and the document will come out looking like
``â˜ƒâ˜ƒâ˜ƒ“I like snowmen!”``.

``UnicodeDammit.detwingle()`` is new in Beautiful Soup 4.1.0.

Line numbers
============

The ``html.parser`` and ``html5lib`` parsers can keep track of where
in the original document each tag was found. You can access this
information through ``Tag.sourceline`` (line number) and
``Tag.sourcepos`` (position of the start tag within a line)::

    markup = "<p\n>Paragraph 1</p>\n    <p>Paragraph 2</p>"
    soup = BeautifulSoup(markup, 'html.parser')
    for tag in soup.find_all('p'):
        print(tag.sourceline, tag.sourcepos, tag.string)
    # (1, 0, u'Paragraph 1')
    # (3, 4, u'Paragraph 2')

Note that the two parsers understand ``sourceline`` and ``sourcepos``
slightly differently. For html.parser, these numbers represent the
position of the initial less-than sign. For html5lib, they represent
the position of the final greater-than sign::

    soup = BeautifulSoup(markup, 'html5lib')
    for tag in soup.find_all('p'):
        print(tag.sourceline, tag.sourcepos, tag.string)
    # (2, 1, u'Paragraph 1')
    # (3, 7, u'Paragraph 2')

You can turn this feature off by passing ``store_line_numbers=False``
into the ``BeautifulSoup`` constructor::

    markup = "<p\n>Paragraph 1</p>\n    <p>Paragraph 2</p>"
    soup = BeautifulSoup(markup, 'html.parser', store_line_numbers=False)
    soup.p.sourceline
    # None

This feature is new in 4.8.1, and the parsers based on lxml don't
support it.

Comparing objects for equality
==============================

Beautiful Soup says that two ``NavigableString`` or ``Tag`` objects
are equal when they represent the same HTML or XML markup.
In this example, the two <b> tags are treated as equal, even though
they live in different parts of the object tree, because they both
look like "``<b>pizza</b>``"::

    markup = "<p>I want <b>pizza</b> and more <b>pizza</b>!</p>"
    soup = BeautifulSoup(markup, 'html.parser')
    first_b, second_b = soup.find_all('b')
    print(first_b == second_b)
    # True

    print(first_b.previous_element == second_b.previous_element)
    # False

If you want to see whether two variables refer to exactly the same
object, use `is`::

    print(first_b is second_b)
    # False

Copying Beautiful Soup objects
==============================

You can use ``copy.copy()`` to create a copy of any ``Tag`` or
``NavigableString``::

    import copy
    p_copy = copy.copy(soup.p)
    print(p_copy)
    # <p>I want <b>pizza</b> and more <b>pizza</b>!</p>

The copy is considered equal to the original, since it represents the
same markup as the original, but it's not the same object::

    print(soup.p == p_copy)
    # True

    print(soup.p is p_copy)
    # False

The only real difference is that the copy is completely detached from
the original Beautiful Soup object tree, just as if ``extract()`` had
been called on it::

    print(p_copy.parent)
    # None

This is because two different ``Tag`` objects can't occupy the same
space at the same time.
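And because the copy is detached, a quick sketch: modifying it doesn't
touch the original tree::

    p_copy.append(" (copied)")
    print(p_copy)
    # <p>I want <b>pizza</b> and more <b>pizza</b>! (copied)</p>
    print(soup.p)
    # <p>I want <b>pizza</b> and more <b>pizza</b>!</p>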
Parsing only part of a document
===============================

Let's say you want to use Beautiful Soup to look at a document's <a>
tags. It's a waste of time and memory to parse the entire document and
then go over it again looking for <a> tags. It's much faster to ignore
everything that isn't an <a> tag in the first place. The
``SoupStrainer`` class lets you choose which parts of an incoming
document are parsed. You just create a ``SoupStrainer`` and pass it in
to the ``BeautifulSoup`` constructor as the ``parse_only`` argument.

(Note that *this feature won't work if you're using the html5lib
parser*. If you use html5lib, the whole document will be parsed, no
matter what. This is because html5lib constantly rearranges the parse
tree as it works, and if some portion of the document didn't actually
make it into the parse tree, it would crash. To avoid confusion, in
the examples below I'll force Beautiful Soup to use Python's built-in
parser.)

``SoupStrainer``
----------------

The ``SoupStrainer`` class takes the same arguments as a typical
method from `Поиск по дереву`_ (Searching the tree): :ref:`name
<name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, and
:ref:`**kwargs <kwargs>`. Here are three ``SoupStrainer`` objects::

    from bs4 import SoupStrainer

    only_a_tags = SoupStrainer("a")

    only_tags_with_id_link2 = SoupStrainer(id="link2")

    def is_short_string(string):
        return len(string) < 10

    only_short_strings = SoupStrainer(string=is_short_string)

Let's bring back the "three sisters" document one more time, and see
what it looks like when it's parsed with these three ``SoupStrainer``
objects::

    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>

    <p class="story">...</p>
    """

    print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())
    # <a class="sister" href="http://example.com/elsie" id="link1">
    #  Elsie
    # </a>
    # <a class="sister" href="http://example.com/lacie" id="link2">
    #  Lacie
    # </a>
    # <a class="sister" href="http://example.com/tillie" id="link3">
    #  Tillie
    # </a>

    print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())
    # <a class="sister" href="http://example.com/lacie" id="link2">
    #  Lacie
    # </a>

    print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
    # Elsie
    # ,
    # Lacie
    # and
    # Tillie
    # ...
    #

You can also pass a ``SoupStrainer`` into any of the methods covered
in `Поиск по дереву`_ (Searching the tree). This probably isn't
terribly useful, but I thought I'd mention it::

    soup = BeautifulSoup(html_doc)
    soup.find_all(only_short_strings)
    # [u'\n\n', u'\n\n', u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
    #  u'\n\n', u'...', u'\n']
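A ``SoupStrainer`` can also combine those arguments, just like
``find_all()``. Here's a sketch that only parses <a> tags whose
``href`` matches a regular expression::

    import re

    only_lacie_links = SoupStrainer("a", href=re.compile("lacie"))
    print(BeautifulSoup(html_doc, "html.parser", parse_only=only_lacie_links).prettify())
    # <a class="sister" href="http://example.com/lacie" id="link2">
    #  Lacie
    # </a>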
Troubleshooting
===============

.. _diagnose:

``diagnose()``
--------------

If you're having trouble understanding what Beautiful Soup is doing to
a document, pass the document into the ``diagnose()`` function. (New
in Beautiful Soup 4.2.0.) Beautiful Soup will print out a report
showing you how different parsers handle the document, and tell you if
you're missing a parser that Beautiful Soup could be using::

    from bs4.diagnose import diagnose
    with open("bad.html") as fp:
        data = fp.read()
    diagnose(data)

    # Diagnostic running on Beautiful Soup 4.2.0
    # Python version 2.7.3 (default, Aug 1 2012, 05:16:07)
    # I noticed that html5lib is not installed. Installing it may help.
    # Found lxml version 2.3.2.0
    #
    # Trying to parse your data with html.parser
    # Here's what html.parser did with the document:
    # ...

Just looking at the output of ``diagnose()`` may show you how to solve
the problem. Even if it doesn't, you can paste the output of
``diagnose()`` when asking for help.

Errors when parsing a document
------------------------------

There are two kinds of parse errors. There are crashes, where you feed
a document to Beautiful Soup and it raises an exception, usually an
``HTMLParser.HTMLParseError``. And there is unexpected behavior, where
a Beautiful Soup parse tree looks very different from the document
used to create it.

Almost none of these problems turn out to be problems with Beautiful
Soup itself. That's not because Beautiful Soup is an amazingly
well-written piece of software. It's because Beautiful Soup doesn't
include any parsing code of its own. It relies on external parsers. If
one parser isn't working on a certain document, the best solution is
to try a different parser. See `Установка парсера`_ (Installing a
parser) for details and a parser comparison table.

The most common parse errors are ``HTMLParser.HTMLParseError:
malformed start tag`` and ``HTMLParser.HTMLParseError: bad end
tag``. They're both generated by Python's built-in HTML parser, and
the solution is to :ref:`install lxml or html5lib
<parser-installation>`.

The most common type of unexpected behavior is that you can't find a
tag that you know is in the document. You saw it going in, but
``find_all()`` returns ``[]`` or ``find()`` returns ``None``. This is
another common problem with Python's built-in HTML parser, which
sometimes skips tags it doesn't understand. Again, the solution is to
:ref:`install lxml or html5lib <parser-installation>`.

Version mismatch problems
-------------------------

* ``SyntaxError: Invalid syntax`` (on the line ``ROOT_TAG_NAME =
  u'[document]'``) - Caused by running the Python 2 version of
  Beautiful Soup under Python 3, without converting the code.

* ``ImportError: No module named HTMLParser`` - Caused by running the
  Python 2 version of Beautiful Soup under Python 3.

* ``ImportError: No module named html.parser`` - Caused by running the
  Python 3 version of Beautiful Soup under Python 2.

* ``ImportError: No module named BeautifulSoup`` - Caused by running
  Beautiful Soup 3 code on a system that doesn't have BS3
  installed. Or by writing Beautiful Soup 4 code without knowing that
  the package name has changed to ``bs4``.

* ``ImportError: No module named bs4`` - Caused by running Beautiful
  Soup 4 code on a system that doesn't have BS4 installed.

.. _parsing-xml:

Parsing XML
-----------

By default, Beautiful Soup parses documents as HTML. To parse a
document as XML, pass in "xml" as the second argument to the
``BeautifulSoup`` constructor::

    soup = BeautifulSoup(markup, "xml")

You'll also need to :ref:`have lxml installed <parser-installation>`.
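One practical side effect, mentioned again below: unlike the HTML
parsers, the XML parser preserves the case of tag and attribute
names. A small sketch::

    BeautifulSoup("<TAG>value</TAG>", "html.parser")
    # <tag>value</tag>

    BeautifulSoup("<TAG>value</TAG>", "xml")
    # <?xml version="1.0" encoding="utf-8"?>
    # <TAG>value</TAG>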
Other parser problems
---------------------

* If your script works on one computer but not another, or in one
  virtual environment but not another, or outside the virtual
  environment but not inside, it's probably because the two
  environments have different parser libraries available. For example,
  you may have developed the script on a computer that has lxml
  installed, and then tried to run it on a computer that only has
  html5lib installed. See `Differences between parsers`_ for why this
  matters, and fix the problem by naming a specific parser library in
  the ``BeautifulSoup`` constructor.

* Because `HTML tags and attributes are case-insensitive
  <http://www.w3.org/TR/html5/syntax.html#syntax>`_, all three HTML
  parsers convert tag and attribute names to lowercase. That is, the
  markup <TAG></TAG> is converted to <tag></tag>. If you want to
  preserve mixed-case or uppercase tags and attributes, you'll need to
  :ref:`parse the document as XML <parsing-xml>`.

.. _misc:

Miscellaneous
-------------

* ``UnicodeEncodeError: 'charmap' codec can't encode character
  u'\xfoo' in position bar`` (or just about any other
  ``UnicodeEncodeError``) - This is not a problem with Beautiful
  Soup. The problem shows up mainly in two situations. First, when you
  try to print a Unicode character that your console doesn't know how
  to display. (See `this page on the Python wiki
  <http://wiki.python.org/moin/PrintFails>`_ for help.) Second, when
  you're writing to a file and you pass in a Unicode character that's
  not supported by your default encoding. In this case, the simplest
  solution is to explicitly encode the Unicode string into UTF-8 with
  ``u.encode("utf8")``.

* ``KeyError: [attr]`` - Caused by accessing ``tag['attr']`` when the
  tag in question doesn't define the ``attr`` attribute. The most
  common errors are ``KeyError: 'href'`` and ``KeyError:
  'class'``. Use ``tag.get('attr')`` if you're not sure ``attr`` is
  defined, just as you would with a Python dictionary.

* ``AttributeError: 'ResultSet' object has no attribute 'foo'`` - This
  usually happens because you expected ``find_all()`` to return a
  single tag or string. But ``find_all()`` returns a *list* of tags
  and strings in a ``ResultSet`` object. You need to iterate over the
  list and look at the ``.foo`` of each one. Or, if you really only
  want one result, use ``find()`` instead of ``find_all()``.

* ``AttributeError: 'NoneType' object has no attribute 'foo'`` - This
  usually happens because you called ``find()`` and then tried to
  access the ``.foo`` attribute of the result. But in your case,
  ``find()`` didn't find anything, so it returned ``None`` instead of
  a tag or a string. You need to figure out why your ``find()`` call
  isn't returning anything.

Improving performance
---------------------

Beautiful Soup will never be as fast as the parsers it sits on top
of. If response time is critical, if you're paying for computer time
by the hour, or if there's any other reason why computer time is more
valuable than programmer time, you should forget about Beautiful Soup
and work directly atop `lxml <http://lxml.de/>`_.

That said, there are things you can do to speed up Beautiful Soup. If
you're not using lxml as the underlying parser, now is the time to
:ref:`start <parser-installation>`. Beautiful Soup parses documents
significantly faster using lxml than using html.parser or html5lib.

You can speed up encoding detection significantly by installing the
`cchardet <http://pypi.python.org/pypi/cchardet/>`_ library.

`Parsing only part of a document`_ won't save you much time parsing
the document, but it can save a lot of memory, and it'll make
`searching` the document much faster.


Beautiful Soup 3
================

Beautiful Soup 3 is the previous release series, and is no longer
being actively developed. It's currently packaged with all major Linux
distributions:

:kbd:`$ apt-get install python-beautifulsoup`

It's also published through PyPI as ``BeautifulSoup``:

:kbd:`$ easy_install BeautifulSoup`

:kbd:`$ pip install BeautifulSoup`

You can also download a `tarball of Beautiful Soup 3.2.0
<http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz>`_.

If you ran ``easy_install beautifulsoup`` or ``easy_install
BeautifulSoup``, but your code doesn't work, you installed Beautiful
Soup 3 by mistake. You need to run ``easy_install beautifulsoup4``.

The documentation for Beautiful Soup 3 is archived `online
<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_.
Porting code to BS4
-------------------

Most code written against Beautiful Soup 3 will work against Beautiful
Soup 4 with one simple change. All you should have to do is change the
package name from ``BeautifulSoup`` to ``bs4``. So this::

    from BeautifulSoup import BeautifulSoup

becomes this::

    from bs4 import BeautifulSoup

* If you get the ``ImportError`` "No module named BeautifulSoup", your
  problem is that you're trying to run Beautiful Soup 3 code, but you
  only have Beautiful Soup 4 installed.

* If you get the ``ImportError`` "No module named bs4", your problem
  is that you're trying to run Beautiful Soup 4 code, but you only
  have Beautiful Soup 3 installed.

Although BS4 is mostly backwards-compatible with BS3, most BS3 methods
were deprecated and given new names for `PEP 8 compliance
<http://www.python.org/dev/peps/pep-0008/>`_. Some of the renames and
changes break backwards compatibility.

Here's what you'll need to know to move from BS3 to BS4:

You need a parser
^^^^^^^^^^^^^^^^^

Beautiful Soup 3 used Python's ``SGMLParser``, a module that was
deprecated and removed in Python 3.0. Beautiful Soup 4 uses
``html.parser`` by default, but you can plug in lxml or html5lib
instead. See `Установка парсера`_ (Installing a parser) for a
comparison table of the supported parsers.

Since ``html.parser`` is not the same parser as ``SGMLParser``, you
may find that Beautiful Soup 4 gives you a different parse tree than
Beautiful Soup 3 for the same markup. If you swap out ``html.parser``
for lxml or html5lib, you may find that the parse tree changes yet
again. If this happens, you'll have to update your code to deal with
the new tree.

Method names
^^^^^^^^^^^^

* ``renderContents`` -> ``encode_contents``
* ``replaceWith`` -> ``replace_with``
* ``replaceWithChildren`` -> ``unwrap``
* ``findAll`` -> ``find_all``
* ``findAllNext`` -> ``find_all_next``
* ``findAllPrevious`` -> ``find_all_previous``
* ``findNext`` -> ``find_next``
* ``findNextSibling`` -> ``find_next_sibling``
* ``findNextSiblings`` -> ``find_next_siblings``
* ``findParent`` -> ``find_parent``
* ``findParents`` -> ``find_parents``
* ``findPrevious`` -> ``find_previous``
* ``findPreviousSibling`` -> ``find_previous_sibling``
* ``findPreviousSiblings`` -> ``find_previous_siblings``
* ``getText`` -> ``get_text``
* ``nextSibling`` -> ``next_sibling``
* ``previousSibling`` -> ``previous_sibling``

Some arguments to the Beautiful Soup constructor were renamed for the
same reasons:

* ``BeautifulSoup(parseOnlyThese=...)`` -> ``BeautifulSoup(parse_only=...)``
* ``BeautifulSoup(fromEncoding=...)`` -> ``BeautifulSoup(from_encoding=...)``

I renamed one method for compatibility with Python 3:

* ``Tag.has_key()`` -> ``Tag.has_attr()``

I renamed one attribute to use more accurate terminology:

* ``Tag.isSelfClosing`` -> ``Tag.is_empty_element``

I renamed three attributes to avoid using words that are reserved in
Python. Unlike the others, these changes are *not backwards
compatible*. If you used these attributes in BS3, your code will break
on BS4 until you change them.
* ``UnicodeDammit.unicode`` -> ``UnicodeDammit.unicode_markup``
* ``Tag.next`` -> ``Tag.next_element``
* ``Tag.previous`` -> ``Tag.previous_element``

Generators
^^^^^^^^^^

I gave the generators PEP 8-compliant names and transformed them into
properties:

* ``childGenerator()`` -> ``children``
* ``nextGenerator()`` -> ``next_elements``
* ``nextSiblingGenerator()`` -> ``next_siblings``
* ``previousGenerator()`` -> ``previous_elements``
* ``previousSiblingGenerator()`` -> ``previous_siblings``
* ``recursiveChildGenerator()`` -> ``descendants``
* ``parentGenerator()`` -> ``parents``

So instead of this::

    for parent in tag.parentGenerator():
        ...

You can write this::

    for parent in tag.parents:
        ...

(But the old code will still work.)

Some of the generators used to yield ``None`` after they were done,
and then stop. That was a bug. Now the generators just stop.

There are two new generators, :ref:`.strings and
.stripped_strings <string-generators>`.

``.strings`` yields NavigableString objects, and ``.stripped_strings``
yields Python strings that have had whitespace stripped.

XML
^^^

There is no longer a ``BeautifulStoneSoup`` class for parsing XML. To
parse XML you pass in "xml" as the second argument to the
``BeautifulSoup`` constructor. For the same reason, the
``BeautifulSoup`` constructor no longer recognizes the ``isHTML``
argument.

Beautiful Soup's handling of empty-element XML tags has been
improved. Previously, when you parsed XML you had to explicitly say
which tags were considered empty-element tags. The ``selfClosingTags``
argument is no longer recognized. Instead, Beautiful Soup considers
any tag with no contents to be an empty-element tag. If you add a
child to an empty-element tag, it stops being an empty-element tag.

Entities
^^^^^^^^

Incoming HTML or XML entities are always converted into the
corresponding Unicode characters. Beautiful Soup 3 had a number of
overlapping ways of dealing with entities, which have been
removed. The ``BeautifulSoup`` constructor no longer recognizes the
``smartQuotesTo`` or ``convertEntities`` arguments. (`Unicode,
Dammit`_ still has ``smart_quotes_to``, but its default is now to turn
smart quotes into Unicode.) The constants ``HTML_ENTITIES``,
``XML_ENTITIES``, and ``XHTML_ENTITIES`` have been removed, since they
configured a feature (transforming some but not all entities into
Unicode characters) that no longer exists.

If you want Unicode characters converted back into HTML entities on
output, rather than turned into UTF-8 characters, you need to use the
:ref:`output formatters <output_formatters>`.

Miscellaneous
^^^^^^^^^^^^^

:ref:`Tag.string <.string>` now operates recursively. If tag A
contains a single tag B and nothing else, then A.string is the same as
B.string. (Previously, it was None.)

`Многозначные атрибуты`_ (Multi-valued attributes) like ``class`` now
have lists of strings as their values, not strings. This may affect
the way you search by CSS class.

If you pass one of the ``find*`` methods both :ref:`string <string>`
`and` a tag-specific argument like :ref:`name <name>`, Beautiful Soup
will search for tags that, first, match your tag-specific criteria
and, second, have a :ref:`Tag.string <.string>` that matches your
value for :ref:`string <string>`. It will `not` find the strings
themselves. Previously, Beautiful Soup ignored the tag-specific
arguments and looked for strings.
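A hedged sketch of that new behavior::

    soup = BeautifulSoup("<b>one</b><i>one</i>", "html.parser")
    soup.find_all("b", string="one")
    # [<b>one</b>]

The tag is returned, not the string inside it.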
The ``BeautifulSoup`` constructor no longer recognizes the
``markupMassage`` argument. It's now the parser's responsibility to
handle markup correctly.

The rarely-used alternate parser classes like
``ICantBelieveItsBeautifulSoup`` and ``BeautifulSOAP`` have been
removed. It's now the parser's decision how to handle ambiguous
markup.

The ``prettify()`` method now returns a Unicode string, not a
bytestring.

Translating this documentation
==============================

New translations of the Beautiful Soup documentation are greatly
appreciated. Translations should be licensed under the MIT license,
just like Beautiful Soup and its English documentation are.

There are two ways of contributing your translation:

1. Create a branch of the Beautiful Soup repository, add your
   translation, and propose a merge with the main branch, the same as
   you would do with a proposed change to the source code.
2. Send a message to the `Beautiful Soup discussion group
   <https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup>`_
   with a link to your translation, or attach your translation to the
   message.

Use the existing translations of the documentation into Chinese or
Portuguese as your model. In particular, translate the source file
``doc/source/index.rst``, rather than the HTML version of the
documentation. This makes it possible to publish the documentation in
a variety of formats, not just HTML.

About this translation
----------------------

Russian translation by: `authoress <mailto:geekwriter@yandex.ru>`_

Date of translation: February 2020

Translated from the `original English documentation
<https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_.
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/conf.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/conf.py
new file mode 100644
index 00000000000..77f417e7ea2
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/conf.py
@@ -0,0 +1,256 @@
# -*- coding: utf-8 -*-
#
# Beautiful Soup documentation build configuration file, created by
# sphinx-quickstart on Thu Jan 26 11:22:55 2012.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

import sys, os

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))

# -- General configuration -----------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = []

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
+project = u'Beautiful Soup' +copyright = u'2004-2020, Leonard Richardson' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '4' +# The full version, including alpha/beta/rc tags. +release = '4.9.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. 
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'BeautifulSoupdoc'


# -- Options for LaTeX output --------------------------------------------------

# The paper size ('letter' or 'a4').
#latex_paper_size = 'letter'

# The font size ('10pt', '11pt' or '12pt').
#latex_font_size = '10pt'

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
  ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation',
   u'Leonard Richardson', 'manual'),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False

# If true, show page references after internal links.
#latex_show_pagerefs = False

# If true, show URL addresses after external links.
#latex_show_urls = False

# Additional stuff for the LaTeX preamble.
#latex_preamble = ''

# Documents to append as an appendix to all manuals.
#latex_appendices = []

# If false, no module index is generated.
#latex_domain_indices = True


# -- Options for manual page output --------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'beautifulsoup', u'Beautiful Soup Documentation',
     [u'Leonard Richardson'], 1)
]


# -- Options for Epub output ---------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = u'Beautiful Soup'
epub_author = u'Leonard Richardson'
epub_publisher = u'Leonard Richardson'
epub_copyright = u'2012, Leonard Richardson'

# The language of the text. It defaults to the language option
# or en if the language is not set.
#epub_language = ''

# The scheme of the identifier. Typical schemes are ISBN or URL.
#epub_scheme = ''

# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#epub_identifier = ''

# A unique identification for the text.
#epub_uid = ''

# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_pre_files = []

# HTML files that should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_post_files = []

# A list of files that should not be packed into the epub file.
#epub_exclude_files = []

# The depth of the table of contents in toc.ncx.
#epub_tocdepth = 3

# Allow duplicate toc entries.
+#epub_tocdup = True
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/index.rst b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/index.rst
new file mode 100644
index 00000000000..31b87d33cea
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.ru/source/index.rst
@@ -0,0 +1,17 @@
+.. bs4RUdocs documentation master file, created by
+   sphinx-quickstart on Sat Feb 1 21:26:47 2020.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Beautiful Soup in Russian
+=========================
+
+Translated into Russian by `authoress <http://geekwriter.ru/>`_.
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   bs4ru
+
+
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/Makefile b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/Makefile
new file mode 100644
index 00000000000..8c833d2cedb
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/Makefile
@@ -0,0 +1,130 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and an HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+ +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + make -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/source/6.1.jpg b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/source/6.1.jpg Binary files differnew file mode 100644 index 00000000000..97014f0ec04 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/source/6.1.jpg diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/source/conf.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/source/conf.py new file mode 100644 index 00000000000..102c3cf972a --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/source/conf.py @@ -0,0 +1,256 @@ +# -*- coding: utf-8 -*- +# +# Beautiful Soup documentation build configuration file, created by +# sphinx-quickstart on Thu Jan 26 11:22:55 2012. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. 
+ +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Beautiful Soup' +copyright = u'2012, Leonard Richardson' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '4' +# The full version, including alpha/beta/rc tags. +release = '4.2.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'BeautifulSoupdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation', + u'Leonard Richardson', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'beautifulsoup', u'Beautiful Soup Documentation', + [u'Leonard Richardson'], 1) +] + + +# -- Options for Epub output --------------------------------------------------- + +# Bibliographic Dublin Core info. 
+
+epub_title = u'Beautiful Soup'
+epub_author = u'Leonard Richardson'
+epub_publisher = u'Leonard Richardson'
+epub_copyright = u'2012, Leonard Richardson'
+
+# The language of the text. It defaults to the language option
+# or en if the language is not set.
+#epub_language = ''
+
+# The scheme of the identifier. Typical schemes are ISBN or URL.
+#epub_scheme = ''
+
+# The unique identifier of the text. This can be an ISBN number
+# or the project homepage.
+#epub_identifier = ''
+
+# A unique identification for the text.
+#epub_uid = ''
+
+# HTML files that should be inserted before the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#epub_pre_files = []
+
+# HTML files that should be inserted after the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#epub_post_files = []
+
+# A list of files that should not be packed into the epub file.
+#epub_exclude_files = []
+
+# The depth of the table of contents in toc.ncx.
+#epub_tocdepth = 3
+
+# Allow duplicate toc entries.
+#epub_tocdup = True
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/source/index.rst b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/source/index.rst
new file mode 100644
index 00000000000..228ef888411
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc.zh/source/index.rst
@@ -0,0 +1,2741 @@
+.. BeautifulSoup documentation master file, created by
+   Deron Wang on Fri Nov 29 13:49:30 2013.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Beautiful Soup 4.4.0 Documentation
+==================================
+
+`Beautiful Soup <http://www.crummy.com/software/BeautifulSoup/>`_ is a
+Python library for pulling data out of HTML and XML files. It works
+with your favorite parser to provide idiomatic ways of navigating,
+searching, and modifying a document. It can save you hours or even
+days of work.
+
+This document covers all the major features of Beautiful Soup 4, with
+small examples. I'll show you what the library is good for, how it
+works, how to use it, how to make it do what you want, and how to
+handle the cases where it doesn't.
+
+The examples in this document should work the same way in Python 2.7
+and Python 3.2.
+
+You might be looking for the documentation for `Beautiful Soup 3
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_.
+Beautiful Soup 3 is no longer being developed, and we recommend using
+Beautiful Soup 4 for all current projects; see `Porting code to BS4
+<http://www.baidu.com>`_.
+
+This documentation has also been translated into other languages:
+
+* `This document is of course also available in Chinese. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/>`_
+* This page is also available in Japanese (`external link <http://kondou.com/BS4/>`_).
+* `This document is also available in a Korean translation. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.ko/>`_
+* `This document is also available in Brazilian Portuguese. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.ptbr/>`_
+* `This documentation is also available in a Russian translation. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.ru/>`_
+
+
+Getting help
+------------
+
+If you have questions about Beautiful Soup, you can send mail to the
+`discussion group
+<https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup>`_.
+If your question involves a piece of HTML that needs parsing, be sure
+to include the `code diagnostics`_ output for that HTML document in
+your question [1]_
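+
+If you're not sure what to include in such a report, the ``diagnose()``
+helper bundled with Beautiful Soup can gather most of it for you. A
+minimal sketch, assuming the problem markup is already in a string
+called ``data`` (the file name is purely illustrative):
+
+::
+
+    from bs4.diagnose import diagnose
+
+    data = open("bad.html").read()  # hypothetical file containing the problem markup
+    diagnose(data)
+    # Reports the Beautiful Soup and Python versions, which parsers are
+    # installed, and how each available parser handles the document.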
+
+Quick Start
+===========
+
+Here's an HTML document I'll be using as an example throughout this
+document. It's part of a story from *Alice in Wonderland* (referred
+to below simply as the *Alice* document):
+
+::
+
+    html_doc = """
+    <html><head><title>The Dormouse's story</title></head>
+    <body>
+    <p class="title"><b>The Dormouse's story</b></p>
+
+    <p class="story">Once upon a time there were three little sisters; and their names were
+    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+    and they lived at the bottom of a well.</p>
+
+    <p class="story">...</p>
+    """
+
+Running this document through Beautiful Soup gives us a
+``BeautifulSoup`` object, which represents the document as a nested
+data structure and can print it out in a standard indented format:
+
+::
+
+    from bs4 import BeautifulSoup
+    soup = BeautifulSoup(html_doc, 'html.parser')
+
+    print(soup.prettify())
+    # <html>
+    #  <head>
+    #   <title>
+    #    The Dormouse's story
+    #   </title>
+    #  </head>
+    #  <body>
+    #   <p class="title">
+    #    <b>
+    #     The Dormouse's story
+    #    </b>
+    #   </p>
+    #   <p class="story">
+    #    Once upon a time there were three little sisters; and their names were
+    #    <a class="sister" href="http://example.com/elsie" id="link1">
+    #     Elsie
+    #    </a>
+    #    ,
+    #    <a class="sister" href="http://example.com/lacie" id="link2">
+    #     Lacie
+    #    </a>
+    #    and
+    #    <a class="sister" href="http://example.com/tillie" id="link3">
+    #     Tillie
+    #    </a>
+    #    ; and they lived at the bottom of a well.
+    #   </p>
+    #   <p class="story">
+    #    ...
+    #   </p>
+    #  </body>
+    # </html>
+
+Here are some simple ways to navigate that data structure:
+
+::
+
+    soup.title
+    # <title>The Dormouse's story</title>
+
+    soup.title.name
+    # u'title'
+
+    soup.title.string
+    # u'The Dormouse's story'
+
+    soup.title.parent.name
+    # u'head'
+
+    soup.p
+    # <p class="title"><b>The Dormouse's story</b></p>
+
+    soup.p['class']
+    # u'title'
+
+    soup.a
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+    soup.find_all('a')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.find(id="link3")
+    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+One common task is extracting the URLs of all the <a> tags in a document:
+
+::
+
+    for link in soup.find_all('a'):
+        print(link.get('href'))
+    # http://example.com/elsie
+    # http://example.com/lacie
+    # http://example.com/tillie
+
+Another common task is extracting all the text from a document:
+
+::
+
+    print(soup.get_text())
+    # The Dormouse's story
+    #
+    # The Dormouse's story
+    #
+    # Once upon a time there were three little sisters; and their names were
+    # Elsie,
+    # Lacie and
+    # Tillie;
+    # and they lived at the bottom of a well.
+    #
+    # ...
+
+Does this look like what you need? If so, read on; there's more where
+that came from.
+
+Installing Beautiful Soup
+=========================
+
+If you're using a recent version of Debian or Ubuntu, you can install
+Beautiful Soup with the system package manager:
+
+``$ apt-get install python-bs4``
+
+Beautiful Soup 4 is published through PyPi, so if you can't install it
+with the system packager, you can install it with ``easy_install`` or
+``pip``. The package name is ``beautifulsoup4``, and the same package
+works with Python 2 and Python 3.
+
+``$ easy_install beautifulsoup4``
+
+``$ pip install beautifulsoup4``
+
+(There is also a package named ``BeautifulSoup`` on PyPi, but that's
+probably not what you want: it's the previous major release,
+`Beautiful Soup 3
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_.
+Lots of projects still use BS3, so the ``BeautifulSoup`` package
+remains available, but if you're writing new code you should install
+``beautifulsoup4``.)
+
+If you don't have ``easy_install`` or ``pip`` installed, you can
+`download the Beautiful Soup 4 source
+<http://www.crummy.com/software/BeautifulSoup/download/4.x/>`_ and
+install it with ``setup.py``:
+
+``$ python setup.py install``
+
+If all else fails, the license for Beautiful Soup allows you to
+package the whole library with your project, so you can use it
+without installing it at all.
+
+Beautiful Soup is developed under Python 2.7 and Python 3.2; in
+theory it should work correctly with all current Python versions.
+
+Problems after installation
+---------------------------
+
+Beautiful Soup is packaged as Python 2 code. When you install it for
+use with Python 3, it's automatically converted to Python 3 code. If
+you skip the installation step, the code is never converted.
+
+If you get the ``ImportError`` "No module named HTMLParser", your
+problem is that you're running the Python 2 version of the code under
+Python 3.
+
+If you get the ``ImportError`` "No module named html.parser", your
+problem is that you're running the Python 3 version of the code under
+Python 2.
+
+In both cases, your best bet is to completely remove the installation
+and install Beautiful Soup 4 again.
+
+If you get the ``SyntaxError`` "Invalid syntax" on the line
+``ROOT_TAG_NAME = u'[document]'``, you need to convert the Python 2
+code to Python 3. You can do this by reinstalling the package:
+
+``$ python3 setup.py install``
+
+or by manually running Python's ``2to3`` conversion script on the
+``bs4`` directory:
+
+``$ 2to3-3.2 -w bs4``
+
+Installing a parser
+-------------------
+
+Beautiful Soup supports the HTML parser included in Python's standard
+library, but it also supports a number of third-party parsers. One of
+them is the `lxml parser <http://lxml.de/>`_. Depending on your
+setup, you can install lxml with one of these commands:
+
+``$ apt-get install python-lxml``
+
+``$ easy_install lxml``
+
+``$ pip install lxml``
+
+Another alternative is the pure-Python `html5lib parser
+<http://code.google.com/p/html5lib/>`_, which parses HTML the way a
+web browser does. Depending on your setup, you can install html5lib
+with one of these commands:
+
+``$ apt-get install python-html5lib``
+
+``$ easy_install html5lib``
+
+``$ pip install html5lib``
+
+This table summarizes the main parsers, with their advantages and
+disadvantages:
+
++----------------------+------------------------------------------+----------------------------------+--------------------------------+
+| Parser               | Typical usage                            | Advantages                       | Disadvantages                  |
++======================+==========================================+==================================+================================+
+| Python's html.parser | ``BeautifulSoup(markup, "html.parser")`` | - Batteries included             | - Poor tolerance of bad markup |
+|                      |                                          | - Decent speed                   |   before Python 2.7.3 or 3.2.2 |
+|                      |                                          | - Tolerant of bad markup         |                                |
++----------------------+------------------------------------------+----------------------------------+--------------------------------+
+| lxml HTML parser     | ``BeautifulSoup(markup, "lxml")``        | - Very fast                      | - External C dependency        |
+|                      |                                          | - Tolerant of bad markup         |                                |
++----------------------+------------------------------------------+----------------------------------+--------------------------------+
+| lxml XML parser      | ``BeautifulSoup(markup, "lxml-xml")``    | - Very fast                      | - External C dependency        |
+|                      | ``BeautifulSoup(markup, "xml")``         | - The only currently supported   |                                |
+|                      |                                          |   XML parser                     |                                |
++----------------------+------------------------------------------+----------------------------------+--------------------------------+
+| html5lib             | ``BeautifulSoup(markup, "html5lib")``    | - Extremely tolerant             | - Very slow                    |
+|                      |                                          | - Parses pages the same way a    | - External Python dependency   |
+|                      |                                          |   web browser does               |                                |
+|                      |                                          | - Creates valid HTML5            |                                |
++----------------------+------------------------------------------+----------------------------------+--------------------------------+
+
+If you can, I recommend you install and use lxml for speed. If you're
+using a version of Python 2 earlier than 2.7.3, or a version of
+Python 3 earlier than 3.2.2, it's essential that you install lxml or
+html5lib: Python's built-in HTML parser is just not very reliable in
+those old versions.
+
+Note: if an HTML or XML document is not well-formed, different
+parsers may return different results for it. See `Differences
+between parsers`_ for details.
+
+Making the soup
+===============
+
+To parse a document, pass it into the ``BeautifulSoup``
+constructor. You can pass in a string or an open filehandle.
+
+::
+
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup(open("index.html"))
+
+    soup = BeautifulSoup("<html>data</html>")
+
+First, the document is converted to Unicode, and HTML entities are
+converted to Unicode characters:
+
+::
+
+    BeautifulSoup("Sacré bleu!")
+    <html><head></head><body>Sacré bleu!</body></html>
+
+Beautiful Soup then parses the document using the best available
+parser; if you specify a parser yourself, Beautiful Soup uses that
+one instead. (See `Parsing XML`_.)
+
+Kinds of objects
+================
+
+Beautiful Soup transforms a complex HTML document into a complex tree
+of Python objects. But you'll only ever have to deal with four kinds
+of objects: ``Tag``, ``NavigableString``, ``BeautifulSoup``, and
+``Comment``.
+
+Tag
+-----
+
+A ``Tag`` object corresponds to an XML or HTML tag in the original
+document:
+
+::
+
+    soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
+    tag = soup.b
+    type(tag)
+    # <class 'bs4.element.Tag'>
+
+Tags have a lot of methods and attributes, which are covered in
+detail in `Navigating the tree`_ and `Searching the tree`_. For now,
+the most important features of a tag are its name and attributes.
+
+Name
+.....
+
+Every tag has a name, accessible as ``.name``:
+
+::
+
+    tag.name
+    # u'b'
+
+If you change a tag's name, the change is reflected in any HTML
+markup generated from that Beautiful Soup object:
+
+::
+
+    tag.name = "blockquote"
+    tag
+    # <blockquote class="boldest">Extremely bold</blockquote>
+
+Attributes
+............
+
+A tag may have any number of attributes. The tag ``<b
+class="boldest">`` has an attribute "class" whose value is
+"boldest". You can operate on a tag's attributes the same way you
+operate on a dictionary:
+
+::
+
+    tag['class']
+    # u'boldest'
+
+You can also access the attribute dictionary directly, as ``.attrs``:
+
+::
+
+    tag.attrs
+    # {u'class': u'boldest'}
+
+You can add, remove, and modify a tag's attributes. Again, this works
+just like a dictionary:
+
+::
+
+    tag['class'] = 'verybold'
+    tag['id'] = 1
+    tag
+    # <blockquote class="verybold" id="1">Extremely bold</blockquote>
+
+    del tag['class']
+    del tag['id']
+    tag
+    # <blockquote>Extremely bold</blockquote>
+
+    tag['class']
+    # KeyError: 'class'
+    print(tag.get('class'))
+    # None
+
+Multi-valued attributes
+```````````````````````
+
+HTML 4 defines a few attributes that can have multiple values. HTML 5
+removes some of them, but defines a few more. The most common
+multi-valued attribute is ``class`` (a tag can have more than one CSS
+class). Others include ``rel``, ``rev``, ``accept-charset``,
+``headers``, and ``accesskey``. Beautiful Soup presents the value(s)
+of a multi-valued attribute as a list:
+
+::
+
+    css_soup = BeautifulSoup('<p class="body strikeout"></p>')
+    css_soup.p['class']
+    # ["body", "strikeout"]
+
+    css_soup = BeautifulSoup('<p class="body"></p>')
+    css_soup.p['class']
+    # ["body"]
+
+If an attribute *looks* like it has more than one value, but it's not
+defined as a multi-valued attribute by any version of the HTML
+standard, Beautiful Soup leaves it alone and returns it as a string:
+
+::
+
+    id_soup = BeautifulSoup('<p id="my id"></p>')
+    id_soup.p['id']
+    # 'my id'
+
+When you turn a tag back into a string, the values of a multi-valued
+attribute are joined together:
+
+::
+
+    rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
+    rel_soup.a['rel']
+    # ['index']
+    rel_soup.a['rel'] = ['index', 'contents']
+    print(rel_soup.p)
+    # <p>Back to the <a rel="index contents">homepage</a></p>
+
+If you parse a document as XML, there are no multi-valued attributes:
+
+::
+
+    xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
+    xml_soup.p['class']
+    # u'body strikeout'
+
+NavigableString
+---------------
+
+A string corresponds to a bit of text within a tag. Beautiful Soup
+uses the ``NavigableString`` class to contain these bits of text:
+
+::
+
+    tag.string
+    # u'Extremely bold'
+    type(tag.string)
+    # <class 'bs4.element.NavigableString'>
+
+A ``NavigableString`` is just like a Python Unicode string, except
+that it also supports some of the features described in `Navigating
+the tree`_ and `Searching the tree`_. You can convert a
+``NavigableString`` to a Unicode string with ``unicode()``:
+
+::
+
+    unicode_string = unicode(tag.string)
+    unicode_string
+    # u'Extremely bold'
+    type(unicode_string)
+    # <type 'unicode'>
+
+You can't edit a string in place, but you can replace one string with
+another, using the `replace_with()`_ method:
+
+::
+
+    tag.string.replace_with("No longer bold")
+    tag
+    # <blockquote>No longer bold</blockquote>
+
+``NavigableString`` supports most of the features described in
+`Navigating the tree`_ and `Searching the tree`_, but not all of
+them. In particular, since a string can't contain anything (the way a
+tag may contain a string or another tag), strings don't support the
+``.contents`` or ``.string`` attributes, or the ``find()`` method.
+
+If you want to use a ``NavigableString`` outside of Beautiful Soup,
+call ``unicode()`` on it to turn it into a normal Unicode string.
+Otherwise your string will carry around a reference to the entire
+Beautiful Soup parse tree, even after you're done using Beautiful
+Soup. This is a big waste of memory.
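+
+Here is a minimal sketch of that conversion, reusing the markup from
+the examples above (the variable names are illustrative only):
+
+::
+
+    soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
+    nav_string = soup.b.string          # NavigableString; references the whole tree
+    plain_string = unicode(nav_string)  # plain unicode string; the tree can now be freed
+    type(plain_string)
+    # <type 'unicode'>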
+
+BeautifulSoup
+----------------
+
+The ``BeautifulSoup`` object represents the parsed document as a
+whole. For most purposes, you can treat it as a ``Tag`` object. It
+supports most of the methods described in `Navigating the tree`_ and
+`Searching the tree`_.
+
+Since the ``BeautifulSoup`` object doesn't correspond to an actual
+HTML or XML tag, it has no name and no attributes. But sometimes it's
+useful to look at its ``.name``, so it's been given the special
+``.name`` "[document]":
+
+::
+
+    soup.name
+    # u'[document]'
+
+Comments and other special strings
+----------------------------------
+
+``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost
+everything you'll see in an HTML or XML file, but there are a few
+special cases. The main one you may need to worry about is the
+comment:
+
+::
+
+    markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
+    soup = BeautifulSoup(markup)
+    comment = soup.b.string
+    type(comment)
+    # <class 'bs4.element.Comment'>
+
+The ``Comment`` object is just a special type of ``NavigableString``:
+
+::
+
+    comment
+    # u'Hey, buddy. Want to buy a used parser?'
+
+But when it appears as part of an HTML document, a ``Comment`` is
+displayed with special formatting:
+
+::
+
+    print(soup.b.prettify())
+    # <b>
+    #  <!--Hey, buddy. Want to buy a used parser?-->
+    # </b>
+
+Beautiful Soup also defines classes for anything else that might show
+up in an XML document: ``CData``, ``ProcessingInstruction``,
+``Declaration``, and ``Doctype``. Just like ``Comment``, these
+classes are subclasses of ``NavigableString`` that add something
+extra to the string. Here's an example that replaces the comment with
+a CDATA block:
+
+::
+
+    from bs4 import CData
+    cdata = CData("A CDATA block")
+    comment.replace_with(cdata)
+
+    print(soup.b.prettify())
+    # <b>
+    #  <![CDATA[A CDATA block]]>
+    # </b>
+
+Navigating the tree
+===================
+
+Here's the *Alice in Wonderland* document again:
+
+::
+
+    html_doc = """
+    <html><head><title>The Dormouse's story</title></head>
+    <body>
+    <p class="title"><b>The Dormouse's story</b></p>
+
+    <p class="story">Once upon a time there were three little sisters; and their names were
+    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+    and they lived at the bottom of a well.</p>
+
+    <p class="story">...</p>
+    """
+
+    from bs4 import BeautifulSoup
+    soup = BeautifulSoup(html_doc, 'html.parser')
+
+I'll use it to demonstrate how to move from one part of a document to
+another.
+
+Going down
+----------
+
+A tag may contain strings and other tags. These elements are the
+tag's *children*. Beautiful Soup provides a lot of different
+attributes for navigating and iterating over a tag's children.
+
+Note that Beautiful Soup strings don't support any of these
+attributes, because a string can't have children.
+
+Navigating using tag names
+..........................
+
+The simplest way to navigate the parse tree is to say the name of the
+tag you want. If you want the <head> tag, just say ``soup.head``:
+
+::
+
+    soup.head
+    # <head><title>The Dormouse's story</title></head>
+
+    soup.title
+    # <title>The Dormouse's story</title>
+
+You can use this trick again and again to zoom in on a certain part
+of the parse tree. This code gets the first <b> tag beneath the
+<body> tag:
+
+::
+
+    soup.body.b
+    # <b>The Dormouse's story</b>
+
+Using a tag name as an attribute gives you only the *first* tag by
+that name:
+
+::
+
+    soup.a
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+If you need *all* the <a> tags, or anything more complicated than the
+first tag with a certain name, you'll need to use one of the methods
+described in `Searching the tree`_, such as ``find_all()``:
+
+::
+
+    soup.find_all('a')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+.contents and .children
+.......................
+
+A tag's children are available in a list called ``.contents``:
+
+::
+
+    head_tag = soup.head
+    head_tag
+    # <head><title>The Dormouse's story</title></head>
+
+    head_tag.contents
+    # [<title>The Dormouse's story</title>]
+
+    title_tag = head_tag.contents[0]
+    title_tag
+    # <title>The Dormouse's story</title>
+    title_tag.contents
+    # [u'The Dormouse's story']
+
+The ``BeautifulSoup`` object itself has children. In this case, the
+<html> tag is the child of the ``BeautifulSoup`` object:
+
+::
+
+    len(soup.contents)
+    # 1
+    soup.contents[0].name
+    # u'html'
+
+A string does not have ``.contents``, because it can't contain
+anything:
+
+::
+
+    text = title_tag.contents[0]
+    text.contents
+    # AttributeError: 'NavigableString' object has no attribute 'contents'
+
+Instead of getting the children as a list, you can iterate over them
+with the ``.children`` generator:
+
+::
+
+    for child in title_tag.children:
+        print(child)
+    # The Dormouse's story
+
+.descendants
+..............
+
+The ``.contents`` and ``.children`` attributes only consider a tag's
+*direct* children. For instance, the <head> tag has a single direct
+child, the <title> tag:
+
+::
+
+    head_tag.contents
+    # [<title>The Dormouse's story</title>]
+
+But the <title> tag itself has a child: the string "The Dormouse's
+story". In a sense, that string is also a descendant of the <head>
+tag. The ``.descendants`` attribute lets you iterate over *all* of a
+tag's children, recursively [5]_:
+
+::
+
+    for child in head_tag.descendants:
+        print(child)
+    # <title>The Dormouse's story</title>
+    # The Dormouse's story
+
+In the example above, the <head> tag has only one child, but it has
+two descendants: the <title> tag and the <title> tag's child. The
+``BeautifulSoup`` object has only one direct child (the <html> tag),
+but it has a whole lot of descendants:
+
+::
+
+    len(list(soup.children))
+    # 1
+    len(list(soup.descendants))
+    # 25
+
+.string
+........
+
+If a tag has only one child, and that child is a
+``NavigableString``, the child is made available as ``.string``:
+
+::
+
+    title_tag.string
+    # u'The Dormouse's story'
+
+If a tag's only child is another tag, then the parent tag is
+considered to have the same ``.string`` as that child:
+
+::
+
+    head_tag.contents
+    # [<title>The Dormouse's story</title>]
+
+    head_tag.string
+    # u'The Dormouse's story'
+
+If a tag contains more than one child, it's not clear what
+``.string`` should refer to, so ``.string`` is ``None``:
+
+::
+
+    print(soup.html.string)
+    # None
+
+.strings and .stripped_strings
+..............................
+
+If there's more than one string inside a tag [2]_, you can still look
+at the strings by iterating over ``.strings``:
+
+::
+
+    for string in soup.strings:
+        print(repr(string))
+    # u"The Dormouse's story"
+    # u'\n\n'
+    # u"The Dormouse's story"
+    # u'\n\n'
+    # u'Once upon a time there were three little sisters; and their names were\n'
+    # u'Elsie'
+    # u',\n'
+    # u'Lacie'
+    # u' and\n'
+    # u'Tillie'
+    # u';\nand they lived at the bottom of a well.'
+    # u'\n\n'
+    # u'...'
+    # u'\n'
+
+These strings tend to contain a lot of extra whitespace and blank
+lines, which you can remove by using ``.stripped_strings`` instead:
+
+::
+
+    for string in soup.stripped_strings:
+        print(repr(string))
+    # u"The Dormouse's story"
+    # u"The Dormouse's story"
+    # u'Once upon a time there were three little sisters; and their names were'
+    # u'Elsie'
+    # u','
+    # u'Lacie'
+    # u'and'
+    # u'Tillie'
+    # u';\nand they lived at the bottom of a well.'
+    # u'...'
+
+Here, strings consisting entirely of whitespace are ignored, and
+whitespace at the beginning and end of strings is stripped.
+
+Going up
+--------
+
+Continuing to work through the tree: every tag and every string has a
+*parent*, the tag that contains it.
+
+.parent
+........
+
+You can access an element's parent with the ``.parent`` attribute. In
+the example *Alice* document, the <head> tag is the parent of the
+<title> tag:
+
+::
+
+    title_tag = soup.title
+    title_tag
+    # <title>The Dormouse's story</title>
+    title_tag.parent
+    # <head><title>The Dormouse's story</title></head>
+
+The title string itself has a parent: the <title> tag:
+
+::
+
+    title_tag.string.parent
+    # <title>The Dormouse's story</title>
+
+The parent of a top-level tag like <html> is the ``BeautifulSoup``
+object itself:
+
+::
+
+    html_tag = soup.html
+    type(html_tag.parent)
+    # <class 'bs4.BeautifulSoup'>
+
+And the ``.parent`` of the ``BeautifulSoup`` object is None:
+
+::
+
+    print(soup.parent)
+    # None
+
+.parents
+..........
+
+You can iterate over all of an element's parents with
+``.parents``. This example uses ``.parents`` to travel from an <a>
+tag buried deep within the document to the very top of the document:
+
+::
+
+    link = soup.a
+    link
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+    for parent in link.parents:
+        if parent is None:
+            print(parent)
+        else:
+            print(parent.name)
+    # p
+    # body
+    # html
+    # [document]
+    # None
+
+Going sideways
+--------------
+
+Consider a simple document like this:
+
+::
+
+    sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>")
+    print(sibling_soup.prettify())
+    # <html>
+    #  <body>
+    #   <a>
+    #    <b>
+    #     text1
+    #    </b>
+    #    <c>
+    #     text2
+    #    </c>
+    #   </a>
+    #  </body>
+    # </html>
+
+The <b> tag and the <c> tag are at the same level: they're both
+direct children of the same element, so we call them *siblings*. When
+a document is pretty-printed, siblings show up at the same
+indentation level. You can also use this relationship in the code you
+write.
+
+.next_sibling and .previous_sibling
+...................................
+
+You can use ``.next_sibling`` and ``.previous_sibling`` to navigate
+between elements that are on the same level of the parse tree:
+
+::
+
+    sibling_soup.b.next_sibling
+    # <c>text2</c>
+
+    sibling_soup.c.previous_sibling
+    # <b>text1</b>
+
+The <b> tag has a ``.next_sibling``, but no ``.previous_sibling``,
+because the <b> tag is the first one on its level. For the same
+reason, the <c> tag has a ``.previous_sibling`` but no
+``.next_sibling``:
+
+::
+
+    print(sibling_soup.b.previous_sibling)
+    # None
+    print(sibling_soup.c.next_sibling)
+    # None
+
+The strings "text1" and "text2" in this example are *not* siblings,
+because they don't have the same parent:
+
+::
+
+    sibling_soup.b.string
+    # u'text1'
+
+    print(sibling_soup.b.string.next_sibling)
+    # None
+
+In real documents, the ``.next_sibling`` or ``.previous_sibling`` of
+a tag is usually a string of whitespace. Going back to the *Alice*
+document:
+
+::
+
+    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
+    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
+    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
+
+You might think that the ``.next_sibling`` of the first <a> tag is
+the second <a> tag. But actually, it's the string consisting of the
+comma and newline that separate the first <a> tag from the second:
+
+::
+
+    link = soup.a
+    link
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+    link.next_sibling
+    # u',\n'
+
+The second <a> tag is the ``.next_sibling`` of that comma-and-newline
+string:
+
+::
+
+    link.next_sibling.next_sibling
+    # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+
+.next_siblings and .previous_siblings
+.....................................
+
+You can iterate over a tag's siblings with ``.next_siblings`` or
+``.previous_siblings``:
+
+::
+
+    for sibling in soup.a.next_siblings:
+        print(repr(sibling))
+    # u',\n'
+    # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+    # u' and\n'
+    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+    # u'; and they lived at the bottom of a well.'
+    # None
+
+    for sibling in soup.find(id="link3").previous_siblings:
+        print(repr(sibling))
+    # ' and\n'
+    # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+    # u',\n'
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+    # u'Once upon a time there were three little sisters; and their names were\n'
+    # None
+
+Going back and forth
+--------------------
+
+Take a look at the beginning of the *Alice* document:
+
+::
+
+    <html><head><title>The Dormouse's story</title></head>
+    <p class="title"><b>The Dormouse's story</b></p>
+
+An HTML parser turns this string into a series of events: "open an
+<html> tag", "open a <head> tag", "open a <title> tag", "add a
+string", "close the <title> tag", "open a <p> tag", and so
+on. Beautiful Soup offers tools for reconstructing that initial parse
+of the document.
+
+.next_element and .previous_element
+...................................
+
+The ``.next_element`` attribute of a string or tag points to whatever
+(string or tag) was parsed immediately afterwards. It might be the
+same as ``.next_sibling``, but it's usually drastically different.
+
+Here's the final <a> tag in the *Alice* document. Its
+``.next_sibling`` is a string: the rest of the sentence that was
+interrupted when the parser encountered the <a> tag:
+
+::
+
+    last_a_tag = soup.find("a", id="link3")
+    last_a_tag
+    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+    last_a_tag.next_sibling
+    # '; and they lived at the bottom of a well.'
+
+But the ``.next_element`` of that <a> tag, the thing that was parsed
+immediately after the <a> tag, is *not* the rest of that sentence:
+it's the string "Tillie":
+
+::
+
+    last_a_tag.next_element
+    # u'Tillie'
+
+That's because in the original document, the string "Tillie" appeared
+before the semicolon. The parser encountered the <a> tag, then the
+string "Tillie", then the closing </a> tag, then the semicolon and
+the rest of the sentence. The semicolon is on the same level as the
+<a> tag, but the string "Tillie" was parsed first.
+
+The ``.previous_element`` attribute is the exact opposite of
+``.next_element``. It points to whatever element was parsed
+immediately before this one:
+
+::
+
+    last_a_tag.previous_element
+    # u' and\n'
+    last_a_tag.previous_element.next_element
+    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+.next_elements and .previous_elements
+.....................................
+
+With the ``.next_elements`` and ``.previous_elements`` iterators you
+can move forward or backward through the document's parsed content,
+as if the document were being parsed all over again:
+
+::
+
+    for element in last_a_tag.next_elements:
+        print(repr(element))
+    # u'Tillie'
+    # u';\nand they lived at the bottom of a well.'
+    # u'\n\n'
+    # <p class="story">...</p>
+    # u'...'
+    # u'\n'
+    # None
+
+Searching the tree
+==================
+
+Beautiful Soup defines a lot of search methods, but they're all very
+similar. This section focuses on the two most important ones:
+``find()`` and ``find_all()``. The other methods take almost exactly
+the same arguments, so once you know these two, you know them all.
+
+Once again, I'll use the *Alice* document as an example:
+
+::
+
+    html_doc = """
+    <html><head><title>The Dormouse's story</title></head>
+    <body>
+    <p class="title"><b>The Dormouse's story</b></p>
+
+    <p class="story">Once upon a time there were three little sisters; and their names were
+    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+    and they lived at the bottom of a well.</p>
+
+    <p class="story">...</p>
+    """
+
+    from bs4 import BeautifulSoup
+    soup = BeautifulSoup(html_doc, 'html.parser')
+
+By passing a filter into a method like ``find_all()``, you can zoom
+in on the parts of the document you're interested in.
+
+Kinds of filters
+----------------
+
+Before talking in detail about ``find_all()``, here are the kinds of
+filters you can pass into these methods [3]_. These filters show up
+again and again, throughout the search API. You can use them to
+filter based on a tag's name, on its attributes, on the text of a
+string, or on some combination of these.
+
+A string
+........
+
+The simplest filter is a string. Pass a string to a search method and
+Beautiful Soup performs a match against that exact string. This code
+finds all the <b> tags in the document:
+
+::
+
+    soup.find_all('b')
+    # [<b>The Dormouse's story</b>]
+
+If you pass in a byte string, Beautiful Soup assumes it's encoded as
+UTF-8. You can avoid encoding problems by passing in a Unicode string
+instead.
+
+A regular expression
+....................
+
+If you pass in a regular expression object, Beautiful Soup filters
+against it using its ``search()`` method. This code finds all the
+tags whose names start with the letter "b"; in this case, both the
+<body> tag and the <b> tag:
+
+::
+
+    import re
+    for tag in soup.find_all(re.compile("^b")):
+        print(tag.name)
+    # body
+    # b
+
+This code finds all the tags whose names contain the letter "t":
+
+::
+
+    for tag in soup.find_all(re.compile("t")):
+        print(tag.name)
+    # html
+    # title
+
+A list
+......
+
+If you pass in a list, Beautiful Soup returns everything that matches
+*any* item in that list. This code finds all the <a> tags *and* all
+the <b> tags:
+
+::
+
+    soup.find_all(["a", "b"])
+    # [<b>The Dormouse's story</b>,
+    #  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+True
+.....
+
+The value ``True`` matches everything it can. This code finds *all*
+the tags in the document, but none of the text strings:
+
+::
+
+    for tag in soup.find_all(True):
+        print(tag.name)
+    # html
+    # head
+    # title
+    # body
+    # p
+    # b
+    # p
+    # a
+    # a
+    # a
+    # p
+
+A function
+..........
+
+If none of the other filters fit, you can define a function that
+takes an element as its only argument [4]_. The function should
+return ``True`` if the element matches, and ``False`` otherwise.
+
+Here's a function that returns ``True`` if a tag defines the
+``class`` attribute but not the ``id`` attribute:
+
+::
+
+    def has_class_but_no_id(tag):
+        return tag.has_attr('class') and not tag.has_attr('id')
+
+Pass this function into ``find_all()`` and you'll pick up all the <p>
+tags:
+
+::
+
+    soup.find_all(has_class_but_no_id)
+    # [<p class="title"><b>The Dormouse's story</b></p>,
+    #  <p class="story">Once upon a time there were...</p>,
+    #  <p class="story">...</p>]
+
+The results contain only the <p> tags, not the <a> tags, because the
+<a> tags also define "id". They don't contain <html> or <head>,
+because those tags don't define "class".
+
+If you pass in a function to filter on a specific attribute, the
+argument passed into the function is the attribute *value*, not the
+whole tag. Here's a function that finds all <a> tags whose ``href``
+attribute does *not* match a given regular expression:
+
+::
+
+    def not_lacie(href):
+        return href and not re.compile("lacie").search(href)
+    soup.find_all(href=not_lacie)
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+The function can be as complicated as you need it to be. Here's a
+function that picks out tags that are surrounded by string objects:
+
+::
+
+    from bs4 import NavigableString
+    def surrounded_by_strings(tag):
+        return (isinstance(tag.next_element, NavigableString)
+                and isinstance(tag.previous_element, NavigableString))
+
+    for tag in soup.find_all(surrounded_by_strings):
+        print(tag.name)
+    # p
+    # a
+    # a
+    # a
+    # p
+
+Now we're ready to look at the search methods in detail.
+
+find_all()
+-----------
+
+find_all( `name`_ , `attrs`_ , `recursive`_ , `string`_ , `**kwargs`_ )
+
+The ``find_all()`` method looks through a tag's descendants and
+retrieves *all* descendants that match your filters. Here are some
+examples:
+
+::
+
+    soup.find_all("title")
+    # [<title>The Dormouse's story</title>]
+
+    soup.find_all("p", "title")
+    # [<p class="title"><b>The Dormouse's story</b></p>]
+
+    soup.find_all("a")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.find_all(id="link2")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+    import re
+    soup.find(string=re.compile("sisters"))
+    # u'Once upon a time there were three little sisters; and their names were\n'
+
+Some of these should look familiar, but others are new. What does it
+mean to pass in a value for ``string``, or ``id``? Why does
+``find_all("p", "title")`` find a <p> tag with the CSS class "title"?
+Let's look at the arguments to ``find_all()``.
+
+The ``name`` argument
+.....................
+
+Pass in a value for ``name`` and you tell Beautiful Soup to only
+consider tags with that name. Text strings are ignored, as are tags
+whose names don't match.
+
+This is the simplest usage:
+
+::
+
+    soup.find_all("title")
+    # [<title>The Dormouse's story</title>]
+
+To repeat a point made earlier: the value of ``name`` can be any of
+the `kinds of filters`_: a string, a regular expression, a list, a
+function, or the value ``True``.
+
+The keyword arguments
+.....................
+
+Any keyword argument that isn't one of the built-in argument names is
+used to filter on a tag's attribute of that name. If you pass in a
+value for an argument called ``id``, Beautiful Soup filters against
+each tag's "id" attribute:
+
+::
+
+    soup.find_all(id='link2')
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+If you pass in a value for ``href``, Beautiful Soup filters against
+each tag's "href" attribute:
+
+::
+
+    soup.find_all(href=re.compile("elsie"))
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+You can filter an attribute against `a string`_, `a regular
+expression`_, `a list`_, or `True`_.
+
+This code finds all tags that have an ``id`` attribute, whatever its
+value:
+
+::
+
+    soup.find_all(id=True)
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+You can filter multiple attributes at once by passing in more than
+one keyword argument:
+
+::
+
+    soup.find_all(href=re.compile("elsie"), id='link1')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+Some attribute names can't be used as keyword arguments, such as the
+data-* attributes defined in HTML 5:
+
+::
+
+    data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
+    data_soup.find_all(data-foo="value")
+    # SyntaxError: keyword can't be an expression
+
+You can still search for such attributes by putting them into a
+dictionary and passing the dictionary into ``find_all()`` as the
+``attrs`` argument:
+
+::
+
+    data_soup.find_all(attrs={"data-foo": "value"})
+    # [<div data-foo="value">foo!</div>]
+
+Searching by CSS class
+......................
+
+It's very useful to search for a tag that has a certain CSS class,
+but the name of the CSS attribute, "class", is a reserved word in
+Python: using ``class`` as a keyword argument gives you a syntax
+error. As of Beautiful Soup 4.1.1, you can search by CSS class with
+the keyword argument ``class_``:
+
+::
+
+    soup.find_all("a", class_="sister")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Like any keyword argument, ``class_`` accepts the different `kinds of
+filters`_: a string, a regular expression, a function, or ``True``:
+
+::
+
+    soup.find_all(class_=re.compile("itl"))
+    # [<p class="title"><b>The Dormouse's story</b></p>]
+
+    def has_six_characters(css_class):
+        return css_class is not None and len(css_class) == 6
+
+    soup.find_all(class_=has_six_characters)
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Remember that a tag's ``class`` attribute is a `multi-valued
+attribute <Multi-valued attributes_>`_. When you search by CSS class,
+you can match against any one of a tag's CSS classes individually:
+
+::
+
+    css_soup = BeautifulSoup('<p class="body strikeout"></p>')
+    css_soup.find_all("p", class_="strikeout")
+    # [<p class="body strikeout"></p>]
+
+    css_soup.find_all("p", class_="body")
+    # [<p class="body strikeout"></p>]
+
+You can also search for the exact string value of the ``class``
+attribute:
+
+::
+
+    css_soup.find_all("p", class_="body strikeout")
+    # [<p class="body strikeout"></p>]
+
+Searching for the exact string value will find nothing, however, if
+the class names appear in a different order from the document. In
+older versions of Beautiful Soup, which don't have the ``class_``
+shortcut, you can use the ``attrs`` trick mentioned above to search
+on "class":
+
+::
+
+    soup.find_all("a", attrs={"class": "sister"})
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+The ``string`` argument
+.......................
+
+With the ``string`` argument you can search for strings instead of
+tags. As with ``name``, the value can be `a string`_, `a regular
+expression`_, `a list`_, or `True`_. Here are some examples:
+
+::
+
+    soup.find_all(string="Elsie")
+    # [u'Elsie']
+
+    soup.find_all(string=["Tillie", "Elsie", "Lacie"])
+    # [u'Elsie', u'Lacie', u'Tillie']
+
+    soup.find_all(string=re.compile("Dormouse"))
+    # [u"The Dormouse's story", u"The Dormouse's story"]
+
+    def is_the_only_string_within_a_tag(s):
+        """Return True if this string is the only child of its parent tag."""
+        return (s == s.parent.string)
+
+    soup.find_all(string=is_the_only_string_within_a_tag)
+    # [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...']
+
+Although ``string`` is for finding strings, you can combine it with
+arguments that find tags: Beautiful Soup will then find all tags
+whose ``.string`` matches your value for ``string``. This code finds
+the <a> tags whose ``.string`` is "Elsie":
+
+::
+
+    soup.find_all("a", string="Elsie")
+    # [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>]
+
+The ``limit`` argument
+......................
+
+``find_all()`` returns all the matches it finds; if the document tree
+is large, this can take a while. If you don't need *all* the results,
+you can pass in a number for ``limit`` to cap how many are
+returned. This works just like the LIMIT keyword in SQL: Beautiful
+Soup stops searching once it has gathered ``limit`` results.
+
+There are three tags in the document that match this search, but only
+two are returned, because we limited the count:
+
+::
+
+    soup.find_all("a", limit=2)
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+The ``recursive`` argument
+..........................
+
+When you call a tag's ``find_all()`` method, Beautiful Soup examines
+all of the tag's descendants. If you only want it to consider direct
+children, pass in ``recursive=False``.
+
+Consider this simple document:
+
+::
+
+    <html>
+     <head>
+      <title>
+       The Dormouse's story
+      </title>
+     </head>
+    ...
+
+Here's the difference with and without ``recursive``:
+
+::
+
+    soup.html.find_all("title")
+    # [<title>The Dormouse's story</title>]
+
+    soup.html.find_all("title", recursive=False)
+    # []
+
+In this document, the <title> tag is beneath the <html> tag, but it's
+not *directly* beneath: the <head> tag is in the way. When Beautiful
+Soup is allowed to search all descendants, it finds the <title> tag.
+With ``recursive=False`` it only searches direct children, so the
+<title> tag is not found.
+
+Beautiful Soup offers many tree-searching methods, and they mostly
+take the same arguments as ``find_all()``: ``name``, ``attrs``,
+``string``, ``limit``, and the keyword arguments. But the
+``recursive`` argument is only supported by ``find_all()`` and
+``find()``.
+
+Calling a tag is like calling find_all()
+----------------------------------------
+
+Because ``find_all()`` is the most popular method in the Beautiful
+Soup search API, it has a shortcut. If you treat a ``BeautifulSoup``
+object or a ``Tag`` object as though it were a function, it's the
+same as calling ``find_all()`` on that object. These two lines of
+code are equivalent:
+
+::
+
+    soup.find_all("a")
+    soup("a")
+
+These two lines are also equivalent:
+
+::
+
+    soup.title.find_all(string=True)
+    soup.title(string=True)
+
+find()
+-------
+
+find( `name`_ , `attrs`_ , `recursive`_ , `string`_ , `**kwargs`_ )
+
+The ``find_all()`` method returns all the matches in the document,
+but sometimes you only want one result. If you know a document has
+only one <body> tag, it's wasteful to use ``find_all()`` for
+it. Rather than passing in ``limit=1`` every time, use the ``find()``
+method. These two lines of code are nearly equivalent:
+
+::
+
+    soup.find_all('title', limit=1)
+    # [<title>The Dormouse's story</title>]
+
+    soup.find('title')
+    # <title>The Dormouse's story</title>
+
+The only difference is that ``find_all()`` returns a list containing
+the single result, while ``find()`` returns the result directly.
+
+If ``find_all()`` can't find anything, it returns an empty list. If
+``find()`` can't find anything, it returns ``None``:
+
+::
+
+    print(soup.find("nosuchtag"))
+    # None
+
+Remember the ``soup.head.title`` trick from `Navigating using tag
+names`_? That shortcut works by repeatedly calling the current tag's
+``find()`` method:
+
+::
+
+    soup.head.title
+    # <title>The Dormouse's story</title>
+
+    soup.find("head").find("title")
+    # <title>The Dormouse's story</title>
+
+find_parents() and find_parent()
+--------------------------------
+
+find_parents( `name`_ , `attrs`_ , `string`_ , `**kwargs`_ )
+
+find_parent( `name`_ , `attrs`_ , `string`_ , `**kwargs`_ )
+
+I spent a lot of time above covering ``find_all()`` and ``find()``.
+The Beautiful Soup API defines ten other search methods. Five of them
+take the same arguments as ``find_all()``, and the other five take
+the same arguments as ``find()``. The only differences are in what
+parts of the document they search.
+
+Remember that ``find_all()`` and ``find()`` work their way *down* the
+tree, looking at a tag's descendants. ``find_parents()`` and
+``find_parent()`` do the opposite: they work their way *up* the tree,
+looking at a tag's (or a string's) parents. Let's try them out,
+starting from a string buried deep in the document:
+
+::
+
+    a_string = soup.find(string="Lacie")
+    a_string
+    # u'Lacie'
+
+    a_string.find_parents("a")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+    a_string.find_parent("p")
+    # <p class="story">Once upon a time there were three little sisters; and their names were
+    #  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
+    #  and they lived at the bottom of a well.</p>
+
+    a_string.find_parents("p", class_="title")
+    # []
+
+One of the <a> tags is the direct parent of the string we started
+from, so our search finds it. One of the <p> tags is an indirect
+parent of the string, and our search finds that as well. The <p> tag
+with the CSS class "title" is *somewhere* in the document, but it's
+not one of this string's parents, so ``find_parents()`` can't find
+it.
+
+You may have noticed the connection between ``find_parent()`` and
+``find_parents()``, and the `.parent`_ and `.parents`_ attributes
+described earlier. The connection is very strong: these search
+methods actually iterate over ``.parents``, checking each parent
+against the filter.
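+
+As an illustration of that equivalence (this is just a sketch, not an
+official API), you can get the same result as
+``a_string.find_parents("a")`` by filtering ``.parents`` yourself:
+
+::
+
+    a_string = soup.find(string="Lacie")
+    # Hand-rolled equivalent of a_string.find_parents("a"); the None
+    # guard mirrors the .parents loop shown earlier.
+    [parent for parent in a_string.parents
+     if parent is not None and parent.name == "a"]
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]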
+
+find_next_siblings() and find_next_sibling()
+--------------------------------------------
+
+find_next_siblings( `name`_ , `attrs`_ , `string`_ , `**kwargs`_ )
+
+find_next_sibling( `name`_ , `attrs`_ , `string`_ , `**kwargs`_ )
+
+These methods use `.next_siblings`_ to iterate over the rest of an
+element's siblings in the tree [5]_. ``find_next_siblings()`` returns
+all the siblings that match, and ``find_next_sibling()`` returns only
+the first one:
+
+::
+
+    first_link = soup.a
+    first_link
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+    first_link.find_next_siblings("a")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    first_story_paragraph = soup.find("p", "story")
+    first_story_paragraph.find_next_sibling("p")
+    # <p class="story">...</p>
+
+find_previous_siblings() and find_previous_sibling()
+----------------------------------------------------
+
+find_previous_siblings( `name`_ , `attrs`_ , `string`_ , `**kwargs`_ )
+
+find_previous_sibling( `name`_ , `attrs`_ , `string`_ , `**kwargs`_ )
+
+These methods use `.previous_siblings`_ to iterate over an element's
+siblings that precede it in the tree [5]_.
+``find_previous_siblings()`` returns all the siblings that match, and
+``find_previous_sibling()`` returns only the first one:
+
+::
+
+    last_link = soup.find("a", id="link3")
+    last_link
+    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+    last_link.find_previous_siblings("a")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+    first_story_paragraph = soup.find("p", "story")
+    first_story_paragraph.find_previous_sibling("p")
+    # <p class="title"><b>The Dormouse's story</b></p>
+
+find_all_next() and find_next()
+--------------------------------
+
+find_all_next( `name`_ , `attrs`_ , `string`_ , `**kwargs`_ )
+
+find_next( `name`_ , `attrs`_ , `string`_ , `**kwargs`_ )
+
+These methods use `.next_elements`_ to iterate over whatever tags and
+strings come after an element in the document [5]_.
+``find_all_next()`` returns all matches, and ``find_next()`` returns
+only the first match:
+
+::
+
+    first_link = soup.a
+    first_link
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+    first_link.find_all_next(string=True)
+    # [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
+    #  u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n']
+
+    first_link.find_next("p")
+    # <p class="story">...</p>
+
+In the first example, the string "Elsie" showed up, even though it
+was contained within the <a> tag we started from. In the second
+example, the last <p> tag in the document showed up, even though it's
+not in the same part of the tree as the <a> tag we started from. For
+these methods, all that matters is that an element matches the
+filter and appears later in the document than the starting element.
+
+find_all_previous() and find_previous()
+---------------------------------------
+
+find_all_previous( `name`_ , `attrs`_ , `string`_ , `**kwargs`_ )
+
+find_previous( `name`_ , `attrs`_ , `string`_ , `**kwargs`_ )
+
+These methods use `.previous_elements`_ to iterate over the tags and
+strings that came before an element in the document [5]_.
+``find_all_previous()`` returns all matches, and ``find_previous()``
+returns only the first match:
+
+::
+
+    first_link = soup.a
+    first_link
+    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+    first_link.find_all_previous("p")
+    # [<p class="story">Once upon a time there were three little sisters; ...</p>,
+    #  <p class="title"><b>The Dormouse's story</b></p>]
+
+    first_link.find_previous("title")
+    # <title>The Dormouse's story</title>
+
+The call to ``find_all_previous("p")`` found the first paragraph in
+the document (the one with class="title"), but it also found the
+second paragraph: the <p> tag that *contains* the <a> tag we started
+with. Don't be surprised: this code finds all <p> tags that appear
+before the given <a> tag, and since that <p> tag contains the <a>
+tag, it necessarily appears before it.
+
+CSS selectors
+-------------
+
+Beautiful Soup supports most CSS selectors
+(`<http://www.w3.org/TR/CSS2/selector.html>`_) [6]_. Pass a string
+into the ``.select()`` method of a ``Tag`` or ``BeautifulSoup``
+object, and you can find tags using CSS selector syntax:
+
+::
+
+    soup.select("title")
+    # [<title>The Dormouse's story</title>]
+
+    soup.select("p:nth-of-type(3)")
+    # [<p class="story">...</p>]
+
+Find tags beneath other tags:
+
+::
+
+    soup.select("body a")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.select("html head title")
+    # [<title>The Dormouse's story</title>]
+
+Find tags *directly* beneath other tags [6]_:
+
+::
+
+    soup.select("head > title")
+    # [<title>The Dormouse's story</title>]
+
+    soup.select("p > a")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.select("p > a:nth-of-type(2)")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+    soup.select("p > #link1")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+    soup.select("body > a")
+    # []
+
+Find siblings of tags:
+
+::
+
+    soup.select("#link1 ~ .sister")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.select("#link1 + .sister")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+Find tags by CSS class:
+
+::
+
+    soup.select(".sister")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+    soup.select("[class~=sister]")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Find tags by ID:
+
+::
+
+    soup.select("#link1")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+    soup.select("a#link2")
+    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+Match against multiple CSS selectors at once:
+
+::
+
+    soup.select("#link1,#link2")
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+
+Test for the existence of an attribute:
+
+::
+
+    soup.select('a[href]')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+Find tags by attribute value:
+
+::
+
+    soup.select('a[href="http://example.com/elsie"]')
+    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
href="http://example.com/elsie" id="link1">Elsie</a>] + + soup.select('a[href^="http://example.com/"]') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select('a[href$="tillie"]') + # [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select('a[href*=".com/el"]') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] + +通过语言设置来查找: + +:: + + multilingual_markup = """ + <p lang="en">Hello</p> + <p lang="en-us">Howdy, y'all</p> + <p lang="en-gb">Pip-pip, old fruit</p> + <p lang="fr">Bonjour mes amis</p> + """ + multilingual_soup = BeautifulSoup(multilingual_markup) + multilingual_soup.select('p[lang|=en]') + # [<p lang="en">Hello</p>, + # <p lang="en-us">Howdy, y'all</p>, + # <p lang="en-gb">Pip-pip, old fruit</p>] + +返回查找到的元素的第一个 + +:: + + soup.select_one(".sister") + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + + +对于熟悉CSS选择器语法的人来说这是个非常方便的方法.Beautiful Soup也支持CSS选择器API, +如果你仅仅需要CSS选择器的功能,那么直接使用 ``lxml`` 也可以, +而且速度更快,支持更多的CSS选择器语法,但Beautiful Soup整合了CSS选择器的语法和自身方便使用API. + + +修改文档树 +=========== + +Beautiful Soup的强项是文档树的搜索,但同时也可以方便的修改文档树 + +修改tag的名称和属性 +------------------- + +在 `Attributes`_ 的章节中已经介绍过这个功能,但是再看一遍也无妨. 重命名一个tag,改变属性的值,添加或删除属性: + +:: + + soup = BeautifulSoup('<b class="boldest">Extremely bold</b>') + tag = soup.b + + tag.name = "blockquote" + tag['class'] = 'verybold' + tag['id'] = 1 + tag + # <blockquote class="verybold" id="1">Extremely bold</blockquote> + + del tag['class'] + del tag['id'] + tag + # <blockquote>Extremely bold</blockquote> + +修改 .string +------------- + +给tag的 ``.string`` 属性赋值,就相当于用当前的内容替代了原来的内容: + +:: + + markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' + soup = BeautifulSoup(markup) + + tag = soup.a + tag.string = "New link text." + tag + # <a href="http://example.com/">New link text.</a> + +注意: 如果当前的tag包含了其它tag,那么给它的 ``.string`` 属性赋值会覆盖掉原有的所有内容包括子tag + +append() +---------- + +``Tag.append()`` 方法想tag中添加内容,就好像Python的列表的 ``.append()`` 方法: + +:: + + soup = BeautifulSoup("<a>Foo</a>") + soup.a.append("Bar") + + soup + # <html><head></head><body><a>FooBar</a></body></html> + soup.a.contents + # [u'Foo', u'Bar'] + +NavigableString() 和 .new_tag() +----------------------------------------- + +如果想添加一段文本内容到文档中也没问题,可以调用Python的 ``append()`` 方法 +或调用 ``NavigableString`` 的构造方法: + +:: + + soup = BeautifulSoup("<b></b>") + tag = soup.b + tag.append("Hello") + new_string = NavigableString(" there") + tag.append(new_string) + tag + # <b>Hello there.</b> + tag.contents + # [u'Hello', u' there'] + +如果想要创建一段注释,或 ``NavigableString`` 的任何子类, 只要调用 NavigableString 的构造方法: + +:: + + from bs4 import Comment + new_comment = soup.new_string("Nice to see you.", Comment) + tag.append(new_comment) + tag + # <b>Hello there<!--Nice to see you.--></b> + tag.contents + # [u'Hello', u' there', u'Nice to see you.'] + +# 这是Beautiful Soup 4.2.1 中新增的方法 + +创建一个tag最好的方法是调用工厂方法 ``BeautifulSoup.new_tag()`` : + +:: + + soup = BeautifulSoup("<b></b>") + original_tag = soup.b + + new_tag = soup.new_tag("a", href="http://www.example.com") + original_tag.append(new_tag) + original_tag + # <b><a href="http://www.example.com"></a></b> + + new_tag.string = "Link text." 
+
+.. _修改文档树:
+
+Modifying the tree
+==================
+
+Beautiful Soup's main strength is searching the parse tree, but you can
+also modify the tree easily.
+
+Changing tag names and attributes
+---------------------------------
+
+I covered this earlier, in the `Attributes`_ section, but it bears
+repeating. You can rename a tag, change the values of its attributes,
+add new attributes, and delete attributes:
+
+::
+
+ soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
+ tag = soup.b
+
+ tag.name = "blockquote"
+ tag['class'] = 'verybold'
+ tag['id'] = 1
+ tag
+ # <blockquote class="verybold" id="1">Extremely bold</blockquote>
+
+ del tag['class']
+ del tag['id']
+ tag
+ # <blockquote>Extremely bold</blockquote>
+
+Modifying .string
+-----------------
+
+If you set a tag's ``.string`` attribute, the tag's contents are
+replaced with the string you give:
+
+::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+
+ tag = soup.a
+ tag.string = "New link text."
+ tag
+ # <a href="http://example.com/">New link text.</a>
+
+Be careful: if the tag contained other tags, assigning to its
+``.string`` attribute destroys all of its contents, including the child
+tags.
+
+append()
+----------
+
+``Tag.append()`` adds content to a tag, just like calling ``.append()``
+on a Python list:
+
+::
+
+ soup = BeautifulSoup("<a>Foo</a>")
+ soup.a.append("Bar")
+
+ soup
+ # <html><head></head><body><a>FooBar</a></body></html>
+ soup.a.contents
+ # [u'Foo', u'Bar']
+
+NavigableString() and .new_tag()
+-----------------------------------------
+
+If you want to add a string to a document, no problem--you can pass a
+Python string into ``append()``, or you can call the
+``NavigableString`` constructor:
+
+::
+
+ from bs4 import NavigableString
+
+ soup = BeautifulSoup("<b></b>")
+ tag = soup.b
+ tag.append("Hello")
+ new_string = NavigableString(" there")
+ tag.append(new_string)
+ tag
+ # <b>Hello there</b>
+ tag.contents
+ # [u'Hello', u' there']
+
+If you want to create a comment, or some other subclass of
+``NavigableString``, pass that class in as the second argument to
+``new_string()``:
+
+::
+
+ from bs4 import Comment
+ new_comment = soup.new_string("Nice to see you.", Comment)
+ tag.append(new_comment)
+ tag
+ # <b>Hello there<!--Nice to see you.--></b>
+ tag.contents
+ # [u'Hello', u' there', u'Nice to see you.']
+
+(This is a new feature in Beautiful Soup 4.2.1.)
+
+What if you need to create a whole new tag? The best solution is to
+call the factory method ``BeautifulSoup.new_tag()``:
+
+::
+
+ soup = BeautifulSoup("<b></b>")
+ original_tag = soup.b
+
+ new_tag = soup.new_tag("a", href="http://www.example.com")
+ original_tag.append(new_tag)
+ original_tag
+ # <b><a href="http://www.example.com"></a></b>
+
+ new_tag.string = "Link text."
+ original_tag
+ # <b><a href="http://www.example.com">Link text.</a></b>
+
+Only the first argument, the tag name, is required; the others are
+optional.
+
+insert()
+--------
+
+``Tag.insert()`` is just like ``Tag.append()``, except the new element
+doesn't necessarily go at the end of its parent's ``.contents``. It is
+inserted at whatever numeric position you say, just like ``.insert()``
+on a Python list:
+
+::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ tag = soup.a
+
+ tag.insert(1, "but did not endorse ")
+ tag
+ # <a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>
+ tag.contents
+ # [u'I linked to ', u'but did not endorse ', <i>example.com</i>]
+
+insert_before() and insert_after()
+-----------------------------------
+
+``insert_before()`` inserts content immediately before a tag or string
+in the parse tree:
+
+::
+
+ soup = BeautifulSoup("<b>stop</b>")
+ tag = soup.new_tag("i")
+ tag.string = "Don't"
+ soup.b.string.insert_before(tag)
+ soup.b
+ # <b><i>Don't</i>stop</b>
+
+``insert_after()`` inserts content immediately after a tag or string:
+
+::
+
+ soup.b.i.insert_after(soup.new_string(" ever "))
+ soup.b
+ # <b><i>Don't</i> ever stop</b>
+ soup.b.contents
+ # [<i>Don't</i>, u' ever ', u'stop']
+
+clear()
+--------
+
+``Tag.clear()`` removes the contents of a tag:
+
+::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ tag = soup.a
+
+ tag.clear()
+ tag
+ # <a href="http://example.com/"></a>
+
+extract()
+----------
+
+``PageElement.extract()`` removes a tag or string from the tree, and
+returns it:
+
+::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ i_tag = soup.i.extract()
+
+ a_tag
+ # <a href="http://example.com/">I linked to</a>
+
+ i_tag
+ # <i>example.com</i>
+
+ print(i_tag.parent)
+ # None
+
+At this point you effectively have two parse trees: one rooted at the
+``BeautifulSoup`` object you used to parse the document, and one rooted
+at the tag that was extracted. You can go on to call ``extract`` on
+a child of the element you extracted:
+
+::
+
+ my_string = i_tag.string.extract()
+ my_string
+ # u'example.com'
+
+ print(my_string.parent)
+ # None
+ i_tag
+ # <i></i>
+
+decompose()
+------------
+
+``Tag.decompose()`` removes a tag from the tree and completely destroys
+it:
+
+::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ soup.i.decompose()
+
+ a_tag
+ # <a href="http://example.com/">I linked to</a>
+
+replace_with()
+---------------
+
+``PageElement.replace_with()`` removes a tag or string from the tree,
+and replaces it with a tag or string of your choice:
+
+::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ new_tag = soup.new_tag("b")
+ new_tag.string = "example.net"
+ a_tag.i.replace_with(new_tag)
+
+ a_tag
+ # <a href="http://example.com/">I linked to <b>example.net</b></a>
+
+``replace_with()`` returns the tag or string that was replaced, so that
+you can examine it or add it back to another part of the tree.
+
+wrap()
+------
+
+``PageElement.wrap()`` wraps an element in the tag you specify [8]_ and
+returns the new wrapper:
+
+::
+
+ soup = BeautifulSoup("<p>I wish I was bold.</p>")
+ soup.p.string.wrap(soup.new_tag("b"))
+ # <b>I wish I was bold.</b>
+
+ soup.p.wrap(soup.new_tag("div"))
+ # <div><p><b>I wish I was bold.</b></p></div>
+
+This method is new in Beautiful Soup 4.0.5.
+
+unwrap()
+---------
+
+``Tag.unwrap()`` is the opposite of ``wrap()``. It replaces a tag with
+whatever is inside that tag. It's good for stripping out markup:
+
+::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ a_tag.i.unwrap()
+ a_tag
+ # <a href="http://example.com/">I linked to example.com</a>
+
+Like ``replace_with()``, ``unwrap()`` returns the tag that was removed.
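+
+Since ``extract()`` returns a live tag, you can combine these methods to
+move elements around the tree. A small sketch:
+
+::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+
+ # Detach the <i> tag, then graft it back in at a different position.
+ i_tag = soup.i.extract()
+ soup.a.insert(0, i_tag)
+ soup.a
+ # <a href="http://example.com/"><i>example.com</i>I linked to </a>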
+
+.. _输出:
+
+Output
+======
+
+Pretty-printing
+---------------
+
+The ``prettify()`` method turns a Beautiful Soup parse tree into a
+nicely formatted Unicode string, with each XML/HTML tag on its own line:
+
+::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ soup.prettify()
+ # '<html>\n <head>\n </head>\n <body>\n <a href="http://example.com/">\n...'
+
+ print(soup.prettify())
+ # <html>
+ # <head>
+ # </head>
+ # <body>
+ # <a href="http://example.com/">
+ # I linked to
+ # <i>
+ # example.com
+ # </i>
+ # </a>
+ # </body>
+ # </html>
+
+You can call ``prettify()`` on the top-level ``BeautifulSoup`` object,
+or on any of its tags:
+
+::
+
+ print(soup.a.prettify())
+ # <a href="http://example.com/">
+ # I linked to
+ # <i>
+ # example.com
+ # </i>
+ # </a>
+
+Non-pretty printing
+----------
+
+If you just want a string, with no fancy formatting, you can call
+Python's ``unicode()`` or ``str()`` on a ``BeautifulSoup`` object or a
+``Tag`` within it:
+
+::
+
+ str(soup)
+ # '<html><head></head><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>'
+
+ unicode(soup.a)
+ # u'<a href="http://example.com/">I linked to <i>example.com</i></a>'
+
+The ``str()`` function returns a string encoded in UTF-8; see
+`Encodings`_ for other options.
+
+You can also call ``encode()`` to get a bytestring, and ``decode()`` to
+get Unicode.
+
+.. _输出格式:
+
+Output formatters
+---------
+
+If you give Beautiful Soup a document that contains HTML entities like
+"&ldquo;", they'll be converted to Unicode characters:
+
+::
+
+ soup = BeautifulSoup("&ldquo;Dammit!&rdquo; he said.")
+ unicode(soup)
+ # u'<html><head></head><body>\u201cDammit!\u201d he said.</body></html>'
+
+If you then convert the document to a string, the Unicode characters
+are encoded as UTF-8. You won't get the HTML entities back:
+
+::
+
+ str(soup)
+ # '<html><head></head><body>\xe2\x80\x9cDammit!\xe2\x80\x9d he said.</body></html>'
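+
+If you do want entities in your output, the ``formatter`` argument
+accepted by ``prettify()``, ``encode()``, and ``decode()`` can put them
+back. A quick sketch, using the named-entity formatter:
+
+::
+
+ french = "<p>Il a dit «Sacré bleu!»</p>"
+ soup = BeautifulSoup(french)
+
+ # formatter="html" converts Unicode characters to named HTML
+ # entities on output, where such entities exist.
+ print(soup.p.decode(formatter="html"))
+ # <p>Il a dit &laquo;Sacr&eacute; bleu!&raquo;</p>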
+
+get_text()
+----------
+
+If you only want the text part of a document or tag, you can use the
+``get_text()`` method. It returns all the text in a document or beneath
+a tag, including the text inside child tags, as a single Unicode
+string:
+
+::
+
+ markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
+ soup = BeautifulSoup(markup)
+
+ soup.get_text()
+ # u'\nI linked to example.com\n'
+ soup.i.get_text()
+ # u'example.com'
+
+You can specify a string to be used to join the bits of text together:
+
+::
+
+ soup.get_text("|")
+ # u'\nI linked to |example.com|\n'
+
+You can also strip whitespace from the beginning and end of each bit of
+text:
+
+::
+
+ soup.get_text("|", strip=True)
+ # u'I linked to|example.com'
+
+Or use the `.stripped_strings`_ generator and process the resulting
+list of strings yourself:
+
+::
+
+ [text for text in soup.stripped_strings]
+ # [u'I linked to', u'example.com']
+
+.. _指定文档解析器:
+
+Specifying the parser to use
+==============
+
+If you just need to parse some HTML, you can dump the markup into the
+``BeautifulSoup`` constructor and Beautiful Soup will pick a parser for
+you. But there are additional arguments you can pass in to change which
+parser is used.
+
+The first argument to the ``BeautifulSoup`` constructor is a string or
+an open filehandle: the markup you want parsed. The second argument is
+how you'd like the markup parsed. If you don't specify anything,
+Beautiful Soup picks the best parser among those installed on your
+system, in this order of preference: lxml, html5lib, then Python's
+standard library. You can override this in two ways:
+
+ * What type of markup you want to parse. Currently supported are
+   "html", "xml", and "html5".
+ * The name of the parser library you want to use. Currently supported
+   options are "lxml", "html5lib", and "html.parser".
+
+The section `安装解析器`_ (Installing a parser) describes the supported
+parsers and how to install them.
+
+If you ask for a parser that isn't installed, Beautiful Soup will
+ignore your request and pick a different parser. Right now, the only
+supported XML parser is lxml. If you don't have lxml installed, asking
+for an XML parser won't give you one, and asking for "lxml" won't work
+either.
+
+.. _解析器之间的区别:
+
+Differences between parsers
+-----------------
+
+Beautiful Soup presents the same interface to a number of different
+parsers, but the parsers themselves differ. Different parsers can
+produce differently structured trees from the same document. The
+biggest differences are between the HTML parsers and the XML parsers.
+Here's a short document, parsed as HTML:
+
+::
+
+ BeautifulSoup("<a><b /></a>")
+ # <html><head></head><body><a><b></b></a></body></html>
+
+Since an empty <b /> tag is not valid HTML, the parser turns it into a
+<b></b> tag pair.
+
+Here's the same document parsed as XML (running this requires that lxml
+be installed). Note that the empty <b /> tag is left alone, and that
+the document is given an XML declaration instead of being put into an
+<html> tag:
+
+::
+
+ BeautifulSoup("<a><b /></a>", "xml")
+ # <?xml version="1.0" encoding="utf-8"?>
+ # <a><b/></a>
+
+There are also differences between HTML parsers. If you give Beautiful
+Soup a perfectly formed HTML document, the differences don't matter:
+one parser will just be faster than another, but all of them will give
+you a data structure that looks like the original document.
+
+But if the document is not perfectly formed, different parsers give
+different results. In this example, lxml parses a piece of invalid
+markup and simply ignores the dangling </p> tag:
+
+::
+
+ BeautifulSoup("<a></p>", "lxml")
+ # <html><body><a></a></body></html>
+
+Parsing the same document with html5lib gives a different result:
+
+::
+
+ BeautifulSoup("<a></p>", "html5lib")
+ # <html><head></head><body><a><p></p></a></body></html>
+
+Instead of ignoring the dangling </p> tag, html5lib pairs it with an
+opening <p> tag, and it also adds an empty <head> tag to the document
+tree.
+
+Here's the same document parsed with Python's built-in HTML parser:
+
+::
+
+ BeautifulSoup("<a></p>", "html.parser")
+ # <a></a>
+
+Like lxml [7]_, this parser ignores the closing </p> tag. Unlike
+html5lib, it makes no attempt to create a well-formed document or to
+wrap the fragment in a <body> tag; unlike lxml, it doesn't even bother
+to add an <html> tag.
+
+Since the fragment "<a></p>" is invalid, none of these techniques is
+the one "correct" way to handle it. The html5lib parser uses techniques
+that are part of the HTML5 standard, so it has the best claim on being
+"correct", but all three results are legitimate.
+
+Differences between parsers can affect your script. If you're planning
+on distributing your script to other people, you should mention in your
+code which parser you used, to reduce unnecessary surprises.
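+
+One way to do that is to pin a preferred parser while degrading
+gracefully when it isn't installed. A sketch (``make_soup`` and the
+parser list here are just examples):
+
+::
+
+ from bs4 import BeautifulSoup, FeatureNotFound
+
+ def make_soup(markup, parsers=("lxml", "html.parser")):
+     # Try each parser in order; FeatureNotFound is raised when the
+     # requested parser isn't available on this system.
+     for parser in parsers:
+         try:
+             return BeautifulSoup(markup, parser)
+         except FeatureNotFound:
+             continue
+     raise RuntimeError("No usable parser found")
+
+ soup = make_soup("<a></p>")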
+
+.. _编码:
+
+Encodings
+====
+
+Any HTML or XML document is written in a specific encoding, such as
+ASCII or UTF-8. But when you load a document into Beautiful Soup, it is
+converted to Unicode:
+
+::
+
+ markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
+ soup = BeautifulSoup(markup)
+ soup.h1
+ # <h1>Sacré bleu!</h1>
+ soup.h1.string
+ # u'Sacr\xe9 bleu!'
+
+It's not magic (though it is neat). Beautiful Soup uses a sub-library
+called `Unicode, Dammit!`_ to detect a document's encoding and convert
+it to Unicode. The detected encoding is recorded in the
+``.original_encoding`` attribute of the ``BeautifulSoup`` object:
+
+::
+
+ soup.original_encoding
+ # 'utf-8'
+
+`Unicode, Dammit!`_ guesses correctly most of the time, but sometimes
+it makes mistakes. Sometimes it guesses correctly, but only after a
+slow byte-by-byte search of the document. If you happen to know a
+document's encoding ahead of time, you can avoid mistakes and delays by
+passing the ``from_encoding`` argument to the ``BeautifulSoup``
+constructor.
+
+Here's a document written in ISO-8859-8. The document is so short that
+Beautiful Soup misidentifies it as ISO-8859-7:
+
+::
+
+ markup = b"<h1>\xed\xe5\xec\xf9</h1>"
+ soup = BeautifulSoup(markup)
+ soup.h1
+ # <h1>νεμω</h1>
+ soup.original_encoding
+ # 'ISO-8859-7'
+
+We can fix this by passing in the correct ``from_encoding``:
+
+::
+
+ soup = BeautifulSoup(markup, from_encoding="iso-8859-8")
+ soup.h1
+ # <h1>םולש</h1>
+ soup.original_encoding
+ # 'iso8859-8'
+
+If you don't know the correct encoding, but you know that the guess is
+wrong (the output is still garbled), you can pass the wrong guesses in
+as ``exclude_encodings``, and the document won't be decoded with those
+encodings. (Translator's note: with the bad guesses excluded, Beautiful
+Soup has a better chance of guessing the right encoding.)
+
+::
+
+ soup = BeautifulSoup(markup, exclude_encodings=["ISO-8859-7"])
+ soup.h1
+ # <h1>םולש</h1>
+ soup.original_encoding
+ # 'WINDOWS-1255'
+
+The new guess is Windows-1255, which isn't 100% correct, but
+Windows-1255 is a superset of ISO-8859-8, so it's close enough and
+works fine. (``exclude_encodings`` is a new feature in Beautiful Soup
+4.4.0.)
+
+In rare cases (usually when a UTF-8 document contains text in a
+completely different encoding), the only way to get Unicode may be to
+replace some characters with the special character "REPLACEMENT
+CHARACTER" (U+FFFD, �) [9]_. If Unicode, Dammit needed to do this, it
+sets the ``.contains_replacement_characters`` attribute to ``True`` on
+the ``UnicodeDammit`` or ``BeautifulSoup`` object. This lets you know
+that some of the original content was lost in the conversion to
+Unicode. If a document contains � but
+``.contains_replacement_characters`` is ``False``, the � was in the
+original document and doesn't stand in for lost data.
+
+Output encoding
+--------
+
+When you write a document out of Beautiful Soup, you get a UTF-8
+document, even if the input document wasn't in UTF-8 to begin with.
+Here's a document written in the Latin-1 encoding:
+
+::
+
+ markup = b'''
+ <html>
+  <head>
+   <meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
+  </head>
+  <body>
+   <p>Sacr\xe9 bleu!</p>
+  </body>
+ </html>
+ '''
+
+ soup = BeautifulSoup(markup)
+ print(soup.prettify())
+ # <html>
+ # <head>
+ # <meta content="text/html; charset=utf-8" http-equiv="Content-type" />
+ # </head>
+ # <body>
+ # <p>
+ # Sacré bleu!
+ # </p>
+ # </body>
+ # </html>
+
+Note that the <meta> tag has been rewritten to reflect the fact that
+the document is now in UTF-8.
+
+If you don't want UTF-8, you can pass an encoding into ``prettify()``:
+
+::
+
+ print(soup.prettify("latin-1"))
+ # <html>
+ # <head>
+ # <meta content="text/html; charset=latin-1" http-equiv="Content-type" />
+ # ...
+
+You can also call ``encode()`` on the ``BeautifulSoup`` object, or any
+element in the soup, just as if it were a Python string:
+
+::
+
+ soup.p.encode("latin-1")
+ # '<p>Sacr\xe9 bleu!</p>'
+
+ soup.p.encode("utf-8")
+ # '<p>Sacr\xc3\xa9 bleu!</p>'
+
+Any characters that can't be represented in your chosen encoding are
+converted into numeric XML entity references. Here's a document that
+includes the Unicode character SNOWMAN:
+
+::
+
+ markup = u"<b>\N{SNOWMAN}</b>"
+ snowman_soup = BeautifulSoup(markup)
+ tag = snowman_soup.b
+
+The SNOWMAN character can be part of a UTF-8 document (it looks like
+☃), but there's no representation for it in ISO-Latin-1 or ASCII, so
+it's converted into "&#9731;" for those encodings:
+
+::
+
+ print(tag.encode("utf-8"))
+ # <b>☃</b>
+
+ print(tag.encode("latin-1"))
+ # <b>&#9731;</b>
+
+ print(tag.encode("ascii"))
+ # <b>&#9731;</b>
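+
+When writing a document to disk, remember that ``encode()`` (and
+``prettify()`` with an explicit encoding) return bytestrings, so the
+file should be opened in binary mode. A sketch, assuming ``soup`` is
+any parsed document:
+
+::
+
+ # encode() returns bytes, so open the file in binary mode.
+ with open("output.html", "wb") as out:
+     out.write(soup.encode("utf-8"))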
+
+Unicode, Dammit!
+-----------------------------
+
+(Translator's note: ``UnicodeDammit`` is the library Beautiful Soup
+uses internally to guess document encodings.)
+
+You can use Unicode, Dammit without using Beautiful Soup. It's useful
+whenever you have data in an unknown encoding and you just want it to
+become Unicode:
+
+::
+
+ from bs4 import UnicodeDammit
+ dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
+ print(dammit.unicode_markup)
+ # Sacré bleu!
+ dammit.original_encoding
+ # 'utf-8'
+
+Unicode, Dammit's guesses get a lot more accurate if you install the
+``chardet`` or ``cchardet`` Python libraries. The more data you give
+it, the more accurately it guesses. If you have your own suspicions as
+to what the encoding might be, you can pass them in as a list, and
+those encodings are tried first:
+
+::
+
+
+ dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
+ print(dammit.unicode_markup)
+ # Sacré bleu!
+ dammit.original_encoding
+ # 'latin-1'
+
+Unicode, Dammit has two special features that Beautiful Soup doesn't
+use.
+
+Smart quotes
+...........
+
+You can use Unicode, Dammit to convert Microsoft smart quotes [10]_ to
+HTML or XML entities:
+
+::
+
+ markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
+ # u'<p>I just &ldquo;love&rdquo; Microsoft Word&rsquo;s smart quotes</p>'
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup
+ # u'<p>I just &#x201C;love&#x201D; Microsoft Word&#x2019;s smart quotes</p>'
+
+You can also convert smart quotes to ASCII quotes:
+
+::
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii").unicode_markup
+ # u'<p>I just "love" Microsoft Word\'s smart quotes</p>'
+
+Hopefully you'll find these features useful, but Beautiful Soup doesn't
+use them. By default, Beautiful Soup converts smart quotes to Unicode
+characters, along with everything else:
+
+::
+
+ UnicodeDammit(markup, ["windows-1252"]).unicode_markup
+ # u'<p>I just \u201clove\u201d Microsoft Word\u2019s smart quotes</p>'
+
+Inconsistent encodings
+...........
+
+Sometimes a document is mostly in UTF-8, but also contains
+Windows-1252 characters such as (again) Microsoft smart quotes [10]_.
+This can happen when a website includes data from multiple sources.
+``UnicodeDammit.detwingle()`` turns such a document into pure UTF-8.
+Here's a simple example:
+
+::
+
+ snowmen = (u"\N{SNOWMAN}" * 3)
+ quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
+ doc = snowmen.encode("utf8") + quote.encode("windows_1252")
+
+This document is a mess: the snowmen are in UTF-8 and the quotes are in
+Windows-1252. You can display the snowmen or the quotes, but not both,
+because they use different encodings:
+
+::
+
+ print(doc)
+ # ☃☃☃�I like snowmen!�
+
+ print(doc.decode("windows-1252"))
+ # â˜ƒâ˜ƒâ˜ƒ“I like snowmen!”
+
+Decoding the document as UTF-8 raises a ``UnicodeDecodeError``, and
+decoding it as Windows-1252 gives you gibberish. Fortunately,
+``UnicodeDammit.detwingle()`` converts the string to pure UTF-8,
+letting you decode it to Unicode and display the snowmen and the quote
+marks simultaneously:
+
+::
+
+ new_doc = UnicodeDammit.detwingle(doc)
+ print(new_doc.decode("utf8"))
+ # ☃☃☃“I like snowmen!”
+
+``UnicodeDammit.detwingle()`` only knows how to handle Windows-1252
+embedded in UTF-8, but that's the most common case.
+
+Note that you must call ``UnicodeDammit.detwingle()`` on your data
+before creating a ``BeautifulSoup`` or ``UnicodeDammit`` object from
+it. Beautiful Soup assumes a document has a single encoding; if you
+feed it a mixed UTF-8/Windows-1252 document, you'll get gibberish like
+â˜ƒâ˜ƒâ˜ƒ“I like snowmen!”.
+
+``UnicodeDammit.detwingle()`` is new in Beautiful Soup 4.1.0.
+
+Comparing objects for equality
+=================
+
+Beautiful Soup considers two ``NavigableString`` or ``Tag`` objects
+equal when they represent the same HTML or XML markup. In this example,
+the two <b> tags are equal, even though they live in different parts of
+the tree, because they both look like "<b>pizza</b>":
+
+::
+
+ markup = "<p>I want <b>pizza</b> and more <b>pizza</b>!</p>"
+ soup = BeautifulSoup(markup, 'html.parser')
+ first_b, second_b = soup.find_all('b')
+ print(first_b == second_b)
+ # True
+
+ print(first_b.previous_element == second_b.previous_element)
+ # False
+
+If you want to know whether two variables refer to exactly the same
+object, use ``is``:
+
+::
+
+ print(first_b is second_b)
+ # False
+
+Copying Beautiful Soup objects
+======================
+
+You can use ``copy.copy()`` to create a copy of any ``Tag`` or
+``NavigableString``:
+
+::
+
+ import copy
+ p_copy = copy.copy(soup.p)
+ print(p_copy)
+ # <p>I want <b>pizza</b> and more <b>pizza</b>!</p>
+
+The copy is considered equal to the original, since it represents the
+same markup, but it's not the same object:
+
+::
+
+ print(soup.p == p_copy)
+ # True
+
+ print(soup.p is p_copy)
+ # False
+
+The only real difference is that the original is part of the document
+tree, while the copy is completely detached and hasn't been added to
+any tree: the effect is the same as calling ``extract()`` on the copy.
+
+::
+
+ print(p_copy.parent)
+ # None
+
+This is because two equal objects can't occupy the same position in the
+tree at the same time.
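+
+Because the copy is parentless, you can insert it anywhere, including
+back into the tree it was copied from. A sketch, reusing the "pizza"
+soup from above:
+
+::
+
+ p_copy = copy.copy(soup.p)
+ soup.append(p_copy)
+ print(soup)
+ # <p>I want <b>pizza</b> and more <b>pizza</b>!</p><p>I want <b>pizza</b> and more <b>pizza</b>!</p>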
+
+.. _解析部分文档:
+
+Parsing only part of a document
+============
+
+Let's say you want to use Beautiful Soup to look at a document's <a>
+tags. It's a waste of time and memory to parse the entire document and
+then go over it again looking for <a> tags. It's much faster to ignore
+everything that isn't an <a> tag in the first place. The
+``SoupStrainer`` class lets you choose which parts of an incoming
+document are parsed, so that only the parts defined by the
+``SoupStrainer`` end up in the tree. Just create a ``SoupStrainer`` and
+pass it into the ``BeautifulSoup`` constructor as the ``parse_only``
+argument.
+
+SoupStrainer
+-------------
+
+The ``SoupStrainer`` class takes the same arguments as a typical search
+method: `name`_, `attrs`_, `recursive`_, `string`_, and `**kwargs`_.
+Here are three ``SoupStrainer`` objects:
+
+::
+
+ from bs4 import SoupStrainer
+
+ only_a_tags = SoupStrainer("a")
+
+ only_tags_with_id_link2 = SoupStrainer(id="link2")
+
+ def is_short_string(string):
+     return len(string) < 10
+
+ only_short_strings = SoupStrainer(string=is_short_string)
+
+Bringing back the "three sisters" document one more time, here's what
+the document looks like when parsed with each of these three
+``SoupStrainer`` objects:
+
+::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+ <body>
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())
+ # <a class="sister" href="http://example.com/elsie" id="link1">
+ # Elsie
+ # </a>
+ # <a class="sister" href="http://example.com/lacie" id="link2">
+ # Lacie
+ # </a>
+ # <a class="sister" href="http://example.com/tillie" id="link3">
+ # Tillie
+ # </a>
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())
+ # <a class="sister" href="http://example.com/lacie" id="link2">
+ # Lacie
+ # </a>
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
+ # Elsie
+ # ,
+ # Lacie
+ # and
+ # Tillie
+ # ...
+ #
+
+You can also pass a ``SoupStrainer`` into any of the methods covered in
+`搜索文档树`_ (Searching the tree). This probably isn't terribly
+useful, but it's worth mentioning:
+
+::
+
+ soup = BeautifulSoup(html_doc)
+ soup.find_all(only_short_strings)
+ # [u'\n\n', u'\n\n', u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
+ # u'\n\n', u'...', u'\n']
+
+Troubleshooting
+========
+
+diagnose()
+----------
+
+If you're having trouble understanding what Beautiful Soup does to a
+document, pass the document into the ``diagnose()`` function (new in
+Beautiful Soup 4.2.0). Beautiful Soup prints a report showing how the
+different parsers handle the document, and tells you which parser is
+being used:
+
+::
+
+ from bs4.diagnose import diagnose
+ data = open("bad.html").read()
+ diagnose(data)
+
+ # Diagnostic running on Beautiful Soup 4.2.0
+ # Python version 2.7.3 (default, Aug 1 2012, 05:16:07)
+ # I noticed that html5lib is not installed. Installing it may help.
+ # Found lxml version 2.3.2.0
+ #
+ # Trying to parse your data with html.parser
+ # Here's what html.parser did with the document:
+ # ...
+
+Just looking at the output of ``diagnose()`` may show you how to solve
+the problem. Even if it doesn't, you can paste the output when asking
+others for help.
+
+Errors when parsing a document
+-------------
+
+There are two kinds of parse errors. There are crashes, where you feed
+Beautiful Soup a document and it raises an exception, usually an
+``HTMLParser.HTMLParseError``. And there is unexpected behavior, where
+the parse tree looks a lot different from the document used to create
+it.
+
+Almost none of these problems turn out to be problems with Beautiful
+Soup itself. That's not because Beautiful Soup is amazingly
+well-written software; it's because Beautiful Soup doesn't include any
+parsing code. The errors come from the parsers it relies on. If one
+parser isn't working on a certain document, the best solution is to try
+a different parser. See `安装解析器`_ (Installing a parser) for
+details.
+
+The most common parse errors are ``HTMLParser.HTMLParseError: malformed
+start tag`` and ``HTMLParser.HTMLParseError: bad end tag``. Both are
+generated by Python's built-in parser, and the solution is to
+`安装lxml或html5lib`_ (install lxml or html5lib).
+
+The most common kind of unexpected behavior is that you can't find a
+tag that you can plainly see in the document: ``find_all()`` returns []
+or ``find()`` returns None. This is another problem with Python's
+built-in parser, which sometimes skips tags it doesn't understand.
+Again, the solution is to `安装lxml或html5lib`_ (install lxml or
+html5lib).
+
+Version mismatch problems
+----------
+
+* ``SyntaxError: Invalid syntax`` (on the line ``ROOT_TAG_NAME =
+  u'[document]'``): caused by running the Python 2 version of Beautiful
+  Soup under Python 3, without converting the code.
+
+* ``ImportError: No module named HTMLParser``: caused by running the
+  Python 2 version of Beautiful Soup under Python 3.
+
+* ``ImportError: No module named html.parser``: caused by running the
+  Python 3 version of Beautiful Soup under Python 2.
+
+* ``ImportError: No module named BeautifulSoup``: caused by running
+  Beautiful Soup 3 code on a system without BeautifulSoup3 installed,
+  or by writing Beautiful Soup 4 code without knowing that the package
+  must be imported from ``bs4``.
+
+* ``ImportError: No module named bs4``: caused by running Beautiful
+  Soup 4 code on a system without BeautifulSoup4 installed.
+
+.. _解析成XML:
+
+Parsing XML
+----------
+
+By default, Beautiful Soup parses documents as HTML. To parse a
+document as XML, pass in "xml" as the second argument to the
+``BeautifulSoup`` constructor:
+
+::
+
+ soup = BeautifulSoup(markup, "xml")
+
+You'll also need to `安装lxml`_ (install lxml).
+
+Other parser problems
+------------
+
+* If your script works on one computer but not another, it's probably
+  because the two computers have different parsers available. For
+  example, one has lxml installed and the other only has html5lib. See
+  `Differences between parsers`_ for why this matters, and fix the
+  problem by specifying a parser in the ``BeautifulSoup`` constructor.
+
+* Because `HTML tags and attributes are case-insensitive
+  <http://www.w3.org/TR/html5/syntax.html#syntax>`_, all three HTML
+  parsers convert tag and attribute names to lowercase: the markup
+  <TAG></TAG> becomes <tag></tag>. If you want to preserve mixed-case
+  tags and attributes, you'll need to use `Parsing XML`_.
+
+Miscellaneous errors
+--------
+
+* ``UnicodeEncodeError: 'charmap' codec can't encode character
+  u'\xfoo' in position bar`` (or just about any other
+  ``UnicodeEncodeError``): this is not a Beautiful Soup problem. There
+  are two main causes. First, your console can't display some Unicode
+  characters; see the `Python wiki
+  <http://wiki.Python.org/moin/PrintFails>`_. Second, you're writing to
+  a file whose encoding doesn't support some Unicode characters; in
+  that case, encode explicitly with ``u.encode("utf8")``.
+
+* ``KeyError: [attr]``: caused by accessing ``tag['attr']`` when the
+  tag doesn't define that attribute. The most common errors are
+  ``KeyError: 'href'`` and ``KeyError: 'class'``. Use
+  ``tag.get('attr')`` if you're not sure the attribute is defined, just
+  as you would with a Python dictionary.
+
+* ``AttributeError: 'ResultSet' object has no attribute 'foo'``: this
+  usually happens because you treated the result of ``find_all()`` as a
+  single tag or string. But ``find_all()`` returns a list (a
+  ``ResultSet``) of tags and strings; you need to iterate over it and
+  look at the ``.foo`` of each one. Or, if you really only want one
+  result, use ``find()`` instead of ``find_all()``.
+
+* ``AttributeError: 'NoneType' object has no attribute 'foo'``: this
+  usually happens because you called ``find()`` and then accessed the
+  ``.foo`` attribute of the result. But in your case ``find()`` didn't
+  find anything, so it returned ``None``. You need to figure out why
+  your ``find()`` call isn't returning anything.
+
+Improving performance
+------------
+
+Beautiful Soup will never be as fast as the parsers it sits on top of.
+If response time is critical, or if computer time is more valuable than
+programmer time, you should forget about Beautiful Soup and work
+directly atop `lxml <http://lxml.de/>`_.
+
+That said, there are things you can do to speed up Beautiful Soup. The
+biggest is to use lxml as the underlying parser: Beautiful Soup parses
+documents significantly faster with lxml than with html5lib or Python's
+built-in parser.
+
+You can speed up encoding detection significantly by installing the
+`cchardet <http://pypi.Python.org/pypi/cchardet/>`_ library.
+
+`Parsing only part of a document`_ won't save you much time parsing the
+document, but it can save a lot of memory, and it makes searching the
+document much faster.
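+
+When in doubt, measure on your own documents, since relative parser
+speed varies with the markup. A sketch of a simple benchmark
+(``page.html`` is a hypothetical sample file):
+
+::
+
+ import timeit
+
+ markup = open("page.html").read()
+ for parser in ("lxml", "html.parser", "html5lib"):
+     # Time 100 full parses of the same document with each parser.
+     seconds = timeit.timeit(
+         lambda: BeautifulSoup(markup, parser), number=100)
+     print(parser, seconds)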
+
+Beautiful Soup 3
+=================
+
+Beautiful Soup 3 is the previous release series, and is no longer being
+actively developed. It's currently packaged by all major Linux
+distributions:
+
+``$ apt-get install python-beautifulsoup``
+
+It's also published through PyPi as ``BeautifulSoup``:
+
+``$ easy_install BeautifulSoup``
+
+``$ pip install BeautifulSoup``
+
+You can also install it from the `Beautiful Soup 3.2.0 source tarball
+<http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz>`_.
+
+The Beautiful Soup 3 documentation is available online `here
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_.
+
+.. _迁移到BS4:
+
+Porting code to BS4
+----------
+
+Most code written against Beautiful Soup 3 will work against Beautiful
+Soup 4 with one simple change--replacing the import of the
+``BeautifulSoup`` object. Change this:
+
+::
+
+ from BeautifulSoup import BeautifulSoup
+
+to this:
+
+::
+
+ from bs4 import BeautifulSoup
+
+* If you get the ``ImportError`` "No module named BeautifulSoup", your
+  problem may be that you're trying to run Beautiful Soup 3 code on a
+  system that only has Beautiful Soup 4 installed.
+
+* If you get the ``ImportError`` "No module named bs4", your problem
+  may be that you're trying to run Beautiful Soup 4 code on a system
+  that only has Beautiful Soup 3 installed.
+
+Although BS4 is mostly backwards-compatible with BS3, most BS3 methods
+have been deprecated and given new names following the `PEP 8 standard
+<http://www.Python.org/dev/peps/pep-0008/>`_. There are numerous
+renames, and a few of the changes are not backwards compatible.
+
+Here's what you need to know when moving from BS3 to BS4:
+
+You need a parser
+............
+
+Beautiful Soup 3 used Python's ``SGMLParser``, a module that was
+removed in Python 3. Beautiful Soup 4 uses the system's
+``html.parser`` by default, but you can plug in lxml or html5lib
+instead. See the `安装解析器`_ (Installing a parser) section for a
+comparison.
+
+Since ``html.parser`` is not the same parser as ``SGMLParser``, BS4 and
+BS3 may produce different object structures for the same document. If
+you then swap ``html.parser`` for lxml or html5lib, the parse tree may
+change yet again. If this happens, you'll have to update the code that
+processes the parse results.
+
+Method names
+............
+
+* ``renderContents`` -> ``encode_contents``
+
+* ``replaceWith`` -> ``replace_with``
+
+* ``replaceWithChildren`` -> ``unwrap``
+
+* ``findAll`` -> ``find_all``
+
+* ``findAllNext`` -> ``find_all_next``
+
+* ``findAllPrevious`` -> ``find_all_previous``
+
+* ``findNext`` -> ``find_next``
+
+* ``findNextSibling`` -> ``find_next_sibling``
+
+* ``findNextSiblings`` -> ``find_next_siblings``
+
+* ``findParent`` -> ``find_parent``
+
+* ``findParents`` -> ``find_parents``
+
+* ``findPrevious`` -> ``find_previous``
+
+* ``findPreviousSibling`` -> ``find_previous_sibling``
+
+* ``findPreviousSiblings`` -> ``find_previous_siblings``
+
+* ``nextSibling`` -> ``next_sibling``
+
+* ``previousSibling`` -> ``previous_sibling``
+
+Some arguments to the Beautiful Soup constructor were renamed for the
+same reasons:
+
+* ``BeautifulSoup(parseOnlyThese=...)`` -> ``BeautifulSoup(parse_only=...)``
+
+* ``BeautifulSoup(fromEncoding=...)`` -> ``BeautifulSoup(from_encoding=...)``
+
+One method was renamed for compatibility with Python 3:
+
+* ``Tag.has_key()`` -> ``Tag.has_attr()``
+
+One attribute was renamed to use more accurate terminology:
+
+* ``Tag.isSelfClosing`` -> ``Tag.is_empty_element``
+
+Three attributes were renamed to avoid colliding with Python reserved
+words. Unlike the others, these changes are not backwards compatible:
+code that used these attributes in BS3 will break on BS4.
+
+* ``UnicodeDammit.unicode`` -> ``UnicodeDammit.unicode_markup``
+
+* ``Tag.next`` -> ``Tag.next_element``
+
+* ``Tag.previous`` -> ``Tag.previous_element``
+
+Generators
+.......
+
+The following generators were given PEP 8-compliant names and turned
+into properties:
+
+* ``childGenerator()`` -> ``children``
+
+* ``nextGenerator()`` -> ``next_elements``
+
+* ``nextSiblingGenerator()`` -> ``next_siblings``
+
+* ``previousGenerator()`` -> ``previous_elements``
+
+* ``previousSiblingGenerator()`` -> ``previous_siblings``
+
+* ``recursiveChildGenerator()`` -> ``descendants``
+
+* ``parentGenerator()`` -> ``parents``
+
+So when migrating to BS4, instead of this:
+
+::
+
+ for parent in tag.parentGenerator():
+     ...
+
+write this:
+
+::
+
+ for parent in tag.parents:
+     ...
+
+(Both forms still work.)
+
+Some of the generators used to yield ``None`` after they were done, and
+then stop. That was a bug; now the generators just stop.
+
+There are two new generators, `.strings 和 stripped_strings`_
+(``.strings`` and ``.stripped_strings``). ``.strings`` yields
+``NavigableString`` objects, and ``.stripped_strings`` yields Python
+strings with surrounding whitespace stripped.
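+
+To make the renames concrete, here's a typical piece of BS3 code and
+its BS4 equivalent (a sketch; ``markup`` stands in for any HTML
+string):
+
+::
+
+ # Beautiful Soup 3:
+ #
+ #  from BeautifulSoup import BeautifulSoup
+ #  soup = BeautifulSoup(markup)
+ #  for a in soup.findAll('a', href=True):
+ #      print a['href']
+
+ # The same code under Beautiful Soup 4:
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(markup, 'html.parser')
+ for a in soup.find_all('a', href=True):
+     print(a['href'])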
+
+XML
+....
+
+BS4 removed the ``BeautifulStoneSoup`` class for parsing XML. To parse
+a document as XML, pass "xml" as the second argument to the
+``BeautifulSoup`` constructor. For the same reason, the
+``BeautifulSoup`` constructor no longer recognizes the ``isHTML``
+argument.
+
+Beautiful Soup's handling of empty-element XML tags has been improved.
+Previously, when parsing XML, you had to explicitly say which tags were
+empty-element tags; the constructor's ``selfClosingTags`` argument is
+no longer recognized. Instead, Beautiful Soup considers any empty tag
+to be an empty-element tag. If you add a child to an empty-element tag,
+it stops being an empty-element tag.
+
+Entities
+.....
+
+Incoming HTML or XML entities are always converted to Unicode
+characters. Beautiful Soup 3 had a number of overlapping ways of
+dealing with entities, which have been removed. The ``BeautifulSoup``
+constructor no longer recognizes the ``smartQuotesTo`` or
+``convertEntities`` arguments. (`Unicode, Dammit!`_ still has
+``smart_quotes_to``, but it now defaults to converting smart quotes to
+Unicode.) The constants ``HTML_ENTITIES``, ``XML_ENTITIES``, and
+``XHTML_ENTITIES`` have been removed, since the feature they configured
+no longer exists.
+
+If you want Unicode characters converted back to HTML entities on
+output, rather than emitted as UTF-8, use the techniques described in
+`Output formatters`_.
+
+Miscellaneous
+.........
+
+`Tag.string`_ now operates recursively. If tag A contains a single tag
+B and nothing else, then A.string is the same as B.string.
+
+`多值属性`_ (Multi-valued attributes) like ``class`` now have lists of
+strings as their values, not strings. This may affect the way you
+search by CSS class.
+
+If you pass a ``find*`` method both the `string 参数`_ (string
+argument) and the `name 参数`_ (name argument), Beautiful Soup searches
+for tags with the given name whose `Tag.string`_ matches the string
+value. The results do not include the strings themselves. Previously,
+Beautiful Soup ignored the tag-specific arguments and searched only for
+the strings.
+
+The ``BeautifulSoup`` constructor no longer supports the markupMassage
+argument. It's now the parser's responsibility to handle markup
+correctly.
+
+The rarely-used alternate parser classes like
+``ICantBelieveItsBeautifulSoup`` and ``BeautifulSOAP`` have been
+removed. It's now entirely the parser's decision how to handle
+ambiguous markup.
+
+``prettify()`` now returns a Unicode string, not a bytestring.
+
+Appendix
+=====
+
+.. _`BeautifulSoup3 文档`: http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html
+.. _name: `name 参数`_
+.. _attrs: `按CSS搜索`_
+.. _recursive: `recursive 参数`_
+.. _string: `string 参数`_
+.. _**kwargs: `keyword 参数`_
+.. _.next_siblings: `.next_siblings 和 .previous_siblings`_
+.. _.previous_siblings: `.next_siblings 和 .previous_siblings`_
+.. _.next_elements: `.next_elements 和 .previous_elements`_
+.. _.previous_elements: `.next_elements 和 .previous_elements`_
+.. _.stripped_strings: `.strings 和 stripped_strings`_
+.. _安装lxml: `安装解析器`_
+.. _安装lxml或html5lib: `安装解析器`_
+.. _编码自动检测: `Unicode, Dammit!`_
+.. _Tag.string: `.string`_
+
+
+.. [1] The Beautiful Soup Google group isn't very active, perhaps
+   because the library is already quite mature, but the author is still
+   happy to help you solve problems.
+.. [2] The document is parsed into a tree, so the next step in parsing
+   is the current node's children.
+.. [3] Filters can be used only as arguments when searching the
+   document; "filter" follows the original text's terminology
+   (translated in the Chinese edition as 过滤器).
+.. [4] A tag element: an HTML tag node in the document, not a text
+   node.
+.. [5] Using pre-order traversal.
+.. [6] CSS selectors are a separate document-query syntax; see
+   http://www.w3school.com.cn/css/css_selector_type.asp for a
+   reference.
+.. [7] The original text says html5lib here; the translator believes
+   this to be a typo in the original document.
+.. [8] "Wrap" here doesn't mean wrapping from the outside: the tag's
+   current contents are wrapped inside a new tag, and the new wrapper
+   stays inside the tag on which `wrap()`_ was called.
+.. [9] Beautiful Soup performs this replacement of undecodable bytes
+   with the replacement character (usually �) automatically. If you
+   need a mixed-encoding document converted perfectly, you'll have to
+   normalize its encodings by hand beforehand.
+.. [10] Smart quotes, common in Microsoft Word: each quotation mark is
+   automatically converted to a left or right (curly) quote according
+   to the order in which it appears in the text.
+
+Original: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+
+Translation: Deron Wang
+
+See also the `BeautifulSoup3 文档`_ (Beautiful Soup 3 documentation, in Chinese)
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/Makefile b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/Makefile
new file mode 100644
index 00000000000..8c833d2cedb
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/Makefile
@@ -0,0 +1,130 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." 
+ make -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/6.1.jpg b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/6.1.jpg Binary files differnew file mode 100644 index 00000000000..97014f0ec04 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/6.1.jpg diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/check_doc.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/check_doc.py new file mode 100644 index 00000000000..43c470cb1e0 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/check_doc.py @@ -0,0 +1,28 @@ +from pdb import set_trace +class Parser(object): + + def __init__(self): + self.in_code = False + self.code = [] + + def parse(self, x): + for line in x: + self.parse_line(line) + + def parse_line(self, line): + line = line[:-1] + is_code = False + if self.in_code: + if line.strip() and not line.startswith(" "): + self.in_code = False + else: + is_code = True + elif line.strip().endswith("::"): + self.in_code = True + + if is_code: + self.code.append(line[1:]) + +parser = Parser() +parser.parse(open("index.rst").readlines()) +print("\n".join(parser.code)) diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/conf.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/conf.py new file mode 100644 index 00000000000..7ba53ac96f7 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/conf.py @@ -0,0 +1,256 @@ +# -*- coding: utf-8 -*- +# +# Beautiful Soup documentation build configuration file, created by +# sphinx-quickstart on Thu Jan 26 11:22:55 2012. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. 
They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Beautiful Soup' +copyright = u'2004-2020, Leonard Richardson' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '4' +# The full version, including alpha/beta/rc tags. +release = '4.9.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. 
+#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'BeautifulSoupdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation', + u'Leonard Richardson', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'beautifulsoup', u'Beautiful Soup Documentation', + [u'Leonard Richardson'], 1) +] + + +# -- Options for Epub output --------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = u'Beautiful Soup' +epub_author = u'Leonard Richardson' +epub_publisher = u'Leonard Richardson' +epub_copyright = u'2012, Leonard Richardson' + +# The language of the text. It defaults to the language option +# or en if the language is not set. +#epub_language = '' + +# The scheme of the identifier. Typical schemes are ISBN or URL. +#epub_scheme = '' + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +#epub_identifier = '' + +# A unique identification for the text. +#epub_uid = '' + +# HTML files that should be inserted before the pages created by sphinx. 
+# The format is a list of tuples containing the path and title.
+#epub_pre_files = []
+
+# HTML files that should be inserted after the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#epub_post_files = []
+
+# A list of files that should not be packed into the epub file.
+#epub_exclude_files = []
+
+# The depth of the table of contents in toc.ncx.
+#epub_tocdepth = 3
+
+# Allow duplicate toc entries.
+#epub_tocdup = True
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/index.rst b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/index.rst
new file mode 100644
index 00000000000..34ec7cf79ca
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/doc/source/index.rst
@@ -0,0 +1,3505 @@
+.. _manual:
+
+Beautiful Soup Documentation
+============================
+
+.. image:: 6.1.jpg
+   :align: right
+   :alt: "The Fish-Footman began by producing from under his arm a great letter, nearly as large as himself."
+
+`Beautiful Soup <http://www.crummy.com/software/BeautifulSoup/>`_ is a
+Python library for pulling data out of HTML and XML files. It works
+with your favorite parser to provide idiomatic ways of navigating,
+searching, and modifying the parse tree. It commonly saves programmers
+hours or days of work.
+
+These instructions illustrate all major features of Beautiful Soup 4,
+with examples. I show you what the library is good for, how it works,
+how to use it, how to make it do what you want, and what to do when it
+violates your expectations.
+
+This document covers Beautiful Soup version 4.9.2. The examples in
+this documentation should work the same way in Python 2.7 and Python
+3.8.
+
+You might be looking for the documentation for `Beautiful Soup 3
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_.
+If so, you should know that Beautiful Soup 3 is no longer being
+developed and that support for it will be dropped on or after December
+31, 2020. If you want to learn about the differences between Beautiful
+Soup 3 and Beautiful Soup 4, see `Porting code to BS4`_.
+
+This documentation has been translated into other languages by
+Beautiful Soup users:
+
+* `这篇文档当然还有中文版. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/>`_
+* このページは日本語で利用できます(`外部リンク <http://kondou.com/BS4/>`_)
+* `이 문서는 한국어 번역도 가능합니다. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.ko/>`_
+* `Este documento também está disponível em Português do Brasil. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.ptbr>`_
+* `Эта документация доступна на русском языке. <https://www.crummy.com/software/BeautifulSoup/bs4/doc.ru/>`_
+
+Getting help
+------------
+
+If you have questions about Beautiful Soup, or run into problems,
+`send mail to the discussion group
+<https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup>`_. If
+your problem involves parsing an HTML document, be sure to mention
+:ref:`what the diagnose() function says <diagnose>` about
+that document.
+
+Quick Start
+===========
+
+Here's an HTML document I'll be using as an example throughout this
+document. 
It's part of a story from `Alice in Wonderland`:: + + html_doc = """<html><head><title>The Dormouse's story</title></head> + <body> + <p class="title"><b>The Dormouse's story</b></p> + + <p class="story">Once upon a time there were three little sisters; and their names were + <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, + <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and + <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; + and they lived at the bottom of a well.</p> + + <p class="story">...</p> + """ + +Running the "three sisters" document through Beautiful Soup gives us a +``BeautifulSoup`` object, which represents the document as a nested +data structure:: + + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_doc, 'html.parser') + + print(soup.prettify()) + # <html> + # <head> + # <title> + # The Dormouse's story + # </title> + # </head> + # <body> + # <p class="title"> + # <b> + # The Dormouse's story + # </b> + # </p> + # <p class="story"> + # Once upon a time there were three little sisters; and their names were + # <a class="sister" href="http://example.com/elsie" id="link1"> + # Elsie + # </a> + # , + # <a class="sister" href="http://example.com/lacie" id="link2"> + # Lacie + # </a> + # and + # <a class="sister" href="http://example.com/tillie" id="link3"> + # Tillie + # </a> + # ; and they lived at the bottom of a well. + # </p> + # <p class="story"> + # ... + # </p> + # </body> + # </html> + +Here are some simple ways to navigate that data structure:: + + soup.title + # <title>The Dormouse's story</title> + + soup.title.name + # u'title' + + soup.title.string + # u'The Dormouse's story' + + soup.title.parent.name + # u'head' + + soup.p + # <p class="title"><b>The Dormouse's story</b></p> + + soup.p['class'] + # u'title' + + soup.a + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + + soup.find_all('a') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.find(id="link3") + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> + +One common task is extracting all the URLs found within a page's <a> tags:: + + for link in soup.find_all('a'): + print(link.get('href')) + # http://example.com/elsie + # http://example.com/lacie + # http://example.com/tillie + +Another common task is extracting all the text from a page:: + + print(soup.get_text()) + # The Dormouse's story + # + # The Dormouse's story + # + # Once upon a time there were three little sisters; and their names were + # Elsie, + # Lacie and + # Tillie; + # and they lived at the bottom of a well. + # + # ... + +Does this look like what you need? If so, read on. + +Installing Beautiful Soup +========================= + +If you're using a recent version of Debian or Ubuntu Linux, you can +install Beautiful Soup with the system package manager: + +:kbd:`$ apt-get install python-bs4` (for Python 2) + +:kbd:`$ apt-get install python3-bs4` (for Python 3) + +Beautiful Soup 4 is published through PyPi, so if you can't install it +with the system packager, you can install it with ``easy_install`` or +``pip``. The package name is ``beautifulsoup4``, and the same package +works on Python 2 and Python 3. 
Make sure you use the right version of +``pip`` or ``easy_install`` for your Python version (these may be named +``pip3`` and ``easy_install3`` respectively if you're using Python 3). + +:kbd:`$ easy_install beautifulsoup4` + +:kbd:`$ pip install beautifulsoup4` + +(The ``BeautifulSoup`` package is `not` what you want. That's +the previous major release, `Beautiful Soup 3`_. Lots of software uses +BS3, so it's still available, but if you're writing new code you +should install ``beautifulsoup4``.) + +If you don't have ``easy_install`` or ``pip`` installed, you can +`download the Beautiful Soup 4 source tarball +<http://www.crummy.com/software/BeautifulSoup/download/4.x/>`_ and +install it with ``setup.py``. + +:kbd:`$ python setup.py install` + +If all else fails, the license for Beautiful Soup allows you to +package the entire library with your application. You can download the +tarball, copy its ``bs4`` directory into your application's codebase, +and use Beautiful Soup without installing it at all. + +I use Python 2.7 and Python 3.8 to develop Beautiful Soup, but it +should work with other recent versions. + +Problems after installation +--------------------------- + +Beautiful Soup is packaged as Python 2 code. When you install it for +use with Python 3, it's automatically converted to Python 3 code. If +you don't install the package, the code won't be converted. There have +also been reports on Windows machines of the wrong version being +installed. + +If you get the ``ImportError`` "No module named HTMLParser", your +problem is that you're running the Python 2 version of the code under +Python 3. + +If you get the ``ImportError`` "No module named html.parser", your +problem is that you're running the Python 3 version of the code under +Python 2. + +In both cases, your best bet is to completely remove the Beautiful +Soup installation from your system (including any directory created +when you unzipped the tarball) and try the installation again. + +If you get the ``SyntaxError`` "Invalid syntax" on the line +``ROOT_TAG_NAME = u'[document]'``, you need to convert the Python 2 +code to Python 3. You can do this either by installing the package: + +:kbd:`$ python3 setup.py install` + +or by manually running Python's ``2to3`` conversion script on the +``bs4`` directory: + +:kbd:`$ 2to3-3.2 -w bs4` + +.. _parser-installation: + + +Installing a parser +------------------- + +Beautiful Soup supports the HTML parser included in Python's standard +library, but it also supports a number of third-party Python parsers. +One is the `lxml parser <http://lxml.de/>`_. Depending on your setup, +you might install lxml with one of these commands: + +:kbd:`$ apt-get install python-lxml` + +:kbd:`$ easy_install lxml` + +:kbd:`$ pip install lxml` + +Another alternative is the pure-Python `html5lib parser +<http://code.google.com/p/html5lib/>`_, which parses HTML the way a +web browser does. 
Depending on your setup, you might install html5lib +with one of these commands: + +:kbd:`$ apt-get install python-html5lib` + +:kbd:`$ easy_install html5lib` + +:kbd:`$ pip install html5lib` + +This table summarizes the advantages and disadvantages of each parser library: + ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ +| Parser | Typical usage | Advantages | Disadvantages | ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ +| Python's html.parser | ``BeautifulSoup(markup, "html.parser")`` | * Batteries included | * Not as fast as lxml, | +| | | * Decent speed | less lenient than | +| | | * Lenient (As of Python 2.7.3 | html5lib. | +| | | and 3.2.) | | ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ +| lxml's HTML parser | ``BeautifulSoup(markup, "lxml")`` | * Very fast | * External C dependency | +| | | * Lenient | | ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ +| lxml's XML parser | ``BeautifulSoup(markup, "lxml-xml")`` | * Very fast | * External C dependency | +| | ``BeautifulSoup(markup, "xml")`` | * The only currently supported | | +| | | XML parser | | ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ +| html5lib | ``BeautifulSoup(markup, "html5lib")`` | * Extremely lenient | * Very slow | +| | | * Parses pages the same way a | * External Python | +| | | web browser does | dependency | +| | | * Creates valid HTML5 | | ++----------------------+--------------------------------------------+--------------------------------+--------------------------+ + +If you can, I recommend you install and use lxml for speed. If you're +using a very old version of Python -- earlier than 2.7.3 or 3.2.2 -- +it's `essential` that you install lxml or html5lib. Python's built-in +HTML parser is just not very good in those old versions. + +Note that if a document is invalid, different parsers will generate +different Beautiful Soup trees for it. See `Differences +between parsers`_ for details. + +Making the soup +=============== + +To parse a document, pass it into the ``BeautifulSoup`` +constructor. You can pass in a string or an open filehandle:: + + from bs4 import BeautifulSoup + + with open("index.html") as fp: + soup = BeautifulSoup(fp, 'html.parser') + + soup = BeautifulSoup("<html>a web page</html>", 'html.parser') + +First, the document is converted to Unicode, and HTML entities are +converted to Unicode characters:: + + print(BeautifulSoup("<html><head></head><body>Sacré bleu!</body></html>", "html.parser")) + # <html><head></head><body>Sacré bleu!</body></html> + +Beautiful Soup then parses the document using the best available +parser. It will use an HTML parser unless you specifically tell it to +use an XML parser. (See `Parsing XML`_.) + +Kinds of objects +================ + +Beautiful Soup transforms a complex HTML document into a complex tree +of Python objects. But you'll only ever have to deal with about four +`kinds` of objects: ``Tag``, ``NavigableString``, ``BeautifulSoup``, +and ``Comment``. + +.. 
_Tag: + +``Tag`` +------- + +A ``Tag`` object corresponds to an XML or HTML tag in the original document:: + + soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser') + tag = soup.b + type(tag) + # <class 'bs4.element.Tag'> + +Tags have a lot of attributes and methods, and I'll cover most of them +in `Navigating the tree`_ and `Searching the tree`_. For now, the most +important features of a tag are its name and attributes. + +Name +^^^^ + +Every tag has a name, accessible as ``.name``:: + + tag.name + # 'b' + +If you change a tag's name, the change will be reflected in any HTML +markup generated by Beautiful Soup:: + + tag.name = "blockquote" + tag + # <blockquote class="boldest">Extremely bold</blockquote> + +Attributes +^^^^^^^^^^ + +A tag may have any number of attributes. The tag ``<b +id="boldest">`` has an attribute "id" whose value is +"boldest". You can access a tag's attributes by treating the tag like +a dictionary:: + + tag = BeautifulSoup('<b id="boldest">bold</b>', 'html.parser').b + tag['id'] + # 'boldest' + +You can access that dictionary directly as ``.attrs``:: + + tag.attrs + # {'id': 'boldest'} + +You can add, remove, and modify a tag's attributes. Again, this is +done by treating the tag as a dictionary:: + + tag['id'] = 'verybold' + tag['another-attribute'] = 1 + tag + # <b another-attribute="1" id="verybold"></b> + + del tag['id'] + del tag['another-attribute'] + tag + # <b>bold</b> + + tag['id'] + # KeyError: 'id' + tag.get('id') + # None + +.. _multivalue: + +Multi-valued attributes +&&&&&&&&&&&&&&&&&&&&&&& + +HTML 4 defines a few attributes that can have multiple values. HTML 5 +removes a couple of them, but defines a few more. The most common +multi-valued attribute is ``class`` (that is, a tag can have more than +one CSS class). Others include ``rel``, ``rev``, ``accept-charset``, +``headers``, and ``accesskey``. 
Beautiful Soup presents the value(s)
+of a multi-valued attribute as a list::
+
+ css_soup = BeautifulSoup('<p class="body"></p>', 'html.parser')
+ css_soup.p['class']
+ # ['body']
+
+ css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
+ css_soup.p['class']
+ # ['body', 'strikeout']
+
+If an attribute `looks` like it has more than one value, but it's not
+a multi-valued attribute as defined by any version of the HTML
+standard, Beautiful Soup will leave the attribute alone::
+
+ id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
+ id_soup.p['id']
+ # 'my id'
+
+When you turn a tag back into a string, multiple attribute values are
+consolidated::
+
+ rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'html.parser')
+ rel_soup.a['rel']
+ # ['index']
+ rel_soup.a['rel'] = ['index', 'contents']
+ print(rel_soup.p)
+ # <p>Back to the <a rel="index contents">homepage</a></p>
+
+You can disable this by passing ``multi_valued_attributes=None`` as a
+keyword argument into the ``BeautifulSoup`` constructor::
+
+ no_list_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser', multi_valued_attributes=None)
+ no_list_soup.p['class']
+ # 'body strikeout'
+
+You can use ``get_attribute_list`` to get a value that's always a
+list, whether or not it's a multi-valued attribute::
+
+ id_soup.p.get_attribute_list('id')
+ # ["my id"]
+
+If you parse a document as XML, there are no multi-valued attributes::
+
+ xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
+ xml_soup.p['class']
+ # 'body strikeout'
+
+Again, you can configure this using the ``multi_valued_attributes`` argument::
+
+ class_is_multi = { '*' : 'class'}
+ xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml', multi_valued_attributes=class_is_multi)
+ xml_soup.p['class']
+ # ['body', 'strikeout']
+
+You probably won't need to do this, but if you do, use the defaults as
+a guide. They implement the rules described in the HTML specification::
+
+ from bs4.builder import builder_registry
+ builder_registry.lookup('html').DEFAULT_CDATA_LIST_ATTRIBUTES
+
+
+``NavigableString``
+-------------------
+
+A string corresponds to a bit of text within a tag. Beautiful Soup
+uses the ``NavigableString`` class to contain these bits of text::
+
+ soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
+ tag = soup.b
+ tag.string
+ # 'Extremely bold'
+ type(tag.string)
+ # <class 'bs4.element.NavigableString'>
+
+A ``NavigableString`` is just like a Python Unicode string, except
+that it also supports some of the features described in `Navigating
+the tree`_ and `Searching the tree`_. You can convert a
+``NavigableString`` to a Unicode string with ``unicode()`` (in
+Python 2) or ``str()`` (in Python 3)::
+
+ unicode_string = str(tag.string)
+ unicode_string
+ # 'Extremely bold'
+ type(unicode_string)
+ # <class 'str'>
+
+You can't edit a string in place, but you can replace one string with
+another, using :ref:`replace_with()`::
+
+ tag.string.replace_with("No longer bold")
+ tag
+ # <b class="boldest">No longer bold</b>
+
+``NavigableString`` supports most of the features described in
+`Navigating the tree`_ and `Searching the tree`_, but not all of
+them. In particular, since a string can't contain anything (the way a
+tag may contain a string or another tag), strings don't support the
+``.contents`` or ``.string`` attributes, or the ``find()`` method.
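+
+For example, a ``NavigableString`` can search and navigate `relative
+to` itself, but it has no children of its own (a quick sketch)::
+
+ soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
+ soup.b.string.find_parent("b")
+ # <b class="boldest">Extremely bold</b>
+
+ soup.b.string.contents
+ # AttributeError: 'NavigableString' object has no attribute 'contents'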
+
+If you want to use a ``NavigableString`` outside of Beautiful Soup,
+you should call ``str()`` on it (``unicode()`` in Python 2) to turn it
+into a normal Python string. If you don't, your string will carry
+around a reference to the entire Beautiful Soup parse tree, even when
+you're done using Beautiful Soup. This is a big waste of memory.
+
+``BeautifulSoup``
+-----------------
+
+The ``BeautifulSoup`` object represents the parsed document as a
+whole. For most purposes, you can treat it as a :ref:`Tag`
+object. This means it supports most of the methods described in
+`Navigating the tree`_ and `Searching the tree`_.
+
+You can also pass a ``BeautifulSoup`` object into one of the methods
+defined in `Modifying the tree`_, just as you would a :ref:`Tag`. This
+lets you do things like combine two parsed documents::
+
+ doc = BeautifulSoup("<document><content/>INSERT FOOTER HERE</document>", "xml")
+ footer = BeautifulSoup("<footer>Here's the footer</footer>", "xml")
+ doc.find(text="INSERT FOOTER HERE").replace_with(footer)
+ # 'INSERT FOOTER HERE'
+ print(doc)
+ # <?xml version="1.0" encoding="utf-8"?>
+ # <document><content/><footer>Here's the footer</footer></document>
+
+Since the ``BeautifulSoup`` object doesn't correspond to an actual
+HTML or XML tag, it has no name and no attributes. But sometimes it's
+useful to look at its ``.name``, so it's been given the special
+``.name`` "[document]"::
+
+ soup.name
+ # '[document]'
+
+Comments and other special strings
+----------------------------------
+
+``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost
+everything you'll see in an HTML or XML file, but there are a few
+leftover bits. The main one you'll probably encounter
+is the comment::
+
+ markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
+ soup = BeautifulSoup(markup, 'html.parser')
+ comment = soup.b.string
+ type(comment)
+ # <class 'bs4.element.Comment'>
+
+The ``Comment`` object is just a special type of ``NavigableString``::
+
+ comment
+ # 'Hey, buddy. Want to buy a used parser?'
+
+But when it appears as part of an HTML document, a ``Comment`` is
+displayed with special formatting::
+
+ print(soup.b.prettify())
+ # <b>
+ #  <!--Hey, buddy. Want to buy a used parser?-->
+ # </b>
+
+Beautiful Soup also defines classes called ``Stylesheet``, ``Script``,
+and ``TemplateString``, for embedded CSS stylesheets (any strings
+found inside a ``<style>`` tag), embedded JavaScript (any strings
+found in a ``<script>`` tag), and HTML templates (any strings inside a
+``<template>`` tag). These classes work exactly the same way as
+``NavigableString``; their only purpose is to make it easier to pick
+out the main body of the page, by ignoring strings that represent
+something else. `(These classes are new in Beautiful Soup 4.9.0, and
+the html5lib parser doesn't use them.)`
+
+Beautiful Soup defines classes for anything else that might show up in
+an XML document: ``CData``, ``ProcessingInstruction``,
+``Declaration``, and ``Doctype``. Like ``Comment``, these classes
+are subclasses of ``NavigableString`` that add something extra to the
+string. 
Here's an example that replaces the comment with a CDATA
+block::
+
+ from bs4 import CData
+ cdata = CData("A CDATA block")
+ comment.replace_with(cdata)
+
+ print(soup.b.prettify())
+ # <b>
+ #  <![CDATA[A CDATA block]]>
+ # </b>
+
+
+Navigating the tree
+===================
+
+Here's the "Three sisters" HTML document again::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+ <body>
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(html_doc, 'html.parser')
+
+I'll use this as an example to show you how to move from one part of
+a document to another.
+
+Going down
+----------
+
+Tags may contain strings and other tags. These elements are the tag's
+`children`. Beautiful Soup provides a lot of different attributes for
+navigating and iterating over a tag's children.
+
+Note that Beautiful Soup strings don't support any of these
+attributes, because a string can't have children.
+
+Navigating using tag names
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The simplest way to navigate the parse tree is to say the name of the
+tag you want. If you want the <head> tag, just say ``soup.head``::
+
+ soup.head
+ # <head><title>The Dormouse's story</title></head>
+
+ soup.title
+ # <title>The Dormouse's story</title>
+
+You can use this trick again and again to zoom in on a certain part
+of the parse tree. This code gets the first <b> tag beneath the <body> tag::
+
+ soup.body.b
+ # <b>The Dormouse's story</b>
+
+Using a tag name as an attribute will give you only the `first` tag by that
+name::
+
+ soup.a
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+If you need to get `all` the <a> tags, or anything more complicated
+than the first tag with a certain name, you'll need to use one of the
+methods described in `Searching the tree`_, such as ``find_all()``::
+
+ soup.find_all('a')
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+``.contents`` and ``.children``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A tag's children are available in a list called ``.contents``::
+
+ head_tag = soup.head
+ head_tag
+ # <head><title>The Dormouse's story</title></head>
+
+ head_tag.contents
+ # [<title>The Dormouse's story</title>]
+
+ title_tag = head_tag.contents[0]
+ title_tag
+ # <title>The Dormouse's story</title>
+ title_tag.contents
+ # ['The Dormouse's story']
+
+The ``BeautifulSoup`` object itself has children. 
In this case, the
+<html> tag is the child of the ``BeautifulSoup`` object::
+
+ len(soup.contents)
+ # 1
+ soup.contents[0].name
+ # 'html'
+
+A string does not have ``.contents``, because it can't contain
+anything::
+
+ text = title_tag.contents[0]
+ text.contents
+ # AttributeError: 'NavigableString' object has no attribute 'contents'
+
+Instead of getting them as a list, you can iterate over a tag's
+children using the ``.children`` generator::
+
+ for child in title_tag.children:
+     print(child)
+ # The Dormouse's story
+
+``.descendants``
+^^^^^^^^^^^^^^^^
+
+The ``.contents`` and ``.children`` attributes only consider a tag's
+`direct` children. For instance, the <head> tag has a single direct
+child--the <title> tag::
+
+ head_tag.contents
+ # [<title>The Dormouse's story</title>]
+
+But the <title> tag itself has a child: the string "The Dormouse's
+story". There's a sense in which that string is also a child of the
+<head> tag. The ``.descendants`` attribute lets you iterate over `all`
+of a tag's children, recursively: its direct children, the children of
+its direct children, and so on::
+
+ for child in head_tag.descendants:
+     print(child)
+ # <title>The Dormouse's story</title>
+ # The Dormouse's story
+
+The <head> tag has only one child, but it has two descendants: the
+<title> tag and the <title> tag's child. The ``BeautifulSoup`` object
+only has one direct child (the <html> tag), but it has a whole lot of
+descendants::
+
+ len(list(soup.children))
+ # 1
+ len(list(soup.descendants))
+ # 26
+
+.. _.string:
+
+``.string``
+^^^^^^^^^^^
+
+If a tag has only one child, and that child is a ``NavigableString``,
+the child is made available as ``.string``::
+
+ title_tag.string
+ # 'The Dormouse's story'
+
+If a tag's only child is another tag, and `that` tag has a
+``.string``, then the parent tag is considered to have the same
+``.string`` as its child::
+
+ head_tag.contents
+ # [<title>The Dormouse's story</title>]
+
+ head_tag.string
+ # 'The Dormouse's story'
+
+If a tag contains more than one thing, then it's not clear what
+``.string`` should refer to, so ``.string`` is defined to be
+``None``::
+
+ print(soup.html.string)
+ # None
+
+.. _string-generators:
+
+``.strings`` and ``.stripped_strings``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If there's more than one thing inside a tag, you can still look at
+just the strings. Use the ``.strings`` generator::
+
+ for string in soup.strings:
+     print(repr(string))
+ # '\n'
+ # "The Dormouse's story"
+ # '\n'
+ # '\n'
+ # "The Dormouse's story"
+ # '\n'
+ # 'Once upon a time there were three little sisters; and their names were\n'
+ # 'Elsie'
+ # ',\n'
+ # 'Lacie'
+ # ' and\n'
+ # 'Tillie'
+ # ';\nand they lived at the bottom of a well.'
+ # '\n'
+ # '...'
+ # '\n'
+
+These strings tend to have a lot of extra whitespace, which you can
+remove by using the ``.stripped_strings`` generator instead::
+
+ for string in soup.stripped_strings:
+     print(repr(string))
+ # "The Dormouse's story"
+ # "The Dormouse's story"
+ # 'Once upon a time there were three little sisters; and their names were'
+ # 'Elsie'
+ # ','
+ # 'Lacie'
+ # 'and'
+ # 'Tillie'
+ # ';\nand they lived at the bottom of a well.'
+ # '...'
+
+Here, strings consisting entirely of whitespace are ignored, and
+whitespace at the beginning and end of strings is removed.
+
+Going up
+--------
+
+Continuing the "family tree" analogy, every tag and every string has a
+`parent`: the tag that contains it.
+
+.. 
_.parent:
+
+``.parent``
+^^^^^^^^^^^
+
+You can access an element's parent with the ``.parent`` attribute. In
+the example "three sisters" document, the <head> tag is the parent
+of the <title> tag::
+
+ title_tag = soup.title
+ title_tag
+ # <title>The Dormouse's story</title>
+ title_tag.parent
+ # <head><title>The Dormouse's story</title></head>
+
+The title string itself has a parent: the <title> tag that contains
+it::
+
+ title_tag.string.parent
+ # <title>The Dormouse's story</title>
+
+The parent of a top-level tag like <html> is the ``BeautifulSoup`` object
+itself::
+
+ html_tag = soup.html
+ type(html_tag.parent)
+ # <class 'bs4.BeautifulSoup'>
+
+And the ``.parent`` of a ``BeautifulSoup`` object is defined as ``None``::
+
+ print(soup.parent)
+ # None
+
+.. _.parents:
+
+``.parents``
+^^^^^^^^^^^^
+
+You can iterate over all of an element's parents with
+``.parents``. This example uses ``.parents`` to travel from an <a> tag
+buried deep within the document, to the very top of the document::
+
+ link = soup.a
+ link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+ for parent in link.parents:
+     print(parent.name)
+ # p
+ # body
+ # html
+ # [document]
+
+Going sideways
+--------------
+
+Consider a simple document like this::
+
+ sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'html.parser')
+ print(sibling_soup.prettify())
+ # <a>
+ #  <b>
+ #   text1
+ #  </b>
+ #  <c>
+ #   text2
+ #  </c>
+ # </a>
+
+The <b> tag and the <c> tag are at the same level: they're both direct
+children of the same tag. We call them `siblings`. When a document is
+pretty-printed, siblings show up at the same indentation level. You
+can also use this relationship in the code you write.
+
+``.next_sibling`` and ``.previous_sibling``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can use ``.next_sibling`` and ``.previous_sibling`` to navigate
+between page elements that are on the same level of the parse tree::
+
+ sibling_soup.b.next_sibling
+ # <c>text2</c>
+
+ sibling_soup.c.previous_sibling
+ # <b>text1</b>
+
+The <b> tag has a ``.next_sibling``, but no ``.previous_sibling``,
+because there's nothing before the <b> tag `on the same level of the
+tree`. For the same reason, the <c> tag has a ``.previous_sibling``
+but no ``.next_sibling``::
+
+ print(sibling_soup.b.previous_sibling)
+ # None
+ print(sibling_soup.c.next_sibling)
+ # None
+
+The strings "text1" and "text2" are `not` siblings, because they don't
+have the same parent::
+
+ sibling_soup.b.string
+ # 'text1'
+
+ print(sibling_soup.b.string.next_sibling)
+ # None
+
+In real documents, the ``.next_sibling`` or ``.previous_sibling`` of a
+tag will usually be a string containing whitespace. Going back to the
+"three sisters" document::
+
+ # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
+ # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
+ # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
+
+You might think that the ``.next_sibling`` of the first <a> tag would
+be the second <a> tag. But actually, it's a string: the comma and
+newline that separate the first <a> tag from the second::
+
+ link = soup.a
+ link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ link.next_sibling
+ # ',\n'
+
+The second <a> tag is actually the ``.next_sibling`` of the comma::
+
+ link.next_sibling.next_sibling
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+
+.. 
_sibling-generators:
+
+``.next_siblings`` and ``.previous_siblings``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can iterate over a tag's siblings with ``.next_siblings`` or
+``.previous_siblings``::
+
+ for sibling in soup.a.next_siblings:
+     print(repr(sibling))
+ # ',\n'
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+ # ' and\n'
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+ # ';\nand they lived at the bottom of a well.'
+
+ for sibling in soup.find(id="link3").previous_siblings:
+     print(repr(sibling))
+ # ' and\n'
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+ # ',\n'
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+ # 'Once upon a time there were three little sisters; and their names were\n'
+
+Going back and forth
+--------------------
+
+Take a look at the beginning of the "three sisters" document::
+
+ # <html><head><title>The Dormouse's story</title></head>
+ # <p class="title"><b>The Dormouse's story</b></p>
+
+An HTML parser takes this string of characters and turns it into a
+series of events: "open an <html> tag", "open a <head> tag", "open a
+<title> tag", "add a string", "close the <title> tag", "open a <p>
+tag", and so on. Beautiful Soup offers tools for reconstructing the
+initial parse of the document.
+
+.. _element-generators:
+
+``.next_element`` and ``.previous_element``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``.next_element`` attribute of a string or tag points to whatever
+was parsed immediately afterwards. It might be the same as
+``.next_sibling``, but it's usually drastically different.
+
+Here's the final <a> tag in the "three sisters" document. Its
+``.next_sibling`` is a string: the conclusion of the sentence that was
+interrupted by the start of the <a> tag::
+
+ last_a_tag = soup.find("a", id="link3")
+ last_a_tag
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+ last_a_tag.next_sibling
+ # ';\nand they lived at the bottom of a well.'
+
+But the ``.next_element`` of that <a> tag, the thing that was parsed
+immediately after the <a> tag, is `not` the rest of that sentence:
+it's the word "Tillie"::
+
+ last_a_tag.next_element
+ # 'Tillie'
+
+That's because in the original markup, the word "Tillie" appeared
+before that semicolon. The parser encountered an <a> tag, then the
+word "Tillie", then the closing </a> tag, then the semicolon and rest of
+the sentence. The semicolon is on the same level as the <a> tag, but the
+word "Tillie" was encountered first.
+
+The ``.previous_element`` attribute is the exact opposite of
+``.next_element``. It points to whatever element was parsed
+immediately before this one::
+
+ last_a_tag.previous_element
+ # ' and\n'
+ last_a_tag.previous_element.next_element
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+``.next_elements`` and ``.previous_elements``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You should get the idea by now. You can use these iterators to move
+forward or backward in the document as it was parsed::
+
+ for element in last_a_tag.next_elements:
+     print(repr(element))
+ # 'Tillie'
+ # ';\nand they lived at the bottom of a well.'
+ # '\n'
+ # <p class="story">...</p>
+ # '...'
+ # '\n'
+
+Searching the tree
+==================
+
+Beautiful Soup defines a lot of methods for searching the parse tree,
+but they're all very similar. 
I'm going to spend a lot of time explaining
+the two most popular methods: ``find()`` and ``find_all()``. The other
+methods take almost exactly the same arguments, so I'll just cover
+them briefly.
+
+Once again, I'll be using the "three sisters" document as an example::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+ <body>
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(html_doc, 'html.parser')
+
+By passing in a filter to a method like ``find_all()``, you can
+zoom in on the parts of the document you're interested in.
+
+Kinds of filters
+----------------
+
+Before talking in detail about ``find_all()`` and similar methods, I
+want to show examples of different filters you can pass into these
+methods. These filters show up again and again, throughout the
+search API. You can use them to filter based on a tag's name,
+on its attributes, on the text of a string, or on some combination of
+these.
+
+.. _a string:
+
+A string
+^^^^^^^^
+
+The simplest filter is a string. Pass a string to a search method and
+Beautiful Soup will perform a match against that exact string. This
+code finds all the <b> tags in the document::
+
+ soup.find_all('b')
+ # [<b>The Dormouse's story</b>]
+
+If you pass in a byte string, Beautiful Soup will assume the string is
+encoded as UTF-8. You can avoid this by passing in a Unicode string instead.
+
+.. _a regular expression:
+
+A regular expression
+^^^^^^^^^^^^^^^^^^^^
+
+If you pass in a regular expression object, Beautiful Soup will filter
+against that regular expression using its ``search()`` method. This code
+finds all the tags whose names start with the letter "b"; in this
+case, the <body> tag and the <b> tag::
+
+ import re
+ for tag in soup.find_all(re.compile("^b")):
+     print(tag.name)
+ # body
+ # b
+
+This code finds all the tags whose names contain the letter 't'::
+
+ for tag in soup.find_all(re.compile("t")):
+     print(tag.name)
+ # html
+ # title
+
+.. _a list:
+
+A list
+^^^^^^
+
+If you pass in a list, Beautiful Soup will allow a string match
+against `any` item in that list. This code finds all the <a> tags
+`and` all the <b> tags::
+
+ soup.find_all(["a", "b"])
+ # [<b>The Dormouse's story</b>,
+ #  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+.. _the value True:
+
+``True``
+^^^^^^^^
+
+The value ``True`` matches everything it can. This code finds `all`
+the tags in the document, but none of the text strings::
+
+ for tag in soup.find_all(True):
+     print(tag.name)
+ # html
+ # head
+ # title
+ # body
+ # p
+ # b
+ # p
+ # a
+ # a
+ # a
+ # p
+
+.. _a function:
+
+A function
+^^^^^^^^^^
+
+If none of the other matches work for you, define a function that
+takes an element as its only argument. The function should return
+``True`` if the argument matches, and ``False`` otherwise.
+
+Here's a function that returns ``True`` if a tag defines the "class"
+attribute but doesn't define the "id" attribute::
+
+ def has_class_but_no_id(tag):
+     return tag.has_attr('class') and not tag.has_attr('id')
+
+Pass this function into ``find_all()`` and you'll pick up all the <p>
+tags::
+
+ soup.find_all(has_class_but_no_id)
+ # [<p class="title"><b>The Dormouse's story</b></p>,
+ #  <p class="story">Once upon a time there were…bottom of a well.</p>,
+ #  <p class="story">...</p>]
+
+This function only picks up the <p> tags. It doesn't pick up the <a>
+tags, because those tags define both "class" and "id". It doesn't pick
+up tags like <html> and <title>, because those tags don't define
+"class".
+
+If you pass in a function to filter on a specific attribute like
+``href``, the argument passed into the function will be the attribute
+value, not the whole tag. Here's a function that finds all ``a`` tags
+whose ``href`` attribute *does not* match a regular expression::
+
+ import re
+ def not_lacie(href):
+     return href and not re.compile("lacie").search(href)
+
+ soup.find_all(href=not_lacie)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+The function can be as complicated as you need it to be. Here's a
+function that returns ``True`` if a tag is surrounded by string
+objects::
+
+ from bs4 import NavigableString
+ def surrounded_by_strings(tag):
+     return (isinstance(tag.next_element, NavigableString)
+             and isinstance(tag.previous_element, NavigableString))
+
+ for tag in soup.find_all(surrounded_by_strings):
+     print(tag.name)
+ # body
+ # p
+ # a
+ # a
+ # a
+ # p
+
+Now we're ready to look at the search methods in detail.
+
+``find_all()``
+--------------
+
+Method signature: find_all(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive
+<recursive>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+The ``find_all()`` method looks through a tag's descendants and
+retrieves `all` descendants that match your filters. I gave several
+examples in `Kinds of filters`_, but here are a few more::
+
+ soup.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+ soup.find_all("p", "title")
+ # [<p class="title"><b>The Dormouse's story</b></p>]
+
+ soup.find_all("a")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ soup.find_all(id="link2")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+ import re
+ soup.find(string=re.compile("sisters"))
+ # 'Once upon a time there were three little sisters; and their names were\n'
+
+Some of these should look familiar, but others are new. What does it
+mean to pass in a value for ``string``, or ``id``? Why does
+``find_all("p", "title")`` find a <p> tag with the CSS class "title"?
+Let's look at the arguments to ``find_all()``.
+
+.. _name:
+
+The ``name`` argument
+^^^^^^^^^^^^^^^^^^^^^
+
+Pass in a value for ``name`` and you'll tell Beautiful Soup to only
+consider tags with certain names. Text strings will be ignored, as
+will tags whose names don't match.
+
+This is the simplest usage::
+
+ soup.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+Recall from `Kinds of filters`_ that the value to ``name`` can be `a
+string`_, `a regular expression`_, `a list`_, `a function`_, or `the value
+True`_.
+
+.. _kwargs:
+
+The keyword arguments
+^^^^^^^^^^^^^^^^^^^^^
+
+Any argument that's not recognized will be turned into a filter on one
+of a tag's attributes. If you pass in a value for an argument called ``id``,
+Beautiful Soup will filter against each tag's 'id' attribute::
+
+ soup.find_all(id='link2')
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+If you pass in a value for ``href``, Beautiful Soup will filter
+against each tag's 'href' attribute::
+
+ soup.find_all(href=re.compile("elsie"))
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+You can filter an attribute based on `a string`_, `a regular
+expression`_, `a list`_, `a function`_, or `the value True`_.
+
+This code finds all tags whose ``id`` attribute has a value,
+regardless of what the value is::
+
+ soup.find_all(id=True)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+You can filter multiple attributes at once by passing in more than one
+keyword argument::
+
+ soup.find_all(href=re.compile("elsie"), id='link1')
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+Some attributes, like the data-* attributes in HTML 5, have names that
+can't be used as the names of keyword arguments::
+
+ data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
+ data_soup.find_all(data-foo="value")
+ # SyntaxError: keyword can't be an expression
+
+You can use these attributes in searches by putting them into a
+dictionary and passing the dictionary into ``find_all()`` as the
+``attrs`` argument::
+
+ data_soup.find_all(attrs={"data-foo": "value"})
+ # [<div data-foo="value">foo!</div>]
+
+You can't use a keyword argument to search for HTML's 'name' attribute,
+because Beautiful Soup uses the ``name`` argument to contain the name
+of the tag itself. Instead, you can give a value to 'name' in the
+``attrs`` argument::
+
+ name_soup = BeautifulSoup('<input name="email"/>', 'html.parser')
+ name_soup.find_all(name="email")
+ # []
+ name_soup.find_all(attrs={"name": "email"})
+ # [<input name="email"/>]
+
+.. _attrs:
+
+Searching by CSS class
+^^^^^^^^^^^^^^^^^^^^^^
+
+It's very useful to search for a tag that has a certain CSS class, but
+the name of the CSS attribute, "class", is a reserved word in
+Python. Using ``class`` as a keyword argument will give you a syntax
+error.
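+
+For example (illustrative only; the exact message depends on your
+Python version)::
+
+ soup.find_all("a", class="sister")
+ # SyntaxError: invalid syntax
+
+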
As of Beautiful Soup 4.1.2, you can search by CSS class using +the keyword argument ``class_``:: + + soup.find_all("a", class_="sister") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +As with any keyword argument, you can pass ``class_`` a string, a regular +expression, a function, or ``True``:: + + soup.find_all(class_=re.compile("itl")) + # [<p class="title"><b>The Dormouse's story</b></p>] + + def has_six_characters(css_class): + return css_class is not None and len(css_class) == 6 + + soup.find_all(class_=has_six_characters) + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +:ref:`Remember <multivalue>` that a single tag can have multiple +values for its "class" attribute. When you search for a tag that +matches a certain CSS class, you're matching against `any` of its CSS +classes:: + + css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser') + css_soup.find_all("p", class_="strikeout") + # [<p class="body strikeout"></p>] + + css_soup.find_all("p", class_="body") + # [<p class="body strikeout"></p>] + +You can also search for the exact string value of the ``class`` attribute:: + + css_soup.find_all("p", class_="body strikeout") + # [<p class="body strikeout"></p>] + +But searching for variants of the string value won't work:: + + css_soup.find_all("p", class_="strikeout body") + # [] + +If you want to search for tags that match two or more CSS classes, you +should use a CSS selector:: + + css_soup.select("p.strikeout.body") + # [<p class="body strikeout"></p>] + +In older versions of Beautiful Soup, which don't have the ``class_`` +shortcut, you can use the ``attrs`` trick mentioned above. Create a +dictionary whose value for "class" is the string (or regular +expression, or whatever) you want to search for:: + + soup.find_all("a", attrs={"class": "sister"}) + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +.. _string: + +The ``string`` argument +^^^^^^^^^^^^^^^^^^^^^^^ + +With ``string`` you can search for strings instead of tags. As with +``name`` and the keyword arguments, you can pass in `a string`_, `a +regular expression`_, `a list`_, `a function`_, or `the value True`_. +Here are some examples:: + + soup.find_all(string="Elsie") + # ['Elsie'] + + soup.find_all(string=["Tillie", "Elsie", "Lacie"]) + # ['Elsie', 'Lacie', 'Tillie'] + + soup.find_all(string=re.compile("Dormouse")) + # ["The Dormouse's story", "The Dormouse's story"] + + def is_the_only_string_within_a_tag(s): + """Return True if this string is the only child of its parent tag.""" + return (s == s.parent.string) + + soup.find_all(string=is_the_only_string_within_a_tag) + # ["The Dormouse's story", "The Dormouse's story", 'Elsie', 'Lacie', 'Tillie', '...'] + +Although ``string`` is for finding strings, you can combine it with +arguments that find tags: Beautiful Soup will find all tags whose +``.string`` matches your value for ``string``. 
This code finds the <a> +tags whose ``.string`` is "Elsie":: + + soup.find_all("a", string="Elsie") + # [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>] + +The ``string`` argument is new in Beautiful Soup 4.4.0. In earlier +versions it was called ``text``:: + + soup.find_all("a", text="Elsie") + # [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>] + +.. _limit: + +The ``limit`` argument +^^^^^^^^^^^^^^^^^^^^^^ + +``find_all()`` returns all the tags and strings that match your +filters. This can take a while if the document is large. If you don't +need `all` the results, you can pass in a number for ``limit``. This +works just like the LIMIT keyword in SQL. It tells Beautiful Soup to +stop gathering results after it's found a certain number. + +There are three links in the "three sisters" document, but this code +only finds the first two:: + + soup.find_all("a", limit=2) + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] + +.. _recursive: + +The ``recursive`` argument +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you call ``mytag.find_all()``, Beautiful Soup will examine all the +descendants of ``mytag``: its children, its children's children, and +so on. If you only want Beautiful Soup to consider direct children, +you can pass in ``recursive=False``. See the difference here:: + + soup.html.find_all("title") + # [<title>The Dormouse's story</title>] + + soup.html.find_all("title", recursive=False) + # [] + +Here's that part of the document:: + + <html> + <head> + <title> + The Dormouse's story + </title> + </head> + ... + +The <title> tag is beneath the <html> tag, but it's not `directly` +beneath the <html> tag: the <head> tag is in the way. Beautiful Soup +finds the <title> tag when it's allowed to look at all descendants of +the <html> tag, but when ``recursive=False`` restricts it to the +<html> tag's immediate children, it finds nothing. + +Beautiful Soup offers a lot of tree-searching methods (covered below), +and they mostly take the same arguments as ``find_all()``: ``name``, +``attrs``, ``string``, ``limit``, and the keyword arguments. But the +``recursive`` argument is different: ``find_all()`` and ``find()`` are +the only methods that support it. Passing ``recursive=False`` into a +method like ``find_parents()`` wouldn't be very useful. + +Calling a tag is like calling ``find_all()`` +-------------------------------------------- + +Because ``find_all()`` is the most popular method in the Beautiful +Soup search API, you can use a shortcut for it. If you treat the +``BeautifulSoup`` object or a ``Tag`` object as though it were a +function, then it's the same as calling ``find_all()`` on that +object. These two lines of code are equivalent:: + + soup.find_all("a") + soup("a") + +These two lines are also equivalent:: + + soup.title.find_all(string=True) + soup.title(string=True) + +``find()`` +---------- + +Method signature: find(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive +<recursive>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`) + +The ``find_all()`` method scans the entire document looking for +results, but sometimes you only want to find one result. If you know a +document only has one <body> tag, it's a waste of time to scan the +entire document looking for more. Rather than passing in ``limit=1`` +every time you call ``find_all``, you can use the ``find()`` +method. 
These two lines of code are `nearly` equivalent::
+
+ soup.find_all('title', limit=1)
+ # [<title>The Dormouse's story</title>]
+
+ soup.find('title')
+ # <title>The Dormouse's story</title>
+
+The only difference is that ``find_all()`` returns a list containing
+the single result, and ``find()`` just returns the result.
+
+If ``find_all()`` can't find anything, it returns an empty list. If
+``find()`` can't find anything, it returns ``None``::
+
+ print(soup.find("nosuchtag"))
+ # None
+
+Remember the ``soup.head.title`` trick from `Navigating using tag
+names`_? That trick works by repeatedly calling ``find()``::
+
+ soup.head.title
+ # <title>The Dormouse's story</title>
+
+ soup.find("head").find("title")
+ # <title>The Dormouse's story</title>
+
+``find_parents()`` and ``find_parent()``
+----------------------------------------
+
+Method signature: find_parents(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Method signature: find_parent(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+I spent a lot of time above covering ``find_all()`` and
+``find()``. The Beautiful Soup API defines ten other methods for
+searching the tree, but don't be afraid. Five of these methods are
+basically the same as ``find_all()``, and the other five are basically
+the same as ``find()``. The only differences are in what parts of the
+tree they search.
+
+First let's consider ``find_parents()`` and
+``find_parent()``. Remember that ``find_all()`` and ``find()`` work
+their way down the tree, looking at a tag's descendants. These methods
+do the opposite: they work their way `up` the tree, looking at a tag's
+(or a string's) parents. Let's try them out, starting from a string
+buried deep in the "three sisters" document::
+
+ a_string = soup.find(string="Lacie")
+ a_string
+ # 'Lacie'
+
+ a_string.find_parents("a")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+ a_string.find_parent("p")
+ # <p class="story">Once upon a time there were three little sisters; and their names were
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
+ # and they lived at the bottom of a well.</p>
+
+ a_string.find_parents("p", class_="title")
+ # []
+
+One of the three <a> tags is the direct parent of the string in
+question, so our search finds it. One of the three <p> tags is an
+indirect parent of the string, and our search finds that as
+well. There's a <p> tag with the CSS class "title" `somewhere` in the
+document, but it's not one of this string's parents, so we can't find
+it with ``find_parents()``.
+
+You may have made the connection between ``find_parent()`` and
+``find_parents()``, and the `.parent`_ and `.parents`_ attributes
+mentioned earlier. The connection is very strong. These search methods
+actually use ``.parents`` to iterate over all the parents, and check
+each one against the provided filter to see if it matches.
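+
+As a rough sketch of that idea (`not` the real implementation; the
+helper and its name are hypothetical, but ``.parents`` is the public
+generator described earlier)::
+
+ def my_find_parent(element, tag_name):
+     # Walk up the tree and return the first parent whose name matches.
+     for parent in element.parents:
+         if parent.name == tag_name:
+             return parent
+     return None
+
+ a_string = soup.find(string="Lacie")
+ my_find_parent(a_string, "p")['class']
+ # ['story']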
+
+``find_next_siblings()`` and ``find_next_sibling()``
+----------------------------------------------------
+
+Method signature: find_next_siblings(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Method signature: find_next_sibling(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+These methods use :ref:`.next_siblings <sibling-generators>` to
+iterate over the rest of an element's siblings in the tree. The
+``find_next_siblings()`` method returns all the siblings that match,
+and ``find_next_sibling()`` only returns the first one::
+
+ first_link = soup.a
+ first_link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ first_link.find_next_siblings("a")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ first_story_paragraph = soup.find("p", "story")
+ first_story_paragraph.find_next_sibling("p")
+ # <p class="story">...</p>
+
+``find_previous_siblings()`` and ``find_previous_sibling()``
+------------------------------------------------------------
+
+Method signature: find_previous_siblings(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Method signature: find_previous_sibling(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+These methods use :ref:`.previous_siblings <sibling-generators>` to iterate over an element's
+siblings that precede it in the tree. The ``find_previous_siblings()``
+method returns all the siblings that match, and
+``find_previous_sibling()`` only returns the first one::
+
+ last_link = soup.find("a", id="link3")
+ last_link
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+ last_link.find_previous_siblings("a")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ #  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+ first_story_paragraph = soup.find("p", "story")
+ first_story_paragraph.find_previous_sibling("p")
+ # <p class="title"><b>The Dormouse's story</b></p>
+
+
+``find_all_next()`` and ``find_next()``
+---------------------------------------
+
+Method signature: find_all_next(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Method signature: find_next(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+These methods use :ref:`.next_elements <element-generators>` to
+iterate over whatever tags and strings come after an element in the
+document. The ``find_all_next()`` method returns all matches, and
+``find_next()`` only returns the first match::
+
+ first_link = soup.a
+ first_link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ first_link.find_all_next(string=True)
+ # ['Elsie', ',\n', 'Lacie', ' and\n', 'Tillie',
+ #  ';\nand they lived at the bottom of a well.', '\n', '...', '\n']
+
+ first_link.find_next("p")
+ # <p class="story">...</p>
+
+In the first example, the string "Elsie" showed up, even though it was
+contained within the <a> tag we started from. In the second example,
+the last <p> tag in the document showed up, even though it's not in
+the same part of the tree as the <a> tag we started from. 
For these
+methods, all that matters is that an element match the filter, and
+show up later in the document than the starting element.
+
+``find_all_previous()`` and ``find_previous()``
+-----------------------------------------------
+
+Method signature: find_all_previous(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Method signature: find_previous(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`string <string>`, :ref:`**kwargs <kwargs>`)
+
+These methods use :ref:`.previous_elements <element-generators>` to
+iterate over the tags and strings that come before an element in the
+document. The ``find_all_previous()`` method returns all matches, and
+``find_previous()`` only returns the first match::
+
+ first_link = soup.a
+ first_link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ first_link.find_all_previous("p")
+ # [<p class="story">Once upon a time there were three little sisters; ...</p>,
+ #  <p class="title"><b>The Dormouse's story</b></p>]
+
+ first_link.find_previous("title")
+ # <title>The Dormouse's story</title>
+
+The call to ``find_all_previous("p")`` finds the first paragraph in
+the document (the one with class="title"), but it also finds the
+second paragraph, the <p> tag that contains the <a> tag we started
+with. This shouldn't be too surprising: we're looking at all the tags
+that show up earlier in the document than the one we started with. A
+<p> tag that contains an <a> tag must have shown up before the <a>
+tag it contains.
+
+CSS selectors
+-------------
+
+``BeautifulSoup`` has a ``.select()`` method which uses the `SoupSieve
+<https://facelessuser.github.io/soupsieve/>`_ package to run a CSS
+selector against a parsed document and return all the matching
+elements. ``Tag`` has a similar method which runs a CSS selector
+against the contents of a single tag.
+
+(The SoupSieve integration was added in Beautiful Soup 4.7.0. Earlier
+versions also have the ``.select()`` method, but only the most
+commonly-used CSS selectors are supported. If you installed Beautiful
+Soup through ``pip``, SoupSieve was installed at the same time, so you
+don't have to do anything extra.)
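+
+For instance, calling ``select()`` on a single tag searches only that
+tag's descendants (a small sketch using the "three sisters" document
+from earlier)::
+
+ story_paragraph = soup.find("p", "story")
+ story_paragraph.select("a#link2")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]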
+ +The SoupSieve `documentation +<https://facelessuser.github.io/soupsieve/>`_ lists all the currently +supported CSS selectors, but here are some of the basics: + +You can find tags:: + + soup.select("title") + # [<title>The Dormouse's story</title>] + + soup.select("p:nth-of-type(3)") + # [<p class="story">...</p>] + +Find tags beneath other tags:: + + soup.select("body a") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select("html head title") + # [<title>The Dormouse's story</title>] + +Find tags `directly` beneath other tags:: + + soup.select("head > title") + # [<title>The Dormouse's story</title>] + + soup.select("p > a") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select("p > a:nth-of-type(2)") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] + + soup.select("p > #link1") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] + + soup.select("body > a") + # [] + +Find the siblings of tags:: + + soup.select("#link1 ~ .sister") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select("#link1 + .sister") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] + +Find tags by CSS class:: + + soup.select(".sister") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select("[class~=sister]") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +Find tags by ID:: + + soup.select("#link1") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] + + soup.select("a#link2") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] + +Find tags that match any selector from a list of selectors:: + + soup.select("#link1,#link2") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] + +Test for the existence of an attribute:: + + soup.select('a[href]') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +Find tags by attribute value:: + + soup.select('a[href="http://example.com/elsie"]') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] + + soup.select('a[href^="http://example.com/"]') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.select('a[href$="tillie"]') + # [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + 
soup.select('a[href*=".com/el"]')
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+There's also a method called ``select_one()``, which finds only the
+first tag that matches a selector::
+
+ soup.select_one(".sister")
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+If you've parsed XML that defines namespaces, you can use them in CSS
+selectors::
+
+ from bs4 import BeautifulSoup
+ xml = """<tag xmlns:ns1="http://namespace1/" xmlns:ns2="http://namespace2/">
+  <ns1:child>I'm in namespace 1</ns1:child>
+  <ns2:child>I'm in namespace 2</ns2:child>
+ </tag> """
+ soup = BeautifulSoup(xml, "xml")
+
+ soup.select("child")
+ # [<ns1:child>I'm in namespace 1</ns1:child>, <ns2:child>I'm in namespace 2</ns2:child>]
+
+ soup.select("ns1|child", namespaces=soup.namespaces)
+ # [<ns1:child>I'm in namespace 1</ns1:child>]
+
+When handling a CSS selector that uses namespaces, Beautiful Soup
+uses the namespace abbreviations it found when parsing the
+document. You can override this by passing in your own dictionary of
+abbreviations::
+
+ namespaces = dict(first="http://namespace1/", second="http://namespace2/")
+ soup.select("second|child", namespaces=namespaces)
+ # [<ns2:child>I'm in namespace 2</ns2:child>]
+
+All this CSS selector stuff is a convenience for people who already
+know the CSS selector syntax. You can do all of this with the
+Beautiful Soup API. And if CSS selectors are all you need, you should
+parse the document with lxml: it's a lot faster. But this lets you
+`combine` CSS selectors with the Beautiful Soup API.
+
+Modifying the tree
+==================
+
+Beautiful Soup's main strength is in searching the parse tree, but you
+can also modify the tree and write your changes as a new HTML or XML
+document.
+
+Changing tag names and attributes
+---------------------------------
+
+I covered this earlier, in `Attributes`_, but it bears repeating. You
+can rename a tag, change the values of its attributes, add new
+attributes, and delete attributes::
+
+ soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
+ tag = soup.b
+
+ tag.name = "blockquote"
+ tag['class'] = 'verybold'
+ tag['id'] = 1
+ tag
+ # <blockquote class="verybold" id="1">Extremely bold</blockquote>
+
+ del tag['class']
+ del tag['id']
+ tag
+ # <blockquote>Extremely bold</blockquote>
+
+Modifying ``.string``
+---------------------
+
+If you set a tag's ``.string`` attribute to a new string, the tag's contents are
+replaced with that string::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup, 'html.parser')
+
+ tag = soup.a
+ tag.string = "New link text."
+ tag
+ # <a href="http://example.com/">New link text.</a>
+
+Be careful: if the tag contained other tags, they and all their
+contents will be destroyed.
+
+``append()``
+------------
+
+You can add to a tag's contents with ``Tag.append()``. 
It works just
+like calling ``.append()`` on a Python list::
+
+ soup = BeautifulSoup("<a>Foo</a>", 'html.parser')
+ soup.a.append("Bar")
+
+ soup
+ # <a>FooBar</a>
+ soup.a.contents
+ # ['Foo', 'Bar']
+
+``extend()``
+------------
+
+Starting in Beautiful Soup 4.7.0, ``Tag`` also supports a method
+called ``.extend()``, which adds every element of a list to a ``Tag``,
+in order::
+
+ soup = BeautifulSoup("<a>Soup</a>", 'html.parser')
+ soup.a.extend(["'s", " ", "on"])
+
+ soup
+ # <a>Soup's on</a>
+ soup.a.contents
+ # ['Soup', "'s", ' ', 'on']
+
+``NavigableString()`` and ``.new_tag()``
+-------------------------------------------------
+
+If you need to add a string to a document, no problem--you can pass a
+Python string in to ``append()``, or you can call the ``NavigableString``
+constructor::
+
+ from bs4 import NavigableString
+
+ soup = BeautifulSoup("<b></b>", 'html.parser')
+ tag = soup.b
+ tag.append("Hello")
+ new_string = NavigableString(" there")
+ tag.append(new_string)
+ tag
+ # <b>Hello there</b>
+ tag.contents
+ # ['Hello', ' there']
+
+If you want to create a comment or some other subclass of
+``NavigableString``, just call the constructor::
+
+ from bs4 import Comment
+ new_comment = Comment("Nice to see you.")
+ tag.append(new_comment)
+ tag
+ # <b>Hello there<!--Nice to see you.--></b>
+ tag.contents
+ # ['Hello', ' there', 'Nice to see you.']
+
+`(This is a new feature in Beautiful Soup 4.4.0.)`
+
+What if you need to create a whole new tag? The best solution is to
+call the factory method ``BeautifulSoup.new_tag()``::
+
+ soup = BeautifulSoup("<b></b>", 'html.parser')
+ original_tag = soup.b
+
+ new_tag = soup.new_tag("a", href="http://www.example.com")
+ original_tag.append(new_tag)
+ original_tag
+ # <b><a href="http://www.example.com"></a></b>
+
+ new_tag.string = "Link text."
+ original_tag
+ # <b><a href="http://www.example.com">Link text.</a></b>
+
+Only the first argument, the tag name, is required.
+
+``insert()``
+------------
+
+``Tag.insert()`` is just like ``Tag.append()``, except the new element
+doesn't necessarily go at the end of its parent's
+``.contents``. It'll be inserted at whatever numeric position you
+say. 
It works just like ``.insert()`` on a Python list::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup, 'html.parser')
+ tag = soup.a
+
+ tag.insert(1, "but did not endorse ")
+ tag
+ # <a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>
+ tag.contents
+ # ['I linked to ', 'but did not endorse ', <i>example.com</i>]
+
+``insert_before()`` and ``insert_after()``
+------------------------------------------
+
+The ``insert_before()`` method inserts tags or strings immediately
+before something else in the parse tree::
+
+ soup = BeautifulSoup("<b>leave</b>", 'html.parser')
+ tag = soup.new_tag("i")
+ tag.string = "Don't"
+ soup.b.string.insert_before(tag)
+ soup.b
+ # <b><i>Don't</i>leave</b>
+
+The ``insert_after()`` method inserts tags or strings immediately
+following something else in the parse tree::
+
+ div = soup.new_tag('div')
+ div.string = 'ever'
+ soup.b.i.insert_after(" you ", div)
+ soup.b
+ # <b><i>Don't</i> you <div>ever</div>leave</b>
+ soup.b.contents
+ # [<i>Don't</i>, ' you ', <div>ever</div>, 'leave']
+
+``clear()``
+-----------
+
+``Tag.clear()`` removes the contents of a tag::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup, 'html.parser')
+ tag = soup.a
+
+ tag.clear()
+ tag
+ # <a href="http://example.com/"></a>
+
+``extract()``
+-------------
+
+``PageElement.extract()`` removes a tag or string from the tree. It
+returns the tag or string that was extracted::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup, 'html.parser')
+ a_tag = soup.a
+
+ i_tag = soup.i.extract()
+
+ a_tag
+ # <a href="http://example.com/">I linked to</a>
+
+ i_tag
+ # <i>example.com</i>
+
+ print(i_tag.parent)
+ # None
+
+At this point you effectively have two parse trees: one rooted at the
+``BeautifulSoup`` object you used to parse the document, and one rooted
+at the tag that was extracted. You can go on to call ``extract`` on
+a child of the element you extracted::
+
+ my_string = i_tag.string.extract()
+ my_string
+ # 'example.com'
+
+ print(my_string.parent)
+ # None
+ i_tag
+ # <i></i>
+
+
+``decompose()``
+---------------
+
+``Tag.decompose()`` removes a tag from the tree, then `completely
+destroys it and its contents`::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup, 'html.parser')
+ a_tag = soup.a
+ i_tag = soup.i
+
+ i_tag.decompose()
+ a_tag
+ # <a href="http://example.com/">I linked to</a>
+
+The behavior of a decomposed ``Tag`` or ``NavigableString`` is not
+defined and you should not use it for anything. If you're not sure
+whether something has been decomposed, you can check its
+``.decomposed`` property `(new in Beautiful Soup 4.9.0)`::
+
+ i_tag.decomposed
+ # True
+
+ a_tag.decomposed
+ # False
+
+
+.. 
_replace_with(): + +``replace_with()`` +------------------ + +``PageElement.replace_with()`` removes a tag or string from the tree, +and replaces it with the tag or string of your choice:: + + markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' + soup = BeautifulSoup(markup, 'html.parser') + a_tag = soup.a + + new_tag = soup.new_tag("b") + new_tag.string = "example.net" + a_tag.i.replace_with(new_tag) + + a_tag + # <a href="http://example.com/">I linked to <b>example.net</b></a> + +``replace_with()`` returns the tag or string that was replaced, so +that you can examine it or add it back to another part of the tree. + +``wrap()`` +---------- + +``PageElement.wrap()`` wraps an element in the tag you specify. It +returns the new wrapper:: + + soup = BeautifulSoup("<p>I wish I was bold.</p>", 'html.parser') + soup.p.string.wrap(soup.new_tag("b")) + # <b>I wish I was bold.</b> + + soup.p.wrap(soup.new_tag("div")) + # <div><p><b>I wish I was bold.</b></p></div> + +This method is new in Beautiful Soup 4.0.5. + +``unwrap()`` +--------------------------- + +``Tag.unwrap()`` is the opposite of ``wrap()``. It replaces a tag with +whatever's inside that tag. It's good for stripping out markup:: + + markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' + soup = BeautifulSoup(markup, 'html.parser') + a_tag = soup.a + + a_tag.i.unwrap() + a_tag + # <a href="http://example.com/">I linked to example.com</a> + +Like ``replace_with()``, ``unwrap()`` returns the tag +that was replaced. + +``smooth()`` +--------------------------- + +After calling a bunch of methods that modify the parse tree, you may end up with two or more ``NavigableString`` objects next to each other. Beautiful Soup doesn't have any problems with this, but since it can't happen in a freshly parsed document, you might not expect behavior like the following:: + + soup = BeautifulSoup("<p>A one</p>", 'html.parser') + soup.p.append(", a two") + + soup.p.contents + # ['A one', ', a two'] + + print(soup.p.encode()) + # b'<p>A one, a two</p>' + + print(soup.p.prettify()) + # <p> + # A one + # , a two + # </p> + +You can call ``Tag.smooth()`` to clean up the parse tree by consolidating adjacent strings:: + + soup.smooth() + + soup.p.contents + # ['A one, a two'] + + print(soup.p.prettify()) + # <p> + # A one, a two + # </p> + +The ``smooth()`` method is new in Beautiful Soup 4.8.0. + +Output +====== + +.. _.prettyprinting: + +Pretty-printing +--------------- + +The ``prettify()`` method will turn a Beautiful Soup parse tree into a +nicely formatted Unicode string, with a separate line for each +tag and each string:: + + markup = '<html><head><body><a href="http://example.com/">I linked to <i>example.com</i></a>' + soup = BeautifulSoup(markup, 'html.parser') + soup.prettify() + # '<html>\n <head>\n </head>\n <body>\n <a href="http://example.com/">\n...' + + print(soup.prettify()) + # <html> + # <head> + # </head> + # <body> + # <a href="http://example.com/"> + # I linked to + # <i> + # example.com + # </i> + # </a> + # </body> + # </html> + +You can call ``prettify()`` on the top-level ``BeautifulSoup`` object, +or on any of its ``Tag`` objects:: + + print(soup.a.prettify()) + # <a href="http://example.com/"> + # I linked to + # <i> + # example.com + # </i> + # </a> + +Since it adds whitespace (in the form of newlines), ``prettify()`` +changes the meaning of an HTML document and should not be used to +reformat one. 
The goal of ``prettify()`` is to help you visually
+understand the structure of the documents you work with.
+
+Non-pretty printing
+-------------------
+
+If you just want a string, with no fancy formatting, you can call
+``str()`` on a ``BeautifulSoup`` object (``unicode()`` in Python 2),
+or on a ``Tag`` within it::
+
+ str(soup)
+ # '<html><head></head><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>'
+
+ str(soup.a)
+ # '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+
+In Python 3, ``str()`` returns a Unicode string. You can call
+``encode()`` to get a bytestring in the encoding of your choice, and
+``decode()`` to get Unicode back. See `Encodings`_ for other options.
+
+.. _output_formatters:
+
+Output formatters
+-----------------
+
+If you give Beautiful Soup a document that contains HTML entities like
+"&ldquo;", they'll be converted to Unicode characters::
+
+ soup = BeautifulSoup("&ldquo;Dammit!&rdquo; he said.", 'html.parser')
+ str(soup)
+ # '“Dammit!” he said.'
+
+If you then convert the document to a bytestring, the Unicode characters
+will be encoded as UTF-8. You won't get the HTML entities back::
+
+ soup.encode("utf8")
+ # b'\xe2\x80\x9cDammit!\xe2\x80\x9d he said.'
+
+By default, the only characters that are escaped upon output are bare
+ampersands and angle brackets. These get turned into "&amp;", "&lt;",
+and "&gt;", so that Beautiful Soup doesn't inadvertently generate
+invalid HTML or XML::
+
+ soup = BeautifulSoup("<p>The law firm of Dewey, Cheatem, & Howe</p>", 'html.parser')
+ soup.p
+ # <p>The law firm of Dewey, Cheatem, &amp; Howe</p>
+
+ soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>', 'html.parser')
+ soup.a
+ # <a href="http://example.com/?foo=val1&amp;bar=val2">A link</a>
+
+You can change this behavior by providing a value for the
+``formatter`` argument to ``prettify()``, ``encode()``, or
+``decode()``. Beautiful Soup recognizes five possible values for
+``formatter``.
+
+The default is ``formatter="minimal"``. Strings will only be processed
+enough to ensure that Beautiful Soup generates valid HTML/XML::
+
+ french = "<p>Il a dit &lt;&lt;Sacré bleu!&gt;&gt;</p>"
+ soup = BeautifulSoup(french, 'html.parser')
+ print(soup.prettify(formatter="minimal"))
+ # <p>
+ #  Il a dit &lt;&lt;Sacré bleu!&gt;&gt;
+ # </p>
+
+If you pass in ``formatter="html"``, Beautiful Soup will convert
+Unicode characters to HTML entities whenever possible::
+
+ print(soup.prettify(formatter="html"))
+ # <p>
+ #  Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
+ # </p>
+
+If you pass in ``formatter="html5"``, it's the same as
+``formatter="html"``, but Beautiful Soup will
+omit the closing slash in HTML void tags like "br"::
+
+ br = BeautifulSoup("<br>", 'html.parser').br
+
+ print(br.encode(formatter="html"))
+ # b'<br/>'
+
+ print(br.encode(formatter="html5"))
+ # b'<br>'
+
+If you pass in ``formatter=None``, Beautiful Soup will not modify
+strings at all on output. This is the fastest option, but it may lead
+to Beautiful Soup generating invalid HTML/XML, as in these examples::
+
+ print(soup.prettify(formatter=None))
+ # <p>
+ #  Il a dit <<Sacré bleu!>>
+ # </p>
+
+ link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>', 'html.parser')
+ print(link_soup.a.encode(formatter=None))
+ # b'<a href="http://example.com/?foo=val1&bar=val2">A link</a>'
+
+If you need more sophisticated control over your output, you can
+use Beautiful Soup's ``Formatter`` class.
+Here's a formatter that
+converts strings to uppercase, whether they occur in a text node or in an
+attribute value::
+
+ from bs4.formatter import HTMLFormatter
+ def uppercase(str):
+     return str.upper()
+
+ formatter = HTMLFormatter(uppercase)
+
+ print(soup.prettify(formatter=formatter))
+ # <p>
+ #  IL A DIT <<SACRÉ BLEU!>>
+ # </p>
+
+ print(link_soup.a.prettify(formatter=formatter))
+ # <a href="HTTP://EXAMPLE.COM/?FOO=VAL1&BAR=VAL2">
+ #  A LINK
+ # </a>
+
+Subclassing ``HTMLFormatter`` or ``XMLFormatter`` will give you even
+more control over the output. For example, Beautiful Soup sorts the
+attributes in every tag by default::
+
+ attr_soup = BeautifulSoup(b'<p z="1" m="2" a="3"></p>', 'html.parser')
+ print(attr_soup.p.encode())
+ # <p a="3" m="2" z="1"></p>
+
+To turn this off, you can override the ``Formatter.attributes()``
+method in a subclass. This method controls which attributes are output
+and in what order. The implementation below also filters out the
+attribute called "m" whenever it appears::
+
+ class UnsortedAttributes(HTMLFormatter):
+     def attributes(self, tag):
+         for k, v in tag.attrs.items():
+             if k == 'm':
+                 continue
+             yield k, v
+
+ print(attr_soup.p.encode(formatter=UnsortedAttributes()))
+ # <p z="1" a="3"></p>
+
+One last caveat: if you create a ``CData`` object, the text inside
+that object is always presented `exactly as it appears, with no
+formatting`. Beautiful Soup will call your entity substitution
+function, just in case you've written a custom function that counts
+all the strings in the document or something, but it will ignore the
+return value::
+
+ from bs4.element import CData
+ soup = BeautifulSoup("<a></a>", 'html.parser')
+ soup.a.string = CData("one < three")
+ print(soup.a.prettify(formatter="html"))
+ # <a>
+ #  <![CDATA[one < three]]>
+ # </a>
+
+
+``get_text()``
+--------------
+
+If you only want the human-readable text inside a document or tag, you can use the
+``get_text()`` method. It returns all the text in a document or
+beneath a tag, as a single Unicode string::
+
+ markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
+ soup = BeautifulSoup(markup, 'html.parser')
+
+ soup.get_text()
+ '\nI linked to example.com\n'
+ soup.i.get_text()
+ 'example.com'
+
+You can specify a string to be used to join the bits of text
+together::
+
+ soup.get_text("|")
+ '\nI linked to |example.com|\n'
+
+You can tell Beautiful Soup to strip whitespace from the beginning and
+end of each bit of text::
+
+ soup.get_text("|", strip=True)
+ 'I linked to|example.com'
+
+But at that point you might want to use the :ref:`.stripped_strings <string-generators>`
+generator instead, and process the text yourself::
+
+ [text for text in soup.stripped_strings]
+ # ['I linked to', 'example.com']
+
+*As of Beautiful Soup version 4.9.0, when lxml or html.parser are in
+use, the contents of <script>, <style>, and <template>
+tags are not considered to be 'text', since those tags are not part of
+the human-visible content of the page.*
+
+
+Specifying the parser to use
+============================
+
+If you just need to parse some HTML, you can dump the markup into the
+``BeautifulSoup`` constructor, and it'll probably be fine. Beautiful
+Soup will pick a parser for you and parse the data. But there are a
+few additional arguments you can pass in to the constructor to change
+which parser is used.
+
+The first argument to the ``BeautifulSoup`` constructor is a string or
+an open filehandle--the markup you want parsed.
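+For example, either of these calls builds a tree from markup (the
+filename ``index.html`` is just a stand-in for a file of your own)::
+
+ with open("index.html") as fp:
+     soup = BeautifulSoup(fp, 'html.parser')
+
+ soup = BeautifulSoup("<html>a web page</html>", 'html.parser')
+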
The second argument is +`how` you'd like the markup parsed. + +If you don't specify anything, you'll get the best HTML parser that's +installed. Beautiful Soup ranks lxml's parser as being the best, then +html5lib's, then Python's built-in parser. You can override this by +specifying one of the following: + +* What type of markup you want to parse. Currently supported are + "html", "xml", and "html5". + +* The name of the parser library you want to use. Currently supported + options are "lxml", "html5lib", and "html.parser" (Python's + built-in HTML parser). + +The section `Installing a parser`_ contrasts the supported parsers. + +If you don't have an appropriate parser installed, Beautiful Soup will +ignore your request and pick a different parser. Right now, the only +supported XML parser is lxml. If you don't have lxml installed, asking +for an XML parser won't give you one, and asking for "lxml" won't work +either. + +Differences between parsers +--------------------------- + +Beautiful Soup presents the same interface to a number of different +parsers, but each parser is different. Different parsers will create +different parse trees from the same document. The biggest differences +are between the HTML parsers and the XML parsers. Here's a short +document, parsed as HTML using the parser that comes with Python:: + + BeautifulSoup("<a><b/></a>", "html.parser") + # <a><b></b></a> + +Since a standalone <b/> tag is not valid HTML, html.parser turns it into +a <b></b> tag pair. + +Here's the same document parsed as XML (running this requires that you +have lxml installed). Note that the standalone <b/> tag is left alone, and +that the document is given an XML declaration instead of being put +into an <html> tag.:: + + print(BeautifulSoup("<a><b/></a>", "xml")) + # <?xml version="1.0" encoding="utf-8"?> + # <a><b/></a> + +There are also differences between HTML parsers. If you give Beautiful +Soup a perfectly-formed HTML document, these differences won't +matter. One parser will be faster than another, but they'll all give +you a data structure that looks exactly like the original HTML +document. + +But if the document is not perfectly-formed, different parsers will +give different results. Here's a short, invalid document parsed using +lxml's HTML parser. Note that the <a> tag gets wrapped in <body> and +<html> tags, and the dangling </p> tag is simply ignored:: + + BeautifulSoup("<a></p>", "lxml") + # <html><body><a></a></body></html> + +Here's the same document parsed using html5lib:: + + BeautifulSoup("<a></p>", "html5lib") + # <html><head></head><body><a><p></p></a></body></html> + +Instead of ignoring the dangling </p> tag, html5lib pairs it with an +opening <p> tag. html5lib also adds an empty <head> tag; lxml didn't +bother. + +Here's the same document parsed with Python's built-in HTML +parser:: + + BeautifulSoup("<a></p>", "html.parser") + # <a></a> + +Like html5lib, this parser ignores the closing </p> tag. Unlike +html5lib or lxml, this parser makes no attempt to create a +well-formed HTML document by adding <html> or <body> tags. + +Since the document "<a></p>" is invalid, none of these techniques is +the 'correct' way to handle it. The html5lib parser uses techniques +that are part of the HTML5 standard, so it has the best claim on being +the 'correct' way, but all three techniques are legitimate. + +Differences between parsers can affect your script. 
If you're planning
+on distributing your script to other people, or running it on multiple
+machines, you should specify a parser in the ``BeautifulSoup``
+constructor. That will reduce the chances that your users parse a
+document differently from the way you parse it.
+
+Encodings
+=========
+
+Any HTML or XML document is written in a specific encoding like ASCII
+or UTF-8. But when you load that document into Beautiful Soup, you'll
+discover it's been converted to Unicode::
+
+ markup = b"<h1>Sacr\xc3\xa9 bleu!</h1>"
+ soup = BeautifulSoup(markup, 'html.parser')
+ soup.h1
+ # <h1>Sacré bleu!</h1>
+ soup.h1.string
+ # 'Sacré bleu!'
+
+It's not magic. (That sure would be nice.) Beautiful Soup uses a
+sub-library called `Unicode, Dammit`_ to detect a document's encoding
+and convert it to Unicode. The autodetected encoding is available as
+the ``.original_encoding`` attribute of the ``BeautifulSoup`` object::
+
+ soup.original_encoding
+ 'utf-8'
+
+Unicode, Dammit guesses correctly most of the time, but sometimes it
+makes mistakes. Sometimes it guesses correctly, but only after a
+byte-by-byte search of the document that takes a very long time. If
+you happen to know a document's encoding ahead of time, you can avoid
+mistakes and delays by passing it to the ``BeautifulSoup`` constructor
+as ``from_encoding``.
+
+Here's a document written in ISO-8859-8. The document is so short that
+Unicode, Dammit can't get a lock on it, and misidentifies it as
+ISO-8859-7::
+
+ markup = b"<h1>\xed\xe5\xec\xf9</h1>"
+ soup = BeautifulSoup(markup, 'html.parser')
+ print(soup.h1)
+ # <h1>νεμω</h1>
+ print(soup.original_encoding)
+ # iso-8859-7
+
+We can fix this by passing in the correct ``from_encoding``::
+
+ soup = BeautifulSoup(markup, 'html.parser', from_encoding="iso-8859-8")
+ print(soup.h1)
+ # <h1>םולש</h1>
+ print(soup.original_encoding)
+ # iso8859-8
+
+If you don't know what the correct encoding is, but you know that
+Unicode, Dammit is guessing wrong, you can pass the wrong guesses in
+as ``exclude_encodings``::
+
+ soup = BeautifulSoup(markup, 'html.parser', exclude_encodings=["iso-8859-7"])
+ print(soup.h1)
+ # <h1>םולש</h1>
+ print(soup.original_encoding)
+ # WINDOWS-1255
+
+Windows-1255 isn't 100% correct, but that encoding is a compatible
+superset of ISO-8859-8, so it's close enough. (``exclude_encodings``
+is a new feature in Beautiful Soup 4.4.0.)
+
+In rare cases (usually when a UTF-8 document contains text written in
+a completely different encoding), the only way to get Unicode may be
+to replace some characters with the special Unicode character
+"REPLACEMENT CHARACTER" (U+FFFD, �). If Unicode, Dammit needs to do
+this, it will set the ``.contains_replacement_characters`` attribute
+to ``True`` on the ``UnicodeDammit`` or ``BeautifulSoup`` object. This
+lets you know that the Unicode representation is not an exact
+representation of the original--some data was lost. If a document
+contains �, but ``.contains_replacement_characters`` is ``False``,
+you'll know that the � was there originally (as it is in this
+paragraph) and doesn't stand in for missing data.
+
+Output encoding
+---------------
+
+When you write out a document from Beautiful Soup, you get a UTF-8
+document, even if the document wasn't in UTF-8 to begin with.
+Here's a
+document written in the Latin-1 encoding::
+
+ markup = b'''
+  <html>
+   <head>
+    <meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
+   </head>
+   <body>
+    <p>Sacr\xe9 bleu!</p>
+   </body>
+  </html>
+ '''
+
+ soup = BeautifulSoup(markup, 'html.parser')
+ print(soup.prettify())
+ # <html>
+ #  <head>
+ #   <meta content="text/html; charset=utf-8" http-equiv="Content-type" />
+ #  </head>
+ #  <body>
+ #   <p>
+ #    Sacré bleu!
+ #   </p>
+ #  </body>
+ # </html>
+
+Note that the <meta> tag has been rewritten to reflect the fact that
+the document is now in UTF-8.
+
+If you don't want UTF-8, you can pass an encoding into ``prettify()``::
+
+ print(soup.prettify("latin-1"))
+ # <html>
+ #  <head>
+ #   <meta content="text/html; charset=latin-1" http-equiv="Content-type" />
+ # ...
+
+You can also call encode() on the ``BeautifulSoup`` object, or any
+element in the soup, just as if it were a Python string::
+
+ soup.p.encode("latin-1")
+ # b'<p>Sacr\xe9 bleu!</p>'
+
+ soup.p.encode("utf-8")
+ # b'<p>Sacr\xc3\xa9 bleu!</p>'
+
+Any characters that can't be represented in your chosen encoding will
+be converted into numeric XML entity references. Here's a document
+that includes the Unicode character SNOWMAN::
+
+ markup = u"<b>\N{SNOWMAN}</b>"
+ snowman_soup = BeautifulSoup(markup, 'html.parser')
+ tag = snowman_soup.b
+
+The SNOWMAN character can be part of a UTF-8 document (it looks like
+☃), but there's no representation for that character in ISO-Latin-1 or
+ASCII, so it's converted into "&#9731;" for those encodings::
+
+ print(tag.encode("utf-8"))
+ # b'<b>\xe2\x98\x83</b>'
+
+ print(tag.encode("latin-1"))
+ # b'<b>&#9731;</b>'
+
+ print(tag.encode("ascii"))
+ # b'<b>&#9731;</b>'
+
+Unicode, Dammit
+---------------
+
+You can use Unicode, Dammit without using Beautiful Soup. It's useful
+whenever you have data in an unknown encoding and you just want it to
+become Unicode::
+
+ from bs4 import UnicodeDammit
+ dammit = UnicodeDammit(b"Sacr\xc3\xa9 bleu!")
+ print(dammit.unicode_markup)
+ # Sacré bleu!
+ dammit.original_encoding
+ # 'utf-8'
+
+Unicode, Dammit's guesses will get a lot more accurate if you install
+the ``chardet`` or ``cchardet`` Python libraries. The more data you
+give Unicode, Dammit, the more accurately it will guess. If you have
+your own suspicions as to what the encoding might be, you can pass
+them in as a list::
+
+ dammit = UnicodeDammit(b"Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
+ print(dammit.unicode_markup)
+ # Sacré bleu!
+ dammit.original_encoding
+ # 'latin-1'
+
+Unicode, Dammit has two special features that Beautiful Soup doesn't
+use.
+
+Smart quotes
+^^^^^^^^^^^^
+
+You can use Unicode, Dammit to convert Microsoft smart quotes to HTML or XML
+entities::
+
+ markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
+ # '<p>I just &ldquo;love&rdquo; Microsoft Word&rsquo;s smart quotes</p>'
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup
+ # '<p>I just &#x201C;love&#x201D; Microsoft Word&#x2019;s smart quotes</p>'
+
+You can also convert Microsoft smart quotes to ASCII quotes::
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="ascii").unicode_markup
+ # '<p>I just "love" Microsoft Word\'s smart quotes</p>'
+
+Hopefully you'll find this feature useful, but Beautiful Soup doesn't
+use it.
+Beautiful Soup prefers the default behavior, which is to
+convert Microsoft smart quotes to Unicode characters along with
+everything else::
+
+ UnicodeDammit(markup, ["windows-1252"]).unicode_markup
+ # '<p>I just “love” Microsoft Word’s smart quotes</p>'
+
+Inconsistent encodings
+^^^^^^^^^^^^^^^^^^^^^^
+
+Sometimes a document is mostly in UTF-8, but contains Windows-1252
+characters such as (again) Microsoft smart quotes. This can happen
+when a website includes data from multiple sources. You can use
+``UnicodeDammit.detwingle()`` to turn such a document into pure
+UTF-8. Here's a simple example::
+
+ snowmen = (u"\N{SNOWMAN}" * 3)
+ quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
+ doc = snowmen.encode("utf8") + quote.encode("windows_1252")
+
+This document is a mess. The snowmen are in UTF-8 and the quotes are
+in Windows-1252. You can display the snowmen or the quotes, but not
+both::
+
+ print(doc.decode("utf8", "replace"))
+ # ☃☃☃�I like snowmen!�
+
+ print(doc.decode("windows-1252"))
+ # â˜ƒâ˜ƒâ˜ƒ“I like snowmen!”
+
+Strictly decoding the document as UTF-8 raises a ``UnicodeDecodeError``, and
+decoding it as Windows-1252 gives you gibberish. Fortunately,
+``UnicodeDammit.detwingle()`` will convert the string to pure UTF-8,
+allowing you to decode it to Unicode and display the snowmen and quote
+marks simultaneously::
+
+ new_doc = UnicodeDammit.detwingle(doc)
+ print(new_doc.decode("utf8"))
+ # ☃☃☃“I like snowmen!”
+
+``UnicodeDammit.detwingle()`` only knows how to handle Windows-1252
+embedded in UTF-8 (or vice versa, I suppose), but this is the most
+common case.
+
+Note that you must call ``UnicodeDammit.detwingle()`` on your
+data before passing it into ``BeautifulSoup`` or the ``UnicodeDammit``
+constructor. Beautiful Soup assumes that a document has a single
+encoding, whatever it might be. If you pass it a document that
+contains both UTF-8 and Windows-1252, it's likely to think the whole
+document is Windows-1252, and the document will come out looking like
+``â˜ƒâ˜ƒâ˜ƒ“I like snowmen!”``.
+
+``UnicodeDammit.detwingle()`` is new in Beautiful Soup 4.1.0.
+
+Line numbers
+============
+
+The ``html.parser`` and ``html5lib`` parsers can keep track of where in
+the original document each Tag was found. You can access this
+information as ``Tag.sourceline`` (line number) and ``Tag.sourcepos``
+(position of the start tag within a line)::
+
+ markup = "<p\n>Paragraph 1</p>\n <p>Paragraph 2</p>"
+ soup = BeautifulSoup(markup, 'html.parser')
+ for tag in soup.find_all('p'):
+     print(repr((tag.sourceline, tag.sourcepos, tag.string)))
+ # (1, 0, 'Paragraph 1')
+ # (3, 4, 'Paragraph 2')
+
+Note that the two parsers mean slightly different things by
+``sourceline`` and ``sourcepos``. For html.parser, these numbers
+represent the position of the initial less-than sign.
+For html5lib,
+these numbers represent the position of the final greater-than sign::
+
+ soup = BeautifulSoup(markup, 'html5lib')
+ for tag in soup.find_all('p'):
+     print(repr((tag.sourceline, tag.sourcepos, tag.string)))
+ # (2, 0, 'Paragraph 1')
+ # (3, 6, 'Paragraph 2')
+
+You can shut off this feature by passing ``store_line_numbers=False``
+into the ``BeautifulSoup`` constructor::
+
+ markup = "<p\n>Paragraph 1</p>\n <p>Paragraph 2</p>"
+ soup = BeautifulSoup(markup, 'html.parser', store_line_numbers=False)
+ print(soup.p.sourceline)
+ # None
+
+`This feature is new in 4.8.1, and the parsers based on lxml don't
+support it.`
+
+Comparing objects for equality
+==============================
+
+Beautiful Soup says that two ``NavigableString`` or ``Tag`` objects
+are equal when they represent the same HTML or XML markup. In this
+example, the two <b> tags are treated as equal, even though they live
+in different parts of the object tree, because they both look like
+"<b>pizza</b>"::
+
+ markup = "<p>I want <b>pizza</b> and more <b>pizza</b>!</p>"
+ soup = BeautifulSoup(markup, 'html.parser')
+ first_b, second_b = soup.find_all('b')
+ print(first_b == second_b)
+ # True
+
+ print(first_b.previous_element == second_b.previous_element)
+ # False
+
+If you want to see whether two variables refer to exactly the same
+object, use `is`::
+
+ print(first_b is second_b)
+ # False
+
+Copying Beautiful Soup objects
+==============================
+
+You can use ``copy.copy()`` to create a copy of any ``Tag`` or
+``NavigableString``::
+
+ import copy
+ p_copy = copy.copy(soup.p)
+ print(p_copy)
+ # <p>I want <b>pizza</b> and more <b>pizza</b>!</p>
+
+The copy is considered equal to the original, since it represents the
+same markup as the original, but it's not the same object::
+
+ print(soup.p == p_copy)
+ # True
+
+ print(soup.p is p_copy)
+ # False
+
+The only real difference is that the copy is completely detached from
+the original Beautiful Soup object tree, just as if ``extract()`` had
+been called on it::
+
+ print(p_copy.parent)
+ # None
+
+This is because two different ``Tag`` objects can't occupy the same
+space at the same time.
+
+Advanced parser customization
+=============================
+
+Beautiful Soup offers a number of ways to customize how the parser
+treats incoming HTML and XML. This section covers the most commonly
+used customization techniques.
+
+Parsing only part of a document
+-------------------------------
+
+Let's say you want to use Beautiful Soup to look at a document's <a>
+tags. It's a waste of time and memory to parse the entire document and
+then go over it again looking for <a> tags. It would be much faster to
+ignore everything that wasn't an <a> tag in the first place. The
+``SoupStrainer`` class allows you to choose which parts of an incoming
+document are parsed. You just create a ``SoupStrainer`` and pass it in
+to the ``BeautifulSoup`` constructor as the ``parse_only`` argument.
+
+(Note that *this feature won't work if you're using the html5lib parser*.
+If you use html5lib, the whole document will be parsed, no
+matter what. This is because html5lib constantly rearranges the parse
+tree as it works, and if some part of the document didn't actually
+make it into the parse tree, it'll crash. To avoid confusion, in the
+examples below I'll be forcing Beautiful Soup to use Python's
+built-in parser.)
+
+``SoupStrainer``
+^^^^^^^^^^^^^^^^
+
+The ``SoupStrainer`` class takes the same arguments as a typical
+method from `Searching the tree`_: :ref:`name <name>`, :ref:`attrs
+<attrs>`, :ref:`string <string>`, and :ref:`**kwargs <kwargs>`. Here are
+three ``SoupStrainer`` objects::
+
+ from bs4 import SoupStrainer
+
+ only_a_tags = SoupStrainer("a")
+
+ only_tags_with_id_link2 = SoupStrainer(id="link2")
+
+ def is_short_string(string):
+     return string is not None and len(string) < 10
+
+ only_short_strings = SoupStrainer(string=is_short_string)
+
+I'm going to bring back the "three sisters" document one more time,
+and we'll see what the document looks like when it's parsed with these
+three ``SoupStrainer`` objects::
+
+ html_doc = """<html><head><title>The Dormouse's story</title></head>
+ <body>
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())
+ # <a class="sister" href="http://example.com/elsie" id="link1">
+ #  Elsie
+ # </a>
+ # <a class="sister" href="http://example.com/lacie" id="link2">
+ #  Lacie
+ # </a>
+ # <a class="sister" href="http://example.com/tillie" id="link3">
+ #  Tillie
+ # </a>
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())
+ # <a class="sister" href="http://example.com/lacie" id="link2">
+ #  Lacie
+ # </a>
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
+ # Elsie
+ # ,
+ # Lacie
+ # and
+ # Tillie
+ # ...
+ #
+
+You can also pass a ``SoupStrainer`` into any of the methods covered
+in `Searching the tree`_. This probably isn't terribly useful, but I
+thought I'd mention it::
+
+ soup = BeautifulSoup(html_doc, 'html.parser')
+ soup.find_all(only_short_strings)
+ # ['\n\n', '\n\n', 'Elsie', ',\n', 'Lacie', ' and\n', 'Tillie',
+ #  '\n\n', '...', '\n']
+
+Customizing multi-valued attributes
+-----------------------------------
+
+In an HTML document, an attribute like ``class`` is given a list of
+values, and an attribute like ``id`` is given a single value, because
+the HTML specification treats those attributes differently::
+
+ markup = '<a class="cls1 cls2" id="id1 id2">'
+ soup = BeautifulSoup(markup, 'html.parser')
+ soup.a['class']
+ # ['cls1', 'cls2']
+ soup.a['id']
+ # 'id1 id2'
+
+You can turn this off by passing in
+``multi_valued_attributes=None``. Then every attribute will be given a
+single value::
+
+ soup = BeautifulSoup(markup, 'html.parser', multi_valued_attributes=None)
+ soup.a['class']
+ # 'cls1 cls2'
+ soup.a['id']
+ # 'id1 id2'
+
+You can customize this behavior quite a bit by passing in a
+dictionary for ``multi_valued_attributes``. If you need this, look at
+``HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`` to see the
+configuration Beautiful Soup uses by default, which is based on the
+HTML specification.
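+For example, here's a minimal sketch using the same dictionary shape
+as ``DEFAULT_CDATA_LIST_ATTRIBUTES``: keys are tag names (or '*',
+meaning "any tag"), and values are lists of attribute names to treat
+as multi-valued::
+
+ multi = {'*': ['class'], 'a': ['rel']}
+ markup = '<a class="cls1 cls2" rel="nofollow noreferrer">'
+ soup = BeautifulSoup(markup, 'html.parser', multi_valued_attributes=multi)
+ soup.a['class']
+ # ['cls1', 'cls2']
+ soup.a['rel']
+ # ['nofollow', 'noreferrer']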
+ +`(This is a new feature in Beautiful Soup 4.8.0.)` + +Handling duplicate attributes +----------------------------- + +When using the ``html.parser`` parser, you can use the +``on_duplicate_attribute`` constructor argument to customize what +Beautiful Soup does when it encounters a tag that defines the same +attribute more than once:: + + markup = '<a href="http://url1/" href="http://url2/">' + +The default behavior is to use the last value found for the tag:: + + soup = BeautifulSoup(markup, 'html.parser') + soup.a['href'] + # http://url2/ + + soup = BeautifulSoup(markup, 'html.parser', on_duplicate_attribute='replace') + soup.a['href'] + # http://url2/ + +With ``on_duplicate_attribute='ignore'`` you can tell Beautiful Soup +to use the `first` value found and ignore the rest:: + + soup = BeautifulSoup(markup, 'html.parser', on_duplicate_attribute='ignore') + soup.a['href'] + # http://url1/ + +(lxml and html5lib always do it this way; their behavior can't be +configured from within Beautiful Soup.) + +If you need more, you can pass in a function that's called on each duplicate value:: + + def accumulate(attributes_so_far, key, value): + if not isinstance(attributes_so_far[key], list): + attributes_so_far[key] = [attributes_so_far[key]] + attributes_so_far[key].append(value) + + soup = BeautifulSoup(markup, 'html.parser', on_duplicate_attribute=accumulate) + soup.a['href'] + # ["http://url1/", "http://url2/"] + +`(This is a new feature in Beautiful Soup 4.9.1.)` + +Instantiating custom subclasses +------------------------------- + +When a parser tells Beautiful Soup about a tag or a string, Beautiful +Soup will instantiate a ``Tag`` or ``NavigableString`` object to +contain that information. Instead of that default behavior, you can +tell Beautiful Soup to instantiate `subclasses` of ``Tag`` or +``NavigableString``, subclasses you define with custom behavior:: + + from bs4 import Tag, NavigableString + class MyTag(Tag): + pass + + + class MyString(NavigableString): + pass + + + markup = "<div>some text</div>" + soup = BeautifulSoup(markup, 'html.parser') + isinstance(soup.div, MyTag) + # False + isinstance(soup.div.string, MyString) + # False + + my_classes = { Tag: MyTag, NavigableString: MyString } + soup = BeautifulSoup(markup, 'html.parser', element_classes=my_classes) + isinstance(soup.div, MyTag) + # True + isinstance(soup.div.string, MyString) + # True + +This can be useful when incorporating Beautiful Soup into a test +framework. + +`(This is a new feature in Beautiful Soup 4.8.1.)` + +Troubleshooting +=============== + +.. _diagnose: + +``diagnose()`` +-------------- + +If you're having trouble understanding what Beautiful Soup does to a +document, pass the document into the ``diagnose()`` function. (New in +Beautiful Soup 4.2.0.) Beautiful Soup will print out a report showing +you how different parsers handle the document, and tell you if you're +missing a parser that Beautiful Soup could be using:: + + from bs4.diagnose import diagnose + with open("bad.html") as fp: + data = fp.read() + + diagnose(data) + + # Diagnostic running on Beautiful Soup 4.2.0 + # Python version 2.7.3 (default, Aug 1 2012, 05:16:07) + # I noticed that html5lib is not installed. Installing it may help. + # Found lxml version 2.3.2.0 + # + # Trying to parse your data with html.parser + # Here's what html.parser did with the document: + # ... + +Just looking at the output of diagnose() may show you how to solve the +problem. Even if not, you can paste the output of ``diagnose()`` when +asking for help. 
+ +Errors when parsing a document +------------------------------ + +There are two different kinds of parse errors. There are crashes, +where you feed a document to Beautiful Soup and it raises an +exception, usually an ``HTMLParser.HTMLParseError``. And there is +unexpected behavior, where a Beautiful Soup parse tree looks a lot +different than the document used to create it. + +Almost none of these problems turn out to be problems with Beautiful +Soup. This is not because Beautiful Soup is an amazingly well-written +piece of software. It's because Beautiful Soup doesn't include any +parsing code. Instead, it relies on external parsers. If one parser +isn't working on a certain document, the best solution is to try a +different parser. See `Installing a parser`_ for details and a parser +comparison. + +The most common parse errors are ``HTMLParser.HTMLParseError: +malformed start tag`` and ``HTMLParser.HTMLParseError: bad end +tag``. These are both generated by Python's built-in HTML parser +library, and the solution is to :ref:`install lxml or +html5lib. <parser-installation>` + +The most common type of unexpected behavior is that you can't find a +tag that you know is in the document. You saw it going in, but +``find_all()`` returns ``[]`` or ``find()`` returns ``None``. This is +another common problem with Python's built-in HTML parser, which +sometimes skips tags it doesn't understand. Again, the best solution is to +:ref:`install lxml or html5lib. <parser-installation>` + +Version mismatch problems +------------------------- + +* ``SyntaxError: Invalid syntax`` (on the line ``ROOT_TAG_NAME = + '[document]'``): Caused by running the Python 2 version of + Beautiful Soup under Python 3, without converting the code. + +* ``ImportError: No module named HTMLParser`` - Caused by running the + Python 2 version of Beautiful Soup under Python 3. + +* ``ImportError: No module named html.parser`` - Caused by running the + Python 3 version of Beautiful Soup under Python 2. + +* ``ImportError: No module named BeautifulSoup`` - Caused by running + Beautiful Soup 3 code on a system that doesn't have BS3 + installed. Or, by writing Beautiful Soup 4 code without knowing that + the package name has changed to ``bs4``. + +* ``ImportError: No module named bs4`` - Caused by running Beautiful + Soup 4 code on a system that doesn't have BS4 installed. + +.. _parsing-xml: + +Parsing XML +----------- + +By default, Beautiful Soup parses documents as HTML. To parse a +document as XML, pass in "xml" as the second argument to the +``BeautifulSoup`` constructor:: + + soup = BeautifulSoup(markup, "xml") + +You'll need to :ref:`have lxml installed <parser-installation>`. + +Other parser problems +--------------------- + +* If your script works on one computer but not another, or in one + virtual environment but not another, or outside the virtual + environment but not inside, it's probably because the two + environments have different parser libraries available. For example, + you may have developed the script on a computer that has lxml + installed, and then tried to run it on a computer that only has + html5lib installed. See `Differences between parsers`_ for why this + matters, and fix the problem by mentioning a specific parser library + in the ``BeautifulSoup`` constructor. + +* Because `HTML tags and attributes are case-insensitive + <http://www.w3.org/TR/html5/syntax.html#syntax>`_, all three HTML + parsers convert tag and attribute names to lowercase. 
That is, the
+  markup <TAG></TAG> is converted to <tag></tag>. If you want to
+  preserve mixed-case or uppercase tags and attributes, you'll need to
+  :ref:`parse the document as XML. <parsing-xml>`
+
+.. _misc:
+
+Miscellaneous
+-------------
+
+* ``UnicodeEncodeError: 'charmap' codec can't encode character
+  '\xfoo' in position bar`` (or just about any other
+  ``UnicodeEncodeError``) - This problem shows up in two main
+  situations. First, when you try to print a Unicode character that
+  your console doesn't know how to display. (See `this page on the
+  Python wiki <http://wiki.python.org/moin/PrintFails>`_ for help.)
+  Second, when you're writing to a file and you pass in a Unicode
+  character that's not supported by your default encoding. In this
+  case, the simplest solution is to explicitly encode the Unicode
+  string into UTF-8 with ``u.encode("utf8")``.
+
+* ``KeyError: [attr]`` - Caused by accessing ``tag['attr']`` when the
+  tag in question doesn't define the ``attr`` attribute. The most
+  common errors are ``KeyError: 'href'`` and ``KeyError: 'class'``.
+  Use ``tag.get('attr')`` if you're not sure ``attr`` is
+  defined, just as you would with a Python dictionary.
+
+* ``AttributeError: 'ResultSet' object has no attribute 'foo'`` - This
+  usually happens because you expected ``find_all()`` to return a
+  single tag or string. But ``find_all()`` returns a `list` of tags
+  and strings--a ``ResultSet`` object. You need to iterate over the
+  list and look at the ``.foo`` of each one. Or, if you really only
+  want one result, you need to use ``find()`` instead of
+  ``find_all()``.
+
+* ``AttributeError: 'NoneType' object has no attribute 'foo'`` - This
+  usually happens because you called ``find()`` and then tried to
+  access the ``.foo`` attribute of the result. But in your case,
+  ``find()`` didn't find anything, so it returned ``None``, instead of
+  returning a tag or a string. You need to figure out why your
+  ``find()`` call isn't returning anything.
+
+* ``AttributeError: 'NavigableString' object has no attribute
+  'foo'`` - This usually happens because you're treating a string as
+  though it were a tag. You may be iterating over a list, expecting
+  that it contains nothing but tags, when it actually contains both tags and
+  strings.
+
+
+Improving Performance
+---------------------
+
+Beautiful Soup will never be as fast as the parsers it sits on top
+of. If response time is critical, if you're paying for computer time
+by the hour, or if there's any other reason why computer time is more
+valuable than programmer time, you should forget about Beautiful Soup
+and work directly atop `lxml <http://lxml.de/>`_.
+
+That said, there are things you can do to speed up Beautiful Soup. If
+you're not using lxml as the underlying parser, my advice is to
+:ref:`start <parser-installation>`. Beautiful Soup parses documents
+significantly faster using lxml than using html.parser or html5lib.
+
+You can speed up encoding detection significantly by installing the
+`cchardet <http://pypi.python.org/pypi/cchardet/>`_ library.
+
+`Parsing only part of a document`_ won't save you much time parsing
+the document, but it can save a lot of memory, and it'll make
+`searching` the document much faster.
+
+Translating this documentation
+==============================
+
+New translations of the Beautiful Soup documentation are greatly
+appreciated. Translations should be licensed under the MIT license,
+just like Beautiful Soup and its English documentation are.
+
+There are two ways of getting your translation into the main code base
+and onto the Beautiful Soup website:
+
+1. Create a branch of the Beautiful Soup repository, add your
+   translation, and propose a merge with the main branch, the same
+   as you would do with a proposed change to the source code.
+2. Send a message to the Beautiful Soup discussion group with a link to
+   your translation, or attach your translation to the message.
+
+Use the Chinese or Brazilian Portuguese translations as your model. In
+particular, please translate the source file ``doc/source/index.rst``,
+rather than the HTML version of the documentation. This makes it
+possible to publish the documentation in a variety of formats, not
+just HTML.
+
+Beautiful Soup 3
+================
+
+Beautiful Soup 3 is the previous release series, and is no longer
+being actively developed. It's currently packaged with all major Linux
+distributions:
+
+:kbd:`$ apt-get install python-beautifulsoup`
+
+It's also published through PyPI as ``BeautifulSoup``:
+
+:kbd:`$ easy_install BeautifulSoup`
+
+:kbd:`$ pip install BeautifulSoup`
+
+You can also `download a tarball of Beautiful Soup 3.2.0
+<http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz>`_.
+
+If you ran ``easy_install beautifulsoup`` or ``easy_install
+BeautifulSoup``, but your code doesn't work, you installed Beautiful
+Soup 3 by mistake. You need to run ``easy_install beautifulsoup4``.
+
+`The documentation for Beautiful Soup 3 is archived online
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_.
+
+Porting code to BS4
+-------------------
+
+Most code written against Beautiful Soup 3 will work against Beautiful
+Soup 4 with one simple change. All you should have to do is change the
+package name from ``BeautifulSoup`` to ``bs4``. So this::
+
+ from BeautifulSoup import BeautifulSoup
+
+becomes this::
+
+ from bs4 import BeautifulSoup
+
+* If you get the ``ImportError`` "No module named BeautifulSoup", your
+  problem is that you're trying to run Beautiful Soup 3 code, but you
+  only have Beautiful Soup 4 installed.
+
+* If you get the ``ImportError`` "No module named bs4", your problem
+  is that you're trying to run Beautiful Soup 4 code, but you only
+  have Beautiful Soup 3 installed.
+
+Although BS4 is mostly backwards-compatible with BS3, most of its
+methods have been deprecated and given new names for `PEP 8 compliance
+<http://www.python.org/dev/peps/pep-0008/>`_. There are numerous other
+renames and changes, and a few of them break backwards compatibility.
+
+Here's what you'll need to know to convert your BS3 code and habits to BS4:
+
+You need a parser
+^^^^^^^^^^^^^^^^^
+
+Beautiful Soup 3 used Python's ``SGMLParser``, a module that was
+deprecated and removed in Python 3.0. Beautiful Soup 4 uses
+``html.parser`` by default, but you can plug in lxml or html5lib and
+use that instead. See `Installing a parser`_ for a comparison.
+
+Since ``html.parser`` is not the same parser as ``SGMLParser``, you
+may find that Beautiful Soup 4 gives you a different parse tree than
+Beautiful Soup 3 for the same markup. If you swap out ``html.parser``
+for lxml or html5lib, you may find that the parse tree changes yet
+again. If this happens, you'll need to update your scraping code to
+deal with the new tree.
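+
+A good habit when porting is to name the parser explicitly, so the
+tree stays the same everywhere your script runs; for example::
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(markup, 'html.parser')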
+
+Method names
+^^^^^^^^^^^^
+
+* ``renderContents`` -> ``encode_contents``
+* ``replaceWith`` -> ``replace_with``
+* ``replaceWithChildren`` -> ``unwrap``
+* ``findAll`` -> ``find_all``
+* ``findAllNext`` -> ``find_all_next``
+* ``findAllPrevious`` -> ``find_all_previous``
+* ``findNext`` -> ``find_next``
+* ``findNextSibling`` -> ``find_next_sibling``
+* ``findNextSiblings`` -> ``find_next_siblings``
+* ``findParent`` -> ``find_parent``
+* ``findParents`` -> ``find_parents``
+* ``findPrevious`` -> ``find_previous``
+* ``findPreviousSibling`` -> ``find_previous_sibling``
+* ``findPreviousSiblings`` -> ``find_previous_siblings``
+* ``getText`` -> ``get_text``
+* ``nextSibling`` -> ``next_sibling``
+* ``previousSibling`` -> ``previous_sibling``
+
+Some arguments to the Beautiful Soup constructor were renamed for the
+same reasons:
+
+* ``BeautifulSoup(parseOnlyThese=...)`` -> ``BeautifulSoup(parse_only=...)``
+* ``BeautifulSoup(fromEncoding=...)`` -> ``BeautifulSoup(from_encoding=...)``
+
+I renamed one method for compatibility with Python 3:
+
+* ``Tag.has_key()`` -> ``Tag.has_attr()``
+
+I renamed one attribute to use more accurate terminology:
+
+* ``Tag.isSelfClosing`` -> ``Tag.is_empty_element``
+
+I renamed three attributes to avoid using words that have special
+meaning to Python. Unlike the others, these changes are *not backwards
+compatible.* If you used these attributes in BS3, your code will break
+on BS4 until you change them.
+
+* ``UnicodeDammit.unicode`` -> ``UnicodeDammit.unicode_markup``
+* ``Tag.next`` -> ``Tag.next_element``
+* ``Tag.previous`` -> ``Tag.previous_element``
+
+These methods are left over from the Beautiful Soup 2 API. They've
+been deprecated since 2006, and should not be used at all:
+
+* ``Tag.fetchNextSiblings``
+* ``Tag.fetchPreviousSiblings``
+* ``Tag.fetchPrevious``
+* ``Tag.fetchParents``
+* ``Tag.findChild``
+* ``Tag.findChildren``
+
+
+Generators
+^^^^^^^^^^
+
+I gave the generators PEP 8-compliant names, and transformed them into
+properties:
+
+* ``childGenerator()`` -> ``children``
+* ``nextGenerator()`` -> ``next_elements``
+* ``nextSiblingGenerator()`` -> ``next_siblings``
+* ``previousGenerator()`` -> ``previous_elements``
+* ``previousSiblingGenerator()`` -> ``previous_siblings``
+* ``recursiveChildGenerator()`` -> ``descendants``
+* ``parentGenerator()`` -> ``parents``
+
+So instead of this::
+
+ for parent in tag.parentGenerator():
+     ...
+
+You can write this::
+
+ for parent in tag.parents:
+     ...
+
+(But the old code will still work.)
+
+Some of the generators used to yield ``None`` after they were done, and
+then stop. That was a bug. Now the generators just stop.
+
+There are two new generators, :ref:`.strings and
+.stripped_strings <string-generators>`. ``.strings`` yields
+NavigableString objects, and ``.stripped_strings`` yields Python
+strings that have had whitespace stripped.
+
+XML
+^^^
+
+There is no longer a ``BeautifulStoneSoup`` class for parsing XML. To
+parse XML you pass in "xml" as the second argument to the
+``BeautifulSoup`` constructor. For the same reason, the
+``BeautifulSoup`` constructor no longer recognizes the ``isHTML``
+argument.
+
+Beautiful Soup's handling of empty-element XML tags has been
+improved. Previously when you parsed XML you had to explicitly say
+which tags were considered empty-element tags. The ``selfClosingTags``
+argument to the constructor is no longer recognized. Instead,
+Beautiful Soup considers any empty tag to be an empty-element tag.
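+For example (a quick sketch; parsing as XML requires lxml)::
+
+ soup = BeautifulSoup("<p></p>", "xml")
+ print(soup.p)
+ # <p/>
+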
If +you add a child to an empty-element tag, it stops being an +empty-element tag. + +Entities +^^^^^^^^ + +An incoming HTML or XML entity is always converted into the +corresponding Unicode character. Beautiful Soup 3 had a number of +overlapping ways of dealing with entities, which have been +removed. The ``BeautifulSoup`` constructor no longer recognizes the +``smartQuotesTo`` or ``convertEntities`` arguments. (`Unicode, +Dammit`_ still has ``smart_quotes_to``, but its default is now to turn +smart quotes into Unicode.) The constants ``HTML_ENTITIES``, +``XML_ENTITIES``, and ``XHTML_ENTITIES`` have been removed, since they +configure a feature (transforming some but not all entities into +Unicode characters) that no longer exists. + +If you want to turn Unicode characters back into HTML entities on +output, rather than turning them into UTF-8 characters, you need to +use an :ref:`output formatter <output_formatters>`. + +Miscellaneous +^^^^^^^^^^^^^ + +:ref:`Tag.string <.string>` now operates recursively. If tag A +contains a single tag B and nothing else, then A.string is the same as +B.string. (Previously, it was None.) + +`Multi-valued attributes`_ like ``class`` have lists of strings as +their values, not strings. This may affect the way you search by CSS +class. + +``Tag`` objects now implement the ``__hash__`` method, such that two +``Tag`` objects are considered equal if they generate the same +markup. This may change your script's behavior if you put ``Tag`` +objects into a dictionary or set. + +If you pass one of the ``find*`` methods both :ref:`string <string>` `and` +a tag-specific argument like :ref:`name <name>`, Beautiful Soup will +search for tags that match your tag-specific criteria and whose +:ref:`Tag.string <.string>` matches your value for :ref:`string +<string>`. It will `not` find the strings themselves. Previously, +Beautiful Soup ignored the tag-specific arguments and looked for +strings. + +The ``BeautifulSoup`` constructor no longer recognizes the +`markupMassage` argument. It's now the parser's responsibility to +handle markup correctly. + +The rarely-used alternate parser classes like +``ICantBelieveItsBeautifulSoup`` and ``BeautifulSOAP`` have been +removed. It's now the parser's decision how to handle ambiguous +markup. + +The ``prettify()`` method now returns a Unicode string, not a bytestring. diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/__init__.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/__init__.py new file mode 100644 index 00000000000..74cab3f7294 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/__init__.py @@ -0,0 +1,791 @@ +"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". + +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup uses a pluggable XML or HTML parser to parse a +(possibly invalid) document into a tree representation. Beautiful Soup +provides methods and Pythonic idioms that make it easy to navigate, +search, and modify the parse tree. + +Beautiful Soup works with Python 2.7 and up. It works better if lxml +and/or html5lib is installed. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ +""" + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.9.3" +__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson" +# Use of this source code is governed by the MIT license. 
+__license__ = "MIT" + +__all__ = ['BeautifulSoup'] + +from collections import Counter +import os +import re +import sys +import traceback +import warnings + +from .builder import builder_registry, ParserRejectedMarkup +from .dammit import UnicodeDammit +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + PYTHON_SPECIFIC_ENCODINGS, + ResultSet, + Script, + Stylesheet, + SoupStrainer, + Tag, + TemplateString, + ) + +# The very first thing we do is give a useful error if someone is +# running this code under Python 3 without converting it. +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' + +# Define some custom warnings. +class GuessedAtParserWarning(UserWarning): + """The warning issued when BeautifulSoup has to guess what parser to + use -- probably because no parser was specified in the constructor. + """ + +class MarkupResemblesLocatorWarning(UserWarning): + """The warning issued when BeautifulSoup is given 'markup' that + actually looks like a resource locator -- a URL or a path to a file + on disk. + """ + + +class BeautifulSoup(Tag): + """A data structure representing a parsed HTML or XML document. + + Most of the methods you'll call on a BeautifulSoup object are inherited from + PageElement or Tag. + + Internally, this class defines the basic interface called by the + tree builders when converting an HTML/XML document into a data + structure. The interface abstracts away the differences between + parsers. To write a new tree builder, you'll need to understand + these methods as a whole. + + These methods will be called by the BeautifulSoup constructor: + * reset() + * feed(markup) + + The tree builder may call these methods from its feed() implementation: + * handle_starttag(name, attrs) # See note about return value + * handle_endtag(name) + * handle_data(data) # Appends to the current data node + * endData(containerClass) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's <br> tag), call handle_starttag and then + handle_endtag. + """ + + # Since BeautifulSoup subclasses Tag, it's possible to treat it as + # a Tag with a .name. This name makes it clear the BeautifulSoup + # object isn't a real markup tag. + ROOT_TAG_NAME = '[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html', 'fast'] + + # A string containing all ASCII whitespace characters, used in + # endData() to detect data chunks that seem 'empty'. + ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. 
To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" + + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None, exclude_encodings=None, + element_classes=None, **kwargs): + """Constructor. + + :param markup: A string or a file-like object representing + markup to be parsed. + + :param features: Desirable features of the parser to be + used. This may be the name of a specific parser ("lxml", + "lxml-xml", "html.parser", or "html5lib") or it may be the + type of markup to be used ("html", "html5", "xml"). It's + recommended that you name a specific parser, so that + Beautiful Soup gives you the same results across platforms + and virtual environments. + + :param builder: A TreeBuilder subclass to instantiate (or + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. + + :param parse_only: A SoupStrainer. Only parts of the document + matching the SoupStrainer will be considered. This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. + + :param from_encoding: A string indicating the encoding of the + document to be parsed. Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. + + :param exclude_encodings: A list of strings indicating + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. + + :param element_classes: A dictionary mapping BeautifulSoup + classes like Tag and NavigableString, to other classes you'd + like to be instantiated instead as the parse tree is + built. This is useful for subclassing Tag or NavigableString + to modify default behavior. + + :param kwargs: For backwards compatibility purposes, the + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4; they will result in a warning and then be + ignored. + + Apart from this, any keyword arguments passed into the + BeautifulSoup constructor are propagated to the TreeBuilder + constructor. This makes it possible to configure a + TreeBuilder by passing in arguments, not just by saying which + one to use. + """ + if 'convertEntities' in kwargs: + del kwargs['convertEntities'] + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. 
Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if from_encoding and isinstance(markup, str): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + + self.element_classes = element_classes or dict() + + # We need this information to track whether or not the builder + # was specified well enough that we can omit the 'you need to + # specify a parser' warning. + original_builder = builder + original_features = features + + if isinstance(builder, type): + # A builder class was passed in; it needs to be instantiated. + builder_class = builder + builder = None + elif builder is None: + if isinstance(features, str): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise FeatureNotFound( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" + % ",".join(features)) + + # At this point either we have a TreeBuilder instance in + # builder, or we have a builder_class that we can instantiate + # with the remaining **kwargs. + if builder is None: + builder = builder_class(**kwargs) + if not original_builder and not ( + original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES + ) and markup: + # The user did not tell us which TreeBuilder to use, + # and we had to guess. Issue a warning. + if builder.is_xml: + markup_type = "XML" + else: + markup_type = "HTML" + + # This code adapted from warnings.py so that we get the same line + # of code as our warnings.warn() call gets, even if the answer is wrong + # (as it may be in a multithreading situation). + caller = None + try: + caller = sys._getframe(1) + except ValueError: + pass + if caller: + globals = caller.f_globals + line_number = caller.f_lineno + else: + globals = sys.__dict__ + line_number= 1 + filename = globals.get('__file__') + if filename: + fnl = filename.lower() + if fnl.endswith((".pyc", ".pyo")): + filename = filename[:-1] + if filename: + # If there is no filename at all, the user is most likely in a REPL, + # and the warning is not necessary. + values = dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type + ) + warnings.warn( + self.NO_PARSER_SPECIFIED_WARNING % values, + GuessedAtParserWarning, stacklevel=2 + ) + else: + if kwargs: + warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") + + self.builder = builder + self.is_xml = builder.is_xml + self.known_xml = self.is_xml + self._namespaces = dict() + self.parse_only = parse_only + + self.builder.initialize_soup(self) + + if hasattr(markup, 'read'): # It's a file-type object. 
+            markup = markup.read()
+        elif len(markup) <= 256 and (
+                (isinstance(markup, bytes) and b'<' not in markup)
+                or (isinstance(markup, str) and '<' not in markup)
+        ):
+            # Print out warnings for a couple of beginner problems
+            # involving passing non-markup to Beautiful Soup.
+            # Beautiful Soup will still parse the input as markup,
+            # just in case that's what the user really wants.
+            if (isinstance(markup, str)
+                and not os.path.supports_unicode_filenames):
+                possible_filename = markup.encode("utf8")
+            else:
+                possible_filename = markup
+            is_file = False
+            try:
+                is_file = os.path.exists(possible_filename)
+            except Exception as e:
+                # This is almost certainly a problem involving
+                # characters not valid in filenames on this
+                # system. Just let it go.
+                pass
+            if is_file:
+                warnings.warn(
+                    '"%s" looks like a filename, not markup. You should'
+                    ' probably open this file and pass the filehandle into'
+                    ' Beautiful Soup.' % self._decode_markup(markup),
+                    MarkupResemblesLocatorWarning
+                )
+            self._check_markup_is_url(markup)
+
+        rejections = []
+        success = False
+        for (self.markup, self.original_encoding, self.declared_html_encoding,
+         self.contains_replacement_characters) in (
+             self.builder.prepare_markup(
+                 markup, from_encoding, exclude_encodings=exclude_encodings)):
+            self.reset()
+            try:
+                self._feed()
+                success = True
+                break
+            except ParserRejectedMarkup as e:
+                rejections.append(e)
+                pass
+
+        if not success:
+            other_exceptions = [str(e) for e in rejections]
+            raise ParserRejectedMarkup(
+                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
+            )
+
+        # Clear out the markup and remove the builder's circular
+        # reference to this object.
+        self.markup = None
+        self.builder.soup = None
+
+    def __copy__(self):
+        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
+        copy = type(self)(
+            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+        )
+
+        # Although we encoded the tree to UTF-8, that may not have
+        # been the encoding of the original markup. Set the copy's
+        # .original_encoding to reflect the original object's
+        # .original_encoding.
+        copy.original_encoding = self.original_encoding
+        return copy
+
+    def __getstate__(self):
+        # Frequently a tree builder can't be pickled.
+        d = dict(self.__dict__)
+        if 'builder' in d and not self.builder.picklable:
+            d['builder'] = None
+        return d
+
+    @classmethod
+    def _decode_markup(cls, markup):
+        """Ensure `markup` is a Unicode string, so it's safe to pass into
+        warnings.warn.
+
+        TODO: warnings.warn had this problem back in 2010 but it might not
+        anymore.
+        """
+        if isinstance(markup, bytes):
+            decoded = markup.decode('utf-8', 'replace')
+        else:
+            decoded = markup
+        return decoded
+
+    @classmethod
+    def _check_markup_is_url(cls, markup):
+        """Error-handling method to issue a warning if incoming markup looks
+        like a URL.
+
+        :param markup: A string.
+        """
+        if isinstance(markup, bytes):
+            space = b' '
+            cant_start_with = (b"http:", b"https:")
+        elif isinstance(markup, str):
+            space = ' '
+            cant_start_with = ("http:", "https:")
+        else:
+            return
+
+        if any(markup.startswith(prefix) for prefix in cant_start_with):
+            if space not in markup:
+                warnings.warn(
+                    '"%s" looks like a URL. Beautiful Soup is not an'
+                    ' HTTP client. You should probably use an HTTP client like'
+                    ' requests to get the document behind the URL, and feed'
+                    ' that document to Beautiful Soup.'
% cls._decode_markup(
+                        markup
+                    ),
+                    MarkupResemblesLocatorWarning
+                )
+
+    def _feed(self):
+        """Internal method that parses previously set markup, creating a large
+        number of Tag and NavigableString objects.
+        """
+        # Convert the document to Unicode.
+        self.builder.reset()
+
+        self.builder.feed(self.markup)
+        # Close out any unfinished strings and close all the open tags.
+        self.endData()
+        while self.currentTag.name != self.ROOT_TAG_NAME:
+            self.popTag()
+
+    def reset(self):
+        """Reset this object to a state as though it had never parsed any
+        markup.
+        """
+        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
+        self.hidden = 1
+        self.builder.reset()
+        self.current_data = []
+        self.currentTag = None
+        self.tagStack = []
+        self.open_tag_counter = Counter()
+        self.preserve_whitespace_tag_stack = []
+        self.string_container_stack = []
+        self.pushTag(self)
+
+    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
+                sourceline=None, sourcepos=None, **kwattrs):
+        """Create a new Tag associated with this BeautifulSoup object.
+
+        :param name: The name of the new Tag.
+        :param namespace: The URI of the new Tag's XML namespace, if any.
+        :param nsprefix: The prefix for the new Tag's XML namespace, if any.
+        :param attrs: A dictionary of this Tag's attribute values; can
+            be used instead of `kwattrs` for attributes like 'class'
+            that are reserved words in Python.
+        :param sourceline: The line number where this tag was
+            (purportedly) found in its source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was (purportedly) found.
+        :param kwattrs: Keyword arguments for the new Tag's attribute values.
+
+        """
+        kwattrs.update(attrs)
+        return self.element_classes.get(Tag, Tag)(
+            None, self.builder, name, namespace, nsprefix, kwattrs,
+            sourceline=sourceline, sourcepos=sourcepos
+        )
+
+    def string_container(self, base_class=None):
+        container = base_class or NavigableString
+
+        # There may be a general override of NavigableString.
+        container = self.element_classes.get(
+            container, container
+        )
+
+        # On top of that, we may be inside a tag that needs a special
+        # container class.
+        if self.string_container_stack:
+            container = self.builder.string_containers.get(
+                self.string_container_stack[-1].name, container
+            )
+        return container
+
+    def new_string(self, s, subclass=None):
+        """Create a new NavigableString associated with this BeautifulSoup
+        object.
+        """
+        container = self.string_container(subclass)
+        return container(s)
+
+    def insert_before(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+        """
+        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
+
+    def insert_after(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+ """ + raise NotImplementedError("BeautifulSoup objects don't support insert_after().") + + def popTag(self): + """Internal method called by _popToTag when a tag is closed.""" + tag = self.tagStack.pop() + if tag.name in self.open_tag_counter: + self.open_tag_counter[tag.name] -= 1 + if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: + self.preserve_whitespace_tag_stack.pop() + if self.string_container_stack and tag == self.string_container_stack[-1]: + self.string_container_stack.pop() + #print("Pop", tag.name) + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + """Internal method called by handle_starttag when a tag is opened.""" + #print("Push", tag.name) + if self.currentTag is not None: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + if tag.name != self.ROOT_TAG_NAME: + self.open_tag_counter[tag.name] += 1 + if tag.name in self.builder.preserve_whitespace_tags: + self.preserve_whitespace_tag_stack.append(tag) + if tag.name in self.builder.string_containers: + self.string_container_stack.append(tag) + + def endData(self, containerClass=None): + """Method called by the TreeBuilder when the end of a data segment + occurs. + """ + containerClass = self.string_container(containerClass) + + if self.current_data: + current_data = ''.join(self.current_data) + # If whitespace is not preserved, and this string contains + # nothing but ASCII spaces, replace it with a single space + # or newline. + if not self.preserve_whitespace_tag_stack: + strippable = True + for i in current_data: + if i not in self.ASCII_SPACES: + strippable = False + break + if strippable: + if '\n' in current_data: + current_data = '\n' + else: + current_data = ' ' + + # Reset the data collector. + self.current_data = [] + + # Should we add this string to the tree at all? + if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(current_data)): + return + + o = containerClass(current_data) + self.object_was_parsed(o) + + def object_was_parsed(self, o, parent=None, most_recent_element=None): + """Method called by the TreeBuilder to integrate an object into the parse tree.""" + if parent is None: + parent = self.currentTag + if most_recent_element is not None: + previous_element = most_recent_element + else: + previous_element = self._most_recent_element + + next_element = previous_sibling = next_sibling = None + if isinstance(o, Tag): + next_element = o.next_element + next_sibling = o.next_sibling + previous_sibling = o.previous_sibling + if previous_element is None: + previous_element = o.previous_element + + fix = parent.next_element is not None + + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) + + self._most_recent_element = o + parent.contents.append(o) + + # Check if we are inserting into an already parsed node. + if fix: + self._linkage_fixer(parent) + + def _linkage_fixer(self, el): + """Make sure linkage of this fragment is sound.""" + + first = el.contents[0] + child = el.contents[-1] + descendant = child + + if child is first and el.parent is not None: + # Parent should be linked to first child + el.next_element = child + # We are no longer linked to whatever this element is + prev_el = child.previous_element + if prev_el is not None and prev_el is not el: + prev_el.next_element = None + # First child should be linked to the parent, and no previous siblings. 
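+            # For a fragment like <el><child/></el>, the linkage
+            # established here and just above amounts to:
+            #   el.next_element        -> child
+            #   child.previous_element -> el
+            #   child.previous_sibling -> None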
+            child.previous_element = el
+            child.previous_sibling = None
+
+        # We have no sibling as we've been appended as the last.
+        child.next_sibling = None
+
+        # This index is a tag, dig deeper for a "last descendant"
+        if isinstance(child, Tag) and child.contents:
+            descendant = child._last_descendant(False)
+
+        # As the final step, link last descendant. It should be linked
+        # to the parent's next sibling (if found), else walk up the chain
+        # and find a parent with a sibling. It should have no next sibling.
+        descendant.next_element = None
+        descendant.next_sibling = None
+        target = el
+        while True:
+            if target is None:
+                break
+            elif target.next_sibling is not None:
+                descendant.next_element = target.next_sibling
+                target.next_sibling.previous_element = child
+                break
+            target = target.parent
+
+    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
+        """Pops the tag stack up to and including the most recent
+        instance of the given tag.
+
+        If there are no open tags with the given name, nothing will be
+        popped.
+
+        :param name: Pop up to the most recent tag with this name.
+        :param nsprefix: The namespace prefix that goes with `name`.
+        :param inclusivePop: If this is false, pops the tag stack up
+          to but *not* including the most recent instance of the
+          given tag.
+
+        """
+        #print("Popping to %s" % name)
+        if name == self.ROOT_TAG_NAME:
+            # The BeautifulSoup object itself can never be popped.
+            return
+
+        most_recently_popped = None
+
+        stack_size = len(self.tagStack)
+        for i in range(stack_size - 1, 0, -1):
+            if not self.open_tag_counter.get(name):
+                break
+            t = self.tagStack[i]
+            if (name == t.name and nsprefix == t.prefix):
+                if inclusivePop:
+                    most_recently_popped = self.popTag()
+                break
+            most_recently_popped = self.popTag()
+
+        return most_recently_popped
+
+    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
+                        sourcepos=None):
+        """Called by the tree builder when a new tag is encountered.
+
+        :param name: Name of the tag.
+        :param namespace: The namespace URI of the tag, if any.
+        :param nsprefix: Namespace prefix for the tag.
+        :param attrs: A dictionary of attribute values.
+        :param sourceline: The line number where this tag was found in its
+            source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was found.
+
+        If this method returns None, the tag was rejected by an active
+        SoupStrainer. You should proceed as if the tag had not occurred
+        in the document. For instance, if this was a self-closing tag,
+        don't call handle_endtag.
+        """
+        # print("Start tag %s: %s" % (name, attrs))
+        self.endData()
+
+        if (self.parse_only and len(self.tagStack) <= 1
+            and (self.parse_only.text
+                 or not self.parse_only.search_tag(name, attrs))):
+            return None
+
+        tag = self.element_classes.get(Tag, Tag)(
+            self, self.builder, name, namespace, nsprefix, attrs,
+            self.currentTag, self._most_recent_element,
+            sourceline=sourceline, sourcepos=sourcepos
+        )
+        if tag is None:
+            return tag
+        if self._most_recent_element is not None:
+            self._most_recent_element.next_element = tag
+        self._most_recent_element = tag
+        self.pushTag(tag)
+        return tag
+
+    def handle_endtag(self, name, nsprefix=None):
+        """Called by the tree builder when an ending tag is encountered.
+
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
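+
+        For example, markup like '</p>' results in the call
+        handle_endtag('p').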
+ """ + #print("End tag: " + name) + self.endData() + self._popToTag(name, nsprefix) + + def handle_data(self, data): + """Called by the tree builder when a chunk of textual data is encountered.""" + self.current_data.append(data) + + def decode(self, pretty_print=False, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a string or Unicode representation of the parse tree + as an HTML or XML document. + + :param pretty_print: If this is True, indentation will be used to + make the document more readable. + :param eventual_encoding: The encoding of the final document. + If this is None, the document will be a Unicode string. + """ + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS: + # This is a special Python encoding; it can't actually + # go into an XML document because it means nothing + # outside of Python. + eventual_encoding = None + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = '<?xml version="1.0"%s?>\n' % encoding_part + else: + prefix = '' + if not pretty_print: + indent_level = None + else: + indent_level = 0 + return prefix + super(BeautifulSoup, self).decode( + indent_level, eventual_encoding, formatter) + +# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup' +_s = BeautifulSoup +_soup = BeautifulSoup + +class BeautifulStoneSoup(BeautifulSoup): + """Deprecated interface to an XML parser.""" + + def __init__(self, *args, **kwargs): + kwargs['features'] = 'xml' + warnings.warn( + 'The BeautifulStoneSoup class is deprecated. Instead of using ' + 'it, pass features="xml" into the BeautifulSoup constructor.') + super(BeautifulStoneSoup, self).__init__(*args, **kwargs) + + +class StopParsing(Exception): + """Exception raised by a TreeBuilder if it's unable to continue parsing.""" + pass + +class FeatureNotFound(ValueError): + """Exception raised by the BeautifulSoup constructor if no parser with the + requested features is found. + """ + pass + + +#If this file is run as a script, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print((soup.prettify())) diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/__init__.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/__init__.py new file mode 100644 index 00000000000..03fbd6a8114 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/__init__.py @@ -0,0 +1,519 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +from collections import defaultdict +import itertools +import sys +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + Stylesheet, + Script, + TemplateString, + nonwhitespace_re +) + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + 'TreeBuilderRegistry', + ] + +# Some useful features for a TreeBuilder to have. +FAST = 'fast' +PERMISSIVE = 'permissive' +STRICT = 'strict' +XML = 'xml' +HTML = 'html' +HTML_5 = 'html5' + + +class TreeBuilderRegistry(object): + """A way of looking up TreeBuilder subclasses by their name or by desired + features. + """ + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features. 
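+
+        For example, after builder_registry.register(MyBuilder), a
+        subsequent builder_registry.lookup() can return MyBuilder for
+        any feature listed in MyBuilder.features.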
+
+        :param treebuilder_class: A subclass of TreeBuilder. Its .features
+           attribute should list its features.
+        """
+        for feature in treebuilder_class.features:
+            self.builders_for_feature[feature].insert(0, treebuilder_class)
+        self.builders.insert(0, treebuilder_class)
+
+    def lookup(self, *features):
+        """Look up a TreeBuilder subclass with the desired features.
+
+        :param features: A list of features to look for. If none are
+            provided, the most recently registered TreeBuilder subclass
+            will be used.
+        :return: A TreeBuilder subclass, or None if there's no
+            registered subclass with all the requested features.
+        """
+        if len(self.builders) == 0:
+            # There are no builders at all.
+            return None
+
+        if len(features) == 0:
+            # They didn't ask for any features. Give them the most
+            # recently registered builder.
+            return self.builders[0]
+
+        # Go down the list of features in order, and eliminate any builders
+        # that don't match every feature.
+        features = list(features)
+        features.reverse()
+        candidates = None
+        candidate_set = None
+        while len(features) > 0:
+            feature = features.pop()
+            we_have_the_feature = self.builders_for_feature.get(feature, [])
+            if len(we_have_the_feature) > 0:
+                if candidates is None:
+                    candidates = we_have_the_feature
+                    candidate_set = set(candidates)
+                else:
+                    # Eliminate any candidates that don't have this feature.
+                    candidate_set = candidate_set.intersection(
+                        set(we_have_the_feature))
+
+        # The only valid candidates are the ones in candidate_set.
+        # Go through the original list of candidates and pick the first one
+        # that's in candidate_set.
+        if candidate_set is None:
+            return None
+        for candidate in candidates:
+            if candidate in candidate_set:
+                return candidate
+        return None
+
+# The BeautifulSoup class will take feature lists from developers and use them
+# to look up builders in this registry.
+builder_registry = TreeBuilderRegistry()
+
+class TreeBuilder(object):
+    """Turn a textual document into a Beautiful Soup object tree."""
+
+    NAME = "[Unknown tree builder]"
+    ALTERNATE_NAMES = []
+    features = []
+
+    is_xml = False
+    picklable = False
+    empty_element_tags = None # A tag will be considered an empty-element
+                              # tag when and only when it has no contents.
+
+    # A value for these tag/attribute combinations is a space- or
+    # comma-separated list of CDATA, rather than a single CDATA.
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+
+    # Whitespace should be preserved inside these tags.
+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+    # The textual contents of tags with these names should be
+    # instantiated with some class other than NavigableString.
+    DEFAULT_STRING_CONTAINERS = {}
+
+    USE_DEFAULT = object()
+
+    # Most parsers don't keep track of line numbers.
+    TRACKS_LINE_NUMBERS = False
+
+    def __init__(self, multi_valued_attributes=USE_DEFAULT,
+                 preserve_whitespace_tags=USE_DEFAULT,
+                 store_line_numbers=USE_DEFAULT,
+                 string_containers=USE_DEFAULT,
+                 ):
+        """Constructor.
+
+        :param multi_valued_attributes: If this is set to None, the
+           TreeBuilder will not turn any values for attributes like
+           'class' into lists. Setting this to a dictionary will
+           customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+           for an example.
+
+           Internally, these are called "CDATA list attributes", but that
+           probably doesn't make sense to an end-user, so the argument name
+           is `multi_valued_attributes`.
+
+        :param preserve_whitespace_tags: A list of tags to treat
+           the way <pre> tags are treated in HTML.
Tags in this list + are immune from pretty-printing; their contents will always be + output as-is. + + :param string_containers: A dictionary mapping tag names to + the classes that should be instantiated to contain the textual + contents of those tags. The default is to use NavigableString + for every tag, no matter what the name. You can override the + default by changing DEFAULT_STRING_CONTAINERS. + + :param store_line_numbers: If the parser keeps track of the + line numbers and positions of the original markup, that + information will, by default, be stored in each corresponding + `Tag` object. You can turn this off by passing + store_line_numbers=False. If the parser you're using doesn't + keep track of this information, then setting store_line_numbers=True + will do nothing. + """ + self.soup = None + if multi_valued_attributes is self.USE_DEFAULT: + multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES + self.cdata_list_attributes = multi_valued_attributes + if preserve_whitespace_tags is self.USE_DEFAULT: + preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS + self.preserve_whitespace_tags = preserve_whitespace_tags + if store_line_numbers == self.USE_DEFAULT: + store_line_numbers = self.TRACKS_LINE_NUMBERS + self.store_line_numbers = store_line_numbers + if string_containers == self.USE_DEFAULT: + string_containers = self.DEFAULT_STRING_CONTAINERS + self.string_containers = string_containers + + def initialize_soup(self, soup): + """The BeautifulSoup object has been initialized and is now + being associated with the TreeBuilder. + + :param soup: A BeautifulSoup object. + """ + self.soup = soup + + def reset(self): + """Do any work necessary to reset the underlying parser + for a new document. + + By default, this does nothing. + """ + pass + + def can_be_empty_element(self, tag_name): + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTMLBuilder does not consider a <p> tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty <p> tag + will be presented as "<p></p>", not "<p/>" or "<p>". + + The default implementation has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no children. + "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will + be left alone. + + :param tag_name: The name of a markup tag. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + + def feed(self, markup): + """Run some incoming markup through some parsing process, + populating the `BeautifulSoup` object in self.soup. + + This method is not implemented in TreeBuilder; it must be + implemented in subclasses. + + :return: None. + """ + raise NotImplementedError() + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None, exclude_encodings=None): + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: Some markup -- probably a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. 
+ + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. + + By default, the only strategy is to parse the markup + as-is. See `LXMLTreeBuilderForXML` and + `HTMLParserTreeBuilder` for implementations that take into + account the quirks of particular parsers. + """ + yield markup, None, None, False + + def test_fragment_to_document(self, fragment): + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty <head> tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of tests. + + :param fragment: A string -- fragment of HTML. + :return: A string -- a full HTML document. + """ + return fragment + + def set_up_substitutions(self, tag): + """Set up any substitutions that will need to be performed on + a `Tag` when it's output as a string. + + By default, this does nothing. See `HTMLTreeBuilder` for a + case where this is used. + + :param tag: A `Tag` + :return: Whether or not a substitution was performed. + """ + return False + + def _replace_cdata_list_attribute_values(self, tag_name, attrs): + """When an attribute value is associated with a tag that can + have multiple values for that attribute, convert the string + value to a list of strings. + + Basically, replaces class="foo bar" with class=["foo", "bar"] + + NOTE: This method modifies its input in place. + + :param tag_name: The name of a tag. + :param attrs: A dictionary containing the tag's attributes. + Any appropriate attribute values will be modified in place. + """ + if not attrs: + return attrs + if self.cdata_list_attributes: + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), None) + for attr in list(attrs.keys()): + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. + value = attrs[attr] + if isinstance(value, str): + values = nonwhitespace_re.findall(value) + else: + # html5lib sometimes calls setAttributes twice + # for the same tag when rearranging the parse + # tree. On the second call the attribute value + # here is already a list. If this happens, + # leave the value alone rather than trying to + # split it again. + values = value + attrs[attr] = values + return attrs + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events. + + This is not currently used for anything, but it demonstrates + how a simple TreeBuilder would work. + """ + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in list(attrs.items())) + #print("Start %s, %r" % (name, attrs)) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print("End %s" % name) + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. 
+        self.endElement(nodeName)
+        #handler.endElementNS((ns, node.nodeName), node.nodeName)
+
+    def startPrefixMapping(self, prefix, nodeValue):
+        # Ignore the prefix for now.
+        pass
+
+    def endPrefixMapping(self, prefix):
+        # Ignore the prefix for now.
+        # handler.endPrefixMapping(prefix)
+        pass
+
+    def characters(self, content):
+        self.soup.handle_data(content)
+
+    def startDocument(self):
+        pass
+
+    def endDocument(self):
+        pass
+
+
+class HTMLTreeBuilder(TreeBuilder):
+    """This TreeBuilder knows facts about HTML.
+
+    Such as which tags are empty-element tags.
+    """
+
+    empty_element_tags = set([
+        # These are from HTML5.
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+
+        # These are from earlier versions of HTML and are removed in HTML5.
+        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
+    ])
+
+    # The HTML standard defines these as block-level elements. Beautiful
+    # Soup does not treat these elements differently from other elements,
+    # but it may do so eventually, and this information is available if
+    # you need to use it.
+    block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
+    # The HTML standard defines an unusual content model for these tags.
+    # We represent this by using a string class other than NavigableString
+    # inside these tags.
+    #
+    # I made this list by going through the HTML spec
+    # (https://html.spec.whatwg.org/#metadata-content) and looking for
+    # "metadata content" elements that can contain strings.
+    #
+    # TODO: Arguably <noscript> could go here but it seems
+    # qualitatively different from the other tags.
+    DEFAULT_STRING_CONTAINERS = {
+        'style': Stylesheet,
+        'script': Script,
+        'template': TemplateString,
+    }
+
+    # The HTML standard defines these attributes as containing a
+    # space-separated list of values, not a single value. That is,
+    # class="foo bar" means that the 'class' attribute has two values,
+    # 'foo' and 'bar', not the single value 'foo bar'. When we
+    # encounter one of these attributes, we will parse its value into
+    # a list of values if possible. Upon output, the list will be
+    # converted back into a string.
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {
+        "*" : ['class', 'accesskey', 'dropzone'],
+        "a" : ['rel', 'rev'],
+        "link" : ['rel', 'rev'],
+        "td" : ["headers"],
+        "th" : ["headers"],
+        "form" : ["accept-charset"],
+        "object" : ["archive"],
+
+        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
+        "area" : ["rel"],
+        "icon" : ["sizes"],
+        "iframe" : ["sandbox"],
+        "output" : ["for"],
+    }
+
+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
+    def set_up_substitutions(self, tag):
+        """Replace the declared encoding in a <meta> tag with a placeholder,
+        to be substituted when the tag is output to a string.
+
+        An HTML document may come in to Beautiful Soup as one
+        encoding, but exit in a different encoding, and the <meta> tag
+        needs to be changed to reflect this.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
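+
+        For example, a tag parsed from '<meta charset="utf8">' has its
+        'charset' attribute replaced with a CharsetMetaAttributeValue
+        standin, so that a document re-encoded to another encoding will
+        declare that encoding instead.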
+ """ + # We are only interested in <meta> tags + if tag.name != 'meta': + return False + + http_equiv = tag.get('http-equiv') + content = tag.get('content') + charset = tag.get('charset') + + # We are interested in <meta> tags that say what encoding the + # document was originally in. This means HTML 5-style <meta> + # tags that provide the "charset" attribute. It also means + # HTML 4-style <meta> tags that provide the "content" + # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. + meta_encoding = None + if charset is not None: + # HTML 5 style: + # <meta charset="utf8"> + meta_encoding = charset + tag['charset'] = CharsetMetaAttributeValue(charset) + + elif (content is not None and http_equiv is not None + and http_equiv.lower() == 'content-type'): + # HTML 4 style: + # <meta http-equiv="content-type" content="text/html; charset=utf8"> + tag['content'] = ContentMetaAttributeValue(content) + + return (meta_encoding is not None) + +def register_treebuilders_from(module): + """Copy TreeBuilders from the given module into this module.""" + this_module = sys.modules[__name__] + for name in module.__all__: + obj = getattr(module, name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. + this_module.builder_registry.register(obj) + +class ParserRejectedMarkup(Exception): + """An Exception to be raised when the underlying parser simply + refuses to parse the given markup. + """ + def __init__(self, message_or_exception): + """Explain why the parser rejected the given markup, either + with a textual explanation or another exception. + """ + if isinstance(message_or_exception, Exception): + e = message_or_exception + message_or_exception = "%s: %s" % (e.__class__.__name__, str(e)) + super(ParserRejectedMarkup, self).__init__(message_or_exception) + +# Builders are registered in reverse order of priority, so that custom +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last resort. +from . import _htmlparser +register_treebuilders_from(_htmlparser) +try: + from . import _html5lib + register_treebuilders_from(_html5lib) +except ImportError: + # They don't have html5lib installed. + pass +try: + from . import _lxml + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. + pass diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/_html5lib.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/_html5lib.py new file mode 100644 index 00000000000..69aefd728d2 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/_html5lib.py @@ -0,0 +1,467 @@ +# Use of this source code is governed by the MIT license. 
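+#
+# This module plugs the html5lib parser into Beautiful Soup. It is
+# normally selected by passing features="html5lib" to the BeautifulSoup
+# constructor, e.g. (a minimal sketch; requires the html5lib package):
+#
+#   from bs4 import BeautifulSoup
+#   soup = BeautifulSoup("<p>Some <b>bad<i>markup", "html5lib")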
+__license__ = "MIT"
+
+__all__ = [
+    'HTML5TreeBuilder',
+    ]
+
+import warnings
+import re
+from bs4.builder import (
+    PERMISSIVE,
+    HTML,
+    HTML_5,
+    HTMLTreeBuilder,
+    )
+from bs4.element import (
+    NamespacedAttribute,
+    nonwhitespace_re,
+)
+import html5lib
+from html5lib.constants import (
+    namespaces,
+    prefixes,
+    )
+from bs4.element import (
+    Comment,
+    Doctype,
+    NavigableString,
+    Tag,
+    )
+
+try:
+    # Pre-0.99999999
+    from html5lib.treebuilders import _base as treebuilder_base
+    new_html5lib = False
+except ImportError as e:
+    # 0.99999999 and up
+    from html5lib.treebuilders import base as treebuilder_base
+    new_html5lib = True
+
+class HTML5TreeBuilder(HTMLTreeBuilder):
+    """Use html5lib to build a tree.
+
+    Note that this TreeBuilder does not support some features common
+    to HTML TreeBuilders. Some of these features could theoretically
+    be implemented, but at the very least it's quite difficult,
+    because html5lib moves the parse tree around as it's being built.
+
+    * This TreeBuilder doesn't use different subclasses of NavigableString
+      based on the name of the tag in which the string was found.
+
+    * You can't use a SoupStrainer to parse only part of a document.
+    """
+
+    NAME = "html5lib"
+
+    features = [NAME, PERMISSIVE, HTML_5, HTML]
+
+    # html5lib can tell us which line number and position in the
+    # original file is the source of an element.
+    TRACKS_LINE_NUMBERS = True
+
+    def prepare_markup(self, markup, user_specified_encoding,
+                       document_declared_encoding=None, exclude_encodings=None):
+        # Store the user-specified encoding for use later on.
+        self.user_specified_encoding = user_specified_encoding
+
+        # document_declared_encoding and exclude_encodings aren't used
+        # ATM because the html5lib TreeBuilder doesn't use
+        # UnicodeDammit.
+        if exclude_encodings:
+            warnings.warn("You provided a value for exclude_encodings, but the html5lib tree builder doesn't support exclude_encodings.")
+        yield (markup, None, None, False)
+
+    # These methods are defined by Beautiful Soup.
+    def feed(self, markup):
+        if self.soup.parse_only is not None:
+            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
+        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
+        self.underlying_builder.parser = parser
+        extra_kwargs = dict()
+        if not isinstance(markup, str):
+            if new_html5lib:
+                extra_kwargs['override_encoding'] = self.user_specified_encoding
+            else:
+                extra_kwargs['encoding'] = self.user_specified_encoding
+        doc = parser.parse(markup, **extra_kwargs)
+
+        # Set the character encoding detected by the tokenizer.
+        if isinstance(markup, str):
+            # We need to special-case this because html5lib sets
+            # charEncoding to UTF-8 if it gets Unicode input.
+            doc.original_encoding = None
+        else:
+            original_encoding = parser.tokenizer.stream.charEncoding[0]
+            if not isinstance(original_encoding, str):
+                # In 0.99999999 and up, the encoding is an html5lib
+                # Encoding object. We want to use a string for compatibility
+                # with other tree builders.
+ original_encoding = original_encoding.name + doc.original_encoding = original_encoding + self.underlying_builder.parser = None + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + namespaceHTMLElements, self.soup, + store_line_numbers=self.store_line_numbers + ) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '<html><head></head><body>%s</body></html>' % fragment + + +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): + + def __init__(self, namespaceHTMLElements, soup=None, + store_line_numbers=True, **kwargs): + if soup: + self.soup = soup + else: + from bs4 import BeautifulSoup + # TODO: Why is the parser 'html.parser' here? To avoid an + # infinite loop? + self.soup = BeautifulSoup( + "", "html.parser", store_line_numbers=store_line_numbers, + **kwargs + ) + # TODO: What are **kwargs exactly? Should they be passed in + # here in addition to/instead of being passed to the BeautifulSoup + # constructor? + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + # This will be set later to an html5lib.html5parser.HTMLParser + # object, which we can use to track the current line number. + self.parser = None + self.store_line_numbers = store_line_numbers + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + kwargs = {} + if self.parser and self.store_line_numbers: + # This represents the point immediately after the end of the + # tag. We don't know when the tag started, but we do know + # where it ended -- the character just before this one. + sourceline, sourcepos = self.parser.tokenizer.stream.position() + kwargs['sourceline'] = sourceline + kwargs['sourcepos'] = sourcepos-1 + tag = self.soup.new_tag(name, namespace, **kwargs) + + return Element(tag, self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + from bs4 import BeautifulSoup + # TODO: Why is the parser 'html.parser' here? To avoid an + # infinite loop? + self.soup = BeautifulSoup("", "html.parser") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + # XXX This code is not covered by the BS4 tests. 
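+        # Append the node's underlying Beautiful Soup element directly
+        # to the soup object, i.e. to the root of the document.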
+ self.soup.append(node.element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return treebuilder_base.TreeBuilder.getFragment(self).element + + def testSerializer(self, element): + from bs4 import BeautifulSoup + rv = [] + doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') + + def serializeElement(element, indent=0): + if isinstance(element, BeautifulSoup): + pass + if isinstance(element, Doctype): + m = doctype_re.match(element) + if m: + name = m.group(1) + if m.lastindex > 1: + publicId = m.group(2) or "" + systemId = m.group(3) or m.group(4) or "" + rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" % + (' ' * indent, name, publicId, systemId)) + else: + rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name)) + else: + rv.append("|%s<!DOCTYPE >" % (' ' * indent,)) + elif isinstance(element, Comment): + rv.append("|%s<!-- %s -->" % (' ' * indent, element)) + elif isinstance(element, NavigableString): + rv.append("|%s\"%s\"" % (' ' * indent, element)) + else: + if element.namespace: + name = "%s %s" % (prefixes[element.namespace], + element.name) + else: + name = element.name + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] + for name, value in list(element.attrs.items()): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): + value = " ".join(value) + attributes.append((name, value)) + + for name, value in sorted(attributes): + rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) + indent += 2 + for child in element.children: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return list(self.attrs.items()).__iter__() + def __setitem__(self, name, value): + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. + list_attr = self.element.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + # A node that is being cloned may have already undergone + # this procedure. + if not isinstance(value, list): + value = nonwhitespace_re.findall(value) + self.element[name] = value + def items(self): + return list(self.attrs.items()) + def keys(self): + return list(self.attrs.keys()) + def __len__(self): + return len(self.attrs) + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in list(self.attrs.keys()) + + +class Element(treebuilder_base.Node): + def __init__(self, element, soup, namespace): + treebuilder_base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node): + string_child = child = None + if isinstance(node, str): + # Some other piece of code decided to pass in a string + # instead of creating a TextElement object to contain the + # string. + string_child = child = node + elif isinstance(node, Tag): + # Some other piece of code decided to pass in a Tag + # instead of creating an Element object to contain the + # Tag. 
+ child = node + elif node.element.__class__ == NavigableString: + string_child = child = node.element + node.parent = self + else: + child = node.element + node.parent = self + + if not isinstance(child, str) and child.parent is not None: + node.element.extract() + + if (string_child is not None and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # We are appending a string onto another string. + # TODO This has O(n^2) performance, for input like + # "a</a>a</a>a</a>..." + old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + string_child) + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: + if isinstance(node, str): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + + # Tell Beautiful Soup to act as if it parsed this element + # immediately after the parent's last descendant. (Or + # immediately after the parent, if it has no children.) + if self.element.contents: + most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. + most_recent_element = self.soup._last_descendant() + else: + most_recent_element = self.element + + self.soup.object_was_parsed( + child, parent=self.element, + most_recent_element=most_recent_element) + + def getAttributes(self): + if isinstance(self.element, Comment): + return {} + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: + converted_attributes = [] + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in list(attributes.items()): + self.element[name] = value + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. + # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. 
+ self.soup.builder.set_up_substitutions(self.element) + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self.element.index(refNode.element) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + node.element.extract() + + def reparentChildren(self, new_parent): + """Move all of this tag's children into another tag.""" + # print("MOVE", self.element.contents) + # print("FROM", self.element) + # print("TO", new_parent.element) + + element = self.element + new_parent_element = new_parent.element + # Determine what this tag's next_element will be once all the children + # are removed. + final_next_element = element.next_sibling + + new_parents_last_descendant = new_parent_element._last_descendant(False, False) + if len(new_parent_element.contents) > 0: + # The new parent already contains children. We will be + # appending this tag's children to the end. + new_parents_last_child = new_parent_element.contents[-1] + new_parents_last_descendant_next_element = new_parents_last_descendant.next_element + else: + # The new parent contains no children. + new_parents_last_child = None + new_parents_last_descendant_next_element = new_parent_element.next_element + + to_append = element.contents + if len(to_append) > 0: + # Set the first child's previous_element and previous_sibling + # to elements within the new parent + first_child = to_append[0] + if new_parents_last_descendant is not None: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element + first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant is not None: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child is not None: + new_parents_last_child.next_sibling = first_child + + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant(False, True) + + last_childs_last_descendant.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element is not None: + # TODO: This code has no test coverage and I'm not sure + # how to get html5lib to go through this path, but it's + # just the other side of the previous line. + new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant + last_childs_last_descendant.next_sibling = None + + for child in to_append: + child.parent = new_parent_element + new_parent_element.contents.append(child) + + # Now that this element has no children, change its .next_element. 
+ element.contents = [] + element.next_element = final_next_element + + # print("DONE WITH MOVE") + # print("FROM", self.element) + # print("TO", new_parent_element) + + def cloneNode(self): + tag = self.soup.new_tag(self.element.name, self.namespace) + node = Element(tag, self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + treebuilder_base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/_htmlparser.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/_htmlparser.py new file mode 100644 index 00000000000..88860a994c4 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/_htmlparser.py @@ -0,0 +1,477 @@ +# encoding: utf-8 +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from html.parser import HTMLParser + +try: + from html.parser import HTMLParseError +except ImportError as e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + +import sys +import warnings + +# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' +# argument, which we'd like to set to False. Unfortunately, +# http://bugs.python.org/issue13273 makes strict=True a better bet +# before Python 3.2.3. +# +# At the end of this file, we monkeypatch HTMLParser so that +# strict=True works well on Python 3.2.2. +major, minor, release = sys.version_info[:3] +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + + +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class BeautifulSoupHTMLParser(HTMLParser): + """A subclass of the Python standard library's HTMLParser class, which + listens for HTMLParser events and translates them into calls + to Beautiful Soup's tree construction API. + """ + + # Strategies for handling duplicate attributes + IGNORE = 'ignore' + REPLACE = 'replace' + + def __init__(self, *args, **kwargs): + """Constructor. + + :param on_duplicate_attribute: A strategy for what to do if a + tag includes the same attribute more than once. Accepted + values are: REPLACE (replace earlier values with later + ones, the default), IGNORE (keep the earliest value + encountered), or a callable. A callable must take three + arguments: the dictionary of attributes already processed, + the name of the duplicate attribute, and the most recent value + encountered. 
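+
+        For example (a minimal sketch; 'keep_all' is a hypothetical
+        callable), this keeps every value seen for a duplicated
+        attribute::
+
+            def keep_all(attrs, key, value):
+                # Fold the new value into a list of the values seen so far.
+                if not isinstance(attrs[key], list):
+                    attrs[key] = [attrs[key]]
+                attrs[key].append(value)
+
+            soup = BeautifulSoup('<a class="x" class="y"></a>', 'html.parser',
+                                 on_duplicate_attribute=keep_all)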
+ """ + self.on_duplicate_attribute = kwargs.pop( + 'on_duplicate_attribute', self.REPLACE + ) + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + def error(self, msg): + """In Python 3, HTMLParser subclasses must implement error(), although + this requirement doesn't appear to be documented. + + In Python 2, HTMLParser implements error() by raising an exception, + which we don't want to do. + + In any event, this method is called only on very strange + markup and our best strategy is to pretend it didn't happen + and keep going. + """ + warnings.warn(msg) + + def handle_startendtag(self, name, attrs): + """Handle an incoming empty-element tag. + + This is only called when the markup looks like <tag/>. + + :param name: Name of the tag. + :param attrs: Dictionary of the tag's attributes. + """ + # is_startend() tells handle_starttag not to close the tag + # just because its name matches a known empty-element tag. We + # know that this is an empty-element tag and we want to call + # handle_endtag ourselves. + tag = self.handle_starttag(name, attrs, handle_empty_element=False) + self.handle_endtag(name) + + def handle_starttag(self, name, attrs, handle_empty_element=True): + """Handle an opening tag, e.g. '<tag>' + + :param name: Name of the tag. + :param attrs: Dictionary of the tag's attributes. + :param handle_empty_element: True if this tag is known to be + an empty-element tag (i.e. there is not expected to be any + closing tag). + """ + # XXX namespace + attr_dict = {} + for key, value in attrs: + # Change None attribute values to the empty string + # for consistency with the other tree builders. + if value is None: + value = '' + if key in attr_dict: + # A single attribute shows up multiple times in this + # tag. How to handle it depends on the + # on_duplicate_attribute setting. + on_dupe = self.on_duplicate_attribute + if on_dupe == self.IGNORE: + pass + elif on_dupe in (None, self.REPLACE): + attr_dict[key] = value + else: + on_dupe(attr_dict, key, value) + else: + attr_dict[key] = value + attrvalue = '""' + #print("START", name) + sourceline, sourcepos = self.getpos() + tag = self.soup.handle_starttag( + name, None, None, attr_dict, sourceline=sourceline, + sourcepos=sourcepos + ) + if tag and tag.is_empty_element and handle_empty_element: + # Unlike other parsers, html.parser doesn't send separate end tag + # events for empty-element tags. (It's handled in + # handle_startendtag, but only if the original markup looked like + # <tag/>.) + # + # So we need to call handle_endtag() ourselves. Since we + # know the start event is identical to the end event, we + # don't want handle_endtag() to cross off any previous end + # events for tags of this name. + self.handle_endtag(name, check_already_closed=False) + + # But we might encounter an explicit closing tag for this tag + # later on. If so, we want to ignore it. + self.already_closed_empty_element.append(name) + + def handle_endtag(self, name, check_already_closed=True): + """Handle a closing tag, e.g. '</tag>' + + :param name: A tag name. 
+        :param check_already_closed: True if this tag is expected to
+           be the closing portion of an empty-element tag,
+           e.g. '<tag></tag>'.
+        """
+        #print("END", name)
+        if check_already_closed and name in self.already_closed_empty_element:
+            # This is a redundant end tag for an empty-element tag.
+            # We've already called handle_endtag() for it, so just
+            # check it off the list.
+            #print("ALREADY CLOSED", name)
+            self.already_closed_empty_element.remove(name)
+        else:
+            self.soup.handle_endtag(name)
+
+    def handle_data(self, data):
+        """Handle some textual data that shows up between tags."""
+        self.soup.handle_data(data)
+
+    def handle_charref(self, name):
+        """Handle a numeric character reference by converting it to the
+        corresponding Unicode character and treating it as textual
+        data.
+
+        :param name: Character number, possibly in hexadecimal.
+        """
+        # XXX workaround for a bug in HTMLParser. Remove this once
+        # it's fixed in all supported versions.
+        # http://bugs.python.org/issue13633
+        if name.startswith('x'):
+            real_name = int(name.lstrip('x'), 16)
+        elif name.startswith('X'):
+            real_name = int(name.lstrip('X'), 16)
+        else:
+            real_name = int(name)
+
+        data = None
+        if real_name < 256:
+            # HTML numeric entities are supposed to reference Unicode
+            # code points, but sometimes they reference code points in
+            # some other encoding (ahem, Windows-1252). E.g. &#147;
+            # instead of &#x201C; for LEFT DOUBLE QUOTATION MARK. This
+            # code tries to detect this situation and compensate.
+            for encoding in (self.soup.original_encoding, 'windows-1252'):
+                if not encoding:
+                    continue
+                try:
+                    data = bytearray([real_name]).decode(encoding)
+                except UnicodeDecodeError as e:
+                    pass
+        if not data:
+            try:
+                data = chr(real_name)
+            except (ValueError, OverflowError) as e:
+                pass
+        data = data or "\N{REPLACEMENT CHARACTER}"
+        self.handle_data(data)
+
+    def handle_entityref(self, name):
+        """Handle a named entity reference by converting it to the
+        corresponding Unicode character and treating it as textual
+        data.
+
+        :param name: Name of the entity reference.
+        """
+        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
+        if character is not None:
+            data = character
+        else:
+            # If this were XML, it would be ambiguous whether "&foo"
+            # was a character entity reference with a missing
+            # semicolon or the literal string "&foo". Since this is
+            # HTML, we have a complete list of all character entity references,
+            # and this one wasn't found, so assume it's the literal string "&foo".
+            data = "&%s" % name
+        self.handle_data(data)
+
+    def handle_comment(self, data):
+        """Handle an HTML comment.
+
+        :param data: The text of the comment.
+        """
+        self.soup.endData()
+        self.soup.handle_data(data)
+        self.soup.endData(Comment)
+
+    def handle_decl(self, data):
+        """Handle a DOCTYPE declaration.
+
+        :param data: The text of the declaration.
+        """
+        self.soup.endData()
+        data = data[len("DOCTYPE "):]
+        self.soup.handle_data(data)
+        self.soup.endData(Doctype)
+
+    def unknown_decl(self, data):
+        """Handle a declaration of unknown type -- probably a CDATA block.
+
+        :param data: The text of the declaration.
+        """
+        if data.upper().startswith('CDATA['):
+            cls = CData
+            data = data[len('CDATA['):]
+        else:
+            cls = Declaration
+        self.soup.endData()
+        self.soup.handle_data(data)
+        self.soup.endData(cls)
+
+    def handle_pi(self, data):
+        """Handle a processing instruction.
+
+        :param data: The text of the instruction.
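+
+        For instance, markup like '<?xml-stylesheet href="a.css"?>'
+        arrives here with data set to 'xml-stylesheet href="a.css"?'.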
+ """ + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser, + found in the Python standard library. + """ + is_xml = False + picklable = True + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] + + # The html.parser knows which line number and position in the + # original file is the source of an element. + TRACKS_LINE_NUMBERS = True + + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + """Constructor. + + :param parser_args: Positional arguments to pass into + the BeautifulSoupHTMLParser constructor, once it's + invoked. + :param parser_kwargs: Keyword arguments to pass into + the BeautifulSoupHTMLParser constructor, once it's + invoked. + :param kwargs: Keyword arguments for the superclass constructor. + """ + # Some keyword arguments will be pulled out of kwargs and placed + # into parser_kwargs. + extra_parser_kwargs = dict() + for arg in ('on_duplicate_attribute',): + if arg in kwargs: + value = kwargs.pop(arg) + extra_parser_kwargs[arg] = value + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} + parser_kwargs.update(extra_parser_kwargs) + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: + parser_kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None, exclude_encodings=None): + + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: Some markup -- probably a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. + """ + if isinstance(markup, str): + # Parse Unicode as-is. + yield (markup, None, None, False) + return + + # Ask UnicodeDammit to sniff the most likely encoding. + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) + yield (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + """Run some incoming markup through some parsing process, + populating the `BeautifulSoup` object in self.soup. + """ + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + try: + parser.feed(markup) + parser.close() + except HTMLParseError as e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. 
See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e + parser.already_closed_empty_element = [] + +# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some +# 3.2.3 code. This ensures they don't treat markup like <p></p> as a +# string. +# +# XXX This code can be removed once most Python 3 users are on 3.2.3. +if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: + import re + attrfind_tolerant = re.compile( + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant + + locatestarttagend = re.compile(r""" + <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name + (?:\s+ # whitespace before attribute name + (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name + (?:\s*=\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |\"[^\"]*\" # LIT-enclosed value + |[^'\">\s]+ # bare value + ) + )? + ) + )* + \s* # trailing whitespace +""", re.VERBOSE) + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend + + from html.parser import tagfind, attrfind + + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = rawdata[i+1:k].lower() + while k < endpos: + if self.strict: + m = attrfind.match(rawdata, k) + else: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = self.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + else: + offset = offset + len(self.__starttag_text) + if self.strict: + self.error("junk characters in start tag: %r" + % (rawdata[k:endpos][:20],)) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: <span attr="value" /> + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) + + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode + + CONSTRUCTOR_TAKES_STRICT = True diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/_lxml.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/_lxml.py new file mode 100644 index 00000000000..432a2c86aeb --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/builder/_lxml.py @@ -0,0 +1,332 @@ +# Use of this source code is governed by the MIT license. 
+__license__ = "MIT" + +__all__ = [ + 'LXMLTreeBuilderForXML', + 'LXMLTreeBuilder', + ] + +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable + +from io import BytesIO +from io import StringIO +from lxml import etree +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, + XMLProcessingInstruction, +) +from bs4.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + ParserRejectedMarkup, + TreeBuilder, + XML) +from bs4.dammit import EncodingDetector + +LXML = 'lxml' + +def _invert(d): + "Invert a dictionary." + return dict((v,k) for k, v in list(d.items())) + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser + + is_xml = True + processing_instruction_class = XMLProcessingInstruction + + NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] + + # Well, it's permissive by XML parser standards. + features = [NAME, LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE = 512 + + # This namespace mapping is specified in the XML Namespace + # standard. + DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') + + DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) + + # NOTE: If we parsed Element objects and looked at .sourceline, + # we'd be able to see the line numbers from the original document. + # But instead we build an XMLParser or HTMLParser object to serve + # as the target of parse messages, and those messages don't include + # line numbers. + # See: https://bugs.launchpad.net/lxml/+bug/1846906 + + def initialize_soup(self, soup): + """Let the BeautifulSoup object know about the standard namespace + mapping. + + :param soup: A `BeautifulSoup`. + """ + super(LXMLTreeBuilderForXML, self).initialize_soup(soup) + self._register_namespaces(self.DEFAULT_NSMAPS) + + def _register_namespaces(self, mapping): + """Let the BeautifulSoup object know about namespaces encountered + while parsing the document. + + This might be useful later on when creating CSS selectors. + + :param mapping: A dictionary mapping namespace prefixes to URIs. + """ + for key, value in list(mapping.items()): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same + # prefix, the first one in the document takes precedence. + self.soup._namespaces[key] = value + + def default_parser(self, encoding): + """Find the default parser for the given encoding. + + :param encoding: A string. + :return: Either a parser object or a class, which + will be instantiated with default arguments. + """ + if self._default_parser is not None: + return self._default_parser + return etree.XMLParser( + target=self, strip_cdata=False, recover=True, encoding=encoding) + + def parser_for(self, encoding): + """Instantiate an appropriate parser for the given encoding. + + :param encoding: A string. + :return: A parser object such as an `etree.XMLParser`. + """ + # Use the default parser. + parser = self.default_parser(encoding) + + if isinstance(parser, Callable): + # Instantiate the parser with default arguments + parser = parser( + target=self, strip_cdata=False, recover=True, encoding=encoding + ) + return parser + + def __init__(self, parser=None, empty_element_tags=None, **kwargs): + # TODO: Issue a warning if parser is present but not a + # callable, since that means there's no way to create new + # parsers for different encodings. 
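+        # As the code stands, the contract is: if `parser` is a class
+        # or other callable, parser_for() will instantiate it later
+        # with this builder as the parse target; if it is already a
+        # parser object, it is used as-is and is assumed to be
+        # configured appropriately.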
+ self._default_parser = parser + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) + self.soup = None + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + super(LXMLTreeBuilderForXML, self).__init__(**kwargs) + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) + + def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, + document_declared_encoding=None): + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + lxml really wants to get a bytestring and convert it to + Unicode itself. So instead of using UnicodeDammit to convert + the bytestring to Unicode using different encodings, this + implementation uses EncodingDetector to iterate over the + encodings, and tell lxml to try to parse the document as each + one in turn. + + :param markup: Some markup -- hopefully a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. + """ + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + + if isinstance(markup, str): + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + + if isinstance(markup, str): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. + yield (markup.encode("utf8"), "utf8", + document_declared_encoding, False) + + try_encodings = [user_specified_encoding, document_declared_encoding] + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) + for encoding in detector.encodings: + yield (detector.markup, encoding, document_declared_encoding, False) + + def feed(self, markup): + if isinstance(markup, bytes): + markup = BytesIO(markup) + elif isinstance(markup, str): + markup = StringIO(markup) + + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + try: + self.parser = self.parser_for(self.soup.original_encoding) + self.parser.feed(data) + while len(data) != 0: + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if len(data) != 0: + self.parser.feed(data) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(e) + + def close(self): + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + + def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. + attrs = dict(attrs) + nsprefix = None + # Invert each namespace map as it comes in. + if len(nsmap) == 0 and len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. 
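+            # (For example, after a <root xmlns:a="http://ns.example/">
+            # start tag, a plain <child> start tag still pushes a
+            # placeholder entry, so that end() pops the stack the
+            # right number of times.)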
+ self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + + # First, Let the BeautifulSoup object know about it. + self._register_namespaces(nsmap) + + # Then, add it to our running list of inverted namespace + # mappings. + self.nsmaps.append(_invert(nsmap)) + + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in list(nsmap.items()): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} + for attr, value in list(attrs.items()): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs + + namespace, name = self._getNsTag(name) + nsprefix = self._prefix_for_namespace(namespace) + self.soup.handle_starttag(name, namespace, nsprefix, attrs) + + def _prefix_for_namespace(self, namespace): + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + return None + + def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if len(self.nsmaps) > 1: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. + self.nsmaps.pop() + + def pi(self, target, data): + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(self.processing_instruction_class) + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) + + def comment(self, content): + "Handle comments as Comment objects." 
+ self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] + is_xml = False + processing_instruction_class = ProcessingInstruction + + def default_parser(self, encoding): + return etree.HTMLParser + + def feed(self, markup): + encoding = self.soup.original_encoding + try: + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(e) + + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '<html><body>%s</body></html>' % fragment diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/dammit.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/dammit.py new file mode 100644 index 00000000000..ee3708f5c36 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/dammit.py @@ -0,0 +1,939 @@ +# -*- coding: utf-8 -*- +"""Beautiful Soup bonus library: Unicode, Dammit + +This library converts a bytestream to Unicode through any means +necessary. It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It works best on XML and HTML, but it does not rewrite the +XML or HTML to reflect a new encoding; that's the tree builder's job. +""" +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import codecs +from html.entities import codepoint2name +import re +import logging +import string + +# Import a library to autodetect character encodings. +chardet_type = None +try: + # First try the fast C implementation. + # PyPI package: cchardet + import cchardet + def chardet_dammit(s): + if isinstance(s, str): + return None + return cchardet.detect(s)['encoding'] +except ImportError: + try: + # Fall back to the pure Python implementation + # Debian package: python-chardet + # PyPI package: chardet + import chardet + def chardet_dammit(s): + if isinstance(s, str): + return None + return chardet.detect(s)['encoding'] + #import chardet.constants + #chardet.constants._debug = 1 + except ImportError: + # No chardet available. + def chardet_dammit(s): + return None + +# Available from http://cjkpython.i18n.org/. +# +# TODO: This doesn't work anymore and the closest thing, iconv_codecs, +# is GPL-licensed. Check whether this is still necessary. +try: + import iconv_codec +except ImportError: + pass + +# Build bytestring and Unicode versions of regular expressions for finding +# a declared encoding inside an XML or HTML document. 
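+#
+# For instance, the XML pattern below is meant to pull 'utf-8' out of
+# '<?xml version="1.0" encoding="utf-8"?>', and the HTML pattern out of
+# '<meta charset="utf-8">' or the older
+# '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'.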
+xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' +html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' +encoding_res = dict() +encoding_res[bytes] = { + 'html' : re.compile(html_meta.encode("ascii"), re.I), + 'xml' : re.compile(xml_encoding.encode("ascii"), re.I), +} +encoding_res[str] = { + 'html' : re.compile(html_meta, re.I), + 'xml' : re.compile(xml_encoding, re.I) +} + +class EntitySubstitution(object): + """The ability to substitute XML or HTML entities for certain characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters_for_re = [] + + # &apos is an XHTML entity and an HTML 5, but not an HTML 4 + # entity. We don't want to use it, but we want to recognize it on the way in. + # + # TODO: Ideally we would be able to recognize all HTML 5 named + # entities, but that's a little tricky. + extra = [(39, 'apos')] + for codepoint, name in list(codepoint2name.items()) + extra: + character = chr(codepoint) + if codepoint not in (34, 39): + # There's no point in turning the quotation mark into + # " or the single quote into ', unless it + # happens within an attribute value, which is handled + # elsewhere. + characters_for_re.append(character) + lookup[character] = name + # But we do want to recognize those entities on the way in and + # convert them to Unicode characters. + reverse_lookup[name] = character + re_definition = "[%s]" % "".join(characters_for_re) + return lookup, reverse_lookup, re.compile(re_definition) + (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, + CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() + + CHARACTER_TO_XML_ENTITY = { + "'": "apos", + '"': "quot", + "&": "amp", + "<": "lt", + ">": "gt", + } + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" + ")") + + AMPERSAND_OR_BRACKET = re.compile("([<>&])") + + @classmethod + def _substitute_html_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate HTML entity for a special character.""" + entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + return "&%s;" % entity + + @classmethod + def _substitute_xml_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate XML entity for a special character.""" + entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + return "&%s;" % entity + + @classmethod + def quoted_attribute_value(self, value): + """Make a value into a quoted XML attribute, possibly escaping it. + + Most strings will be quoted using double quotes. + + Bob's Bar -> "Bob's Bar" + + If a string contains double quotes, it will be quoted using + single quotes. + + Welcome to "my bar" -> 'Welcome to "my bar"' + + If a string contains both single and double quotes, the + double quotes will be escaped, and the string will be quoted + using double quotes. + + Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" + """ + quote_with = '"' + if '"' in value: + if "'" in value: + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # """ whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between ' and &squot;. + replace_with = """ + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. + # We can use single quotes to quote the attribute. 
+ quote_with = "'" + return quote_with + value + quote_with + + @classmethod + def substitute_xml(cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign + will become <, the greater-than sign will become >, + and any ampersands will become &. If you want ampersands + that appear to be part of an entity definition to be left + alone, use substitute_xml_containing_entities() instead. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets and ampersands. + value = cls.AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_xml_containing_entities( + cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign will + become <, the greater-than sign will become >, and any + ampersands that are not part of an entity defition will + become &. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets, and ampersands that aren't part of + # entities. + value = cls.BARE_AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_html(cls, s): + """Replace certain Unicode characters with named HTML entities. + + This differs from data.encode(encoding, 'xmlcharrefreplace') + in that the goal is to make the result more readable (to those + with ASCII displays) rather than to recover from + errors. There's absolutely nothing wrong with a UTF-8 string + containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that + character with "é" will make it more readable to some + people. + + :param s: A Unicode string. + """ + return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( + cls._substitute_html_entity, s) + + +class EncodingDetector: + """Suggests a number of possible encodings for a bytestring. + + Order of precedence: + + 1. Encodings you specifically tell EncodingDetector to try first + (the override_encodings argument to the constructor). + + 2. An encoding declared within the bytestring itself, either in an + XML declaration (if the bytestring is to be interpreted as an XML + document), or in a <meta> tag (if the bytestring is to be + interpreted as an HTML document.) + + 3. An encoding detected through textual analysis by chardet, + cchardet, or a similar external library. + + 4. UTF-8. + + 5. Windows-1252. + """ + def __init__(self, markup, override_encodings=None, is_html=False, + exclude_encodings=None): + """Constructor. + + :param markup: Some markup in an unknown encoding. + :param override_encodings: These encodings will be tried first. + :param is_html: If True, this markup is considered to be HTML. Otherwise + it's assumed to be XML. + :param exclude_encodings: These encodings will not be tried, even + if they otherwise would be. + """ + self.override_encodings = override_encodings or [] + exclude_encodings = exclude_encodings or [] + self.exclude_encodings = set([x.lower() for x in exclude_encodings]) + self.chardet_encoding = None + self.is_html = is_html + self.declared_encoding = None + + # First order of business: strip a byte-order mark. 
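+        # (For example, markup that starts with b'\xef\xbb\xbf' is
+        # taken to be UTF-8, and those three bytes are removed before
+        # any further sniffing.)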
+ self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) + + def _usable(self, encoding, tried): + """Should we even bother to try this encoding? + + :param encoding: Name of an encoding. + :param tried: Encodings that have already been tried. This will be modified + as a side effect. + """ + if encoding is not None: + encoding = encoding.lower() + if encoding in self.exclude_encodings: + return False + if encoding not in tried: + tried.add(encoding) + return True + return False + + @property + def encodings(self): + """Yield a number of encodings that might work for this markup. + + :yield: A sequence of strings. + """ + tried = set() + for e in self.override_encodings: + if self._usable(e, tried): + yield e + + # Did the document originally start with a byte-order mark + # that indicated its encoding? + if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding + + # Look within the document for an XML or HTML encoding + # declaration. + if self.declared_encoding is None: + self.declared_encoding = self.find_declared_encoding( + self.markup, self.is_html) + if self._usable(self.declared_encoding, tried): + yield self.declared_encoding + + # Use third-party character set detection to guess at the + # encoding. + if self.chardet_encoding is None: + self.chardet_encoding = chardet_dammit(self.markup) + if self._usable(self.chardet_encoding, tried): + yield self.chardet_encoding + + # As a last-ditch effort, try utf-8 and windows-1252. + for e in ('utf-8', 'windows-1252'): + if self._usable(e, tried): + yield e + + @classmethod + def strip_byte_order_mark(cls, data): + """If a byte-order mark is present, strip it and return the encoding it implies. + + :param data: Some markup. + :return: A 2-tuple (modified data, implied encoding) + """ + encoding = None + if isinstance(data, str): + # Unicode data cannot have a byte-order mark. + return data, encoding + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == b'\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == b'\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == b'\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + return data, encoding + + @classmethod + def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False): + """Given a document, tries to find its declared encoding. + + An XML encoding is declared at the beginning of the document. + + An HTML encoding is declared in a <meta> tag, hopefully near the + beginning of the document. + + :param markup: Some markup. + :param is_html: If True, this markup is considered to be HTML. Otherwise + it's assumed to be XML. + :param search_entire_document: Since an encoding is supposed to declared near the beginning + of the document, most of the time it's only necessary to search a few kilobytes of data. + Set this to True to force this method to search the entire document. 
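+
+        A minimal sketch of the common case:
+
+            EncodingDetector.find_declared_encoding(
+                b'<?xml version="1.0" encoding="ISO-8859-1"?><a/>')
+            # -> 'iso-8859-1'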
+ """ + if search_entire_document: + xml_endpos = html_endpos = len(markup) + else: + xml_endpos = 1024 + html_endpos = max(2048, int(len(markup) * 0.05)) + + if isinstance(markup, bytes): + res = encoding_res[bytes] + else: + res = encoding_res[str] + + xml_re = res['xml'] + html_re = res['html'] + declared_encoding = None + declared_encoding_match = xml_re.search(markup, endpos=xml_endpos) + if not declared_encoding_match and is_html: + declared_encoding_match = html_re.search(markup, endpos=html_endpos) + if declared_encoding_match is not None: + declared_encoding = declared_encoding_match.groups()[0] + if declared_encoding: + if isinstance(declared_encoding, bytes): + declared_encoding = declared_encoding.decode('ascii', 'replace') + return declared_encoding.lower() + return None + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. + CHARSET_ALIASES = {"macintosh": "mac-roman", + "x-sjis": "shift-jis"} + + ENCODINGS_WITH_SMART_QUOTES = [ + "windows-1252", + "iso-8859-1", + "iso-8859-2", + ] + + def __init__(self, markup, override_encodings=[], + smart_quotes_to=None, is_html=False, exclude_encodings=[]): + """Constructor. + + :param markup: A bytestring representing markup in an unknown encoding. + :param override_encodings: These encodings will be tried first, + before any sniffing code is run. + + :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted + to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead. + Setting it to 'xml' will convert them to XML entity references, and setting it to 'html' + will convert them to HTML entity references. + :param is_html: If True, this markup is considered to be HTML. Otherwise + it's assumed to be XML. + :param exclude_encodings: These encodings will not be considered, even + if the sniffing code thinks they might make sense. + """ + self.smart_quotes_to = smart_quotes_to + self.tried_encodings = [] + self.contains_replacement_characters = False + self.is_html = is_html + self.log = logging.getLogger(__name__) + self.detector = EncodingDetector( + markup, override_encodings, is_html, exclude_encodings) + + # Short-circuit if the data is in Unicode to begin with. + if isinstance(markup, str) or markup == '': + self.markup = markup + self.unicode_markup = str(markup) + self.original_encoding = None + return + + # The encoding detector may have stripped a byte-order mark. + # Use the stripped markup from this point on. + self.markup = self.detector.markup + + u = None + for encoding in self.detector.encodings: + markup = self.detector.markup + u = self._convert_from(encoding) + if u is not None: + break + + if not u: + # None of the encodings worked. As an absolute last resort, + # try them again with character replacement. + + for encoding in self.detector.encodings: + if encoding != "ascii": + u = self._convert_from(encoding, "replace") + if u is not None: + self.log.warning( + "Some characters could not be decoded, and were " + "replaced with REPLACEMENT CHARACTER." 
+ ) + self.contains_replacement_characters = True + break + + # If none of that worked, we could at this point force it to + # ASCII, but that would destroy so much data that I think + # giving up is better. + self.unicode_markup = u + if not u: + self.original_encoding = None + + def _sub_ms_char(self, match): + """Changes a MS smart quote character to an XML or HTML + entity, or an ASCII character.""" + orig = match.group(1) + if self.smart_quotes_to == 'ascii': + sub = self.MS_CHARS_TO_ASCII.get(orig).encode() + else: + sub = self.MS_CHARS.get(orig) + if type(sub) == tuple: + if self.smart_quotes_to == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convert_from(self, proposed, errors="strict"): + """Attempt to convert the markup to the proposed encoding. + + :param proposed: The name of a character encoding. + """ + proposed = self.find_codec(proposed) + if not proposed or (proposed, errors) in self.tried_encodings: + return None + self.tried_encodings.append((proposed, errors)) + markup = self.markup + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if (self.smart_quotes_to is not None + and proposed in self.ENCODINGS_WITH_SMART_QUOTES): + smart_quotes_re = b"([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) + + try: + #print("Trying to convert document to %s (errors=%s)" % ( + # proposed, errors)) + u = self._to_unicode(markup, proposed, errors) + self.markup = u + self.original_encoding = proposed + except Exception as e: + #print("That didn't work!") + #print(e) + return None + #print("Correct encoding: %s" % proposed) + return self.markup + + def _to_unicode(self, data, encoding, errors="strict"): + """Given a string and its encoding, decodes the string into Unicode. + + :param encoding: The name of an encoding. + """ + return str(data, encoding, errors) + + @property + def declared_html_encoding(self): + """If the markup is an HTML document, returns the encoding declared _within_ + the document. + """ + if not self.is_html: + return None + return self.detector.declared_encoding + + def find_codec(self, charset): + """Convert the name of a character set to a codec name. + + :param charset: The name of a character set. + :return: The name of a codec. + """ + value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) + or (charset and self._codec(charset.replace("-", ""))) + or (charset and self._codec(charset.replace("-", "_"))) + or (charset and charset.lower()) + or charset + ) + if value: + return value.lower() + return None + + def _codec(self, charset): + if not charset: + return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + + # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. 
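+    # As a sketch of how these tables are used: given the bytestring
+    # b'\x91Hi!\x92' (Windows-1252 curly quotes),
+    # UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html")
+    # should yield the Unicode string '&lsquo;Hi!&rsquo;' rather than
+    # the curly-quote characters themselves.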
+ MS_CHARS = {b'\x80': ('euro', '20AC'), + b'\x81': ' ', + b'\x82': ('sbquo', '201A'), + b'\x83': ('fnof', '192'), + b'\x84': ('bdquo', '201E'), + b'\x85': ('hellip', '2026'), + b'\x86': ('dagger', '2020'), + b'\x87': ('Dagger', '2021'), + b'\x88': ('circ', '2C6'), + b'\x89': ('permil', '2030'), + b'\x8A': ('Scaron', '160'), + b'\x8B': ('lsaquo', '2039'), + b'\x8C': ('OElig', '152'), + b'\x8D': '?', + b'\x8E': ('#x17D', '17D'), + b'\x8F': '?', + b'\x90': '?', + b'\x91': ('lsquo', '2018'), + b'\x92': ('rsquo', '2019'), + b'\x93': ('ldquo', '201C'), + b'\x94': ('rdquo', '201D'), + b'\x95': ('bull', '2022'), + b'\x96': ('ndash', '2013'), + b'\x97': ('mdash', '2014'), + b'\x98': ('tilde', '2DC'), + b'\x99': ('trade', '2122'), + b'\x9a': ('scaron', '161'), + b'\x9b': ('rsaquo', '203A'), + b'\x9c': ('oelig', '153'), + b'\x9d': '?', + b'\x9e': ('#x17E', '17E'), + b'\x9f': ('Yuml', ''),} + + # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains + # horrors like stripping diacritical marks to turn á into a, but also + # contains non-horrors like turning “ into ". + MS_CHARS_TO_ASCII = { + b'\x80' : 'EUR', + b'\x81' : ' ', + b'\x82' : ',', + b'\x83' : 'f', + b'\x84' : ',,', + b'\x85' : '...', + b'\x86' : '+', + b'\x87' : '++', + b'\x88' : '^', + b'\x89' : '%', + b'\x8a' : 'S', + b'\x8b' : '<', + b'\x8c' : 'OE', + b'\x8d' : '?', + b'\x8e' : 'Z', + b'\x8f' : '?', + b'\x90' : '?', + b'\x91' : "'", + b'\x92' : "'", + b'\x93' : '"', + b'\x94' : '"', + b'\x95' : '*', + b'\x96' : '-', + b'\x97' : '--', + b'\x98' : '~', + b'\x99' : '(TM)', + b'\x9a' : 's', + b'\x9b' : '>', + b'\x9c' : 'oe', + b'\x9d' : '?', + b'\x9e' : 'z', + b'\x9f' : 'Y', + b'\xa0' : ' ', + b'\xa1' : '!', + b'\xa2' : 'c', + b'\xa3' : 'GBP', + b'\xa4' : '$', #This approximation is especially parochial--this is the + #generic currency symbol. + b'\xa5' : 'YEN', + b'\xa6' : '|', + b'\xa7' : 'S', + b'\xa8' : '..', + b'\xa9' : '', + b'\xaa' : '(th)', + b'\xab' : '<<', + b'\xac' : '!', + b'\xad' : ' ', + b'\xae' : '(R)', + b'\xaf' : '-', + b'\xb0' : 'o', + b'\xb1' : '+-', + b'\xb2' : '2', + b'\xb3' : '3', + b'\xb4' : ("'", 'acute'), + b'\xb5' : 'u', + b'\xb6' : 'P', + b'\xb7' : '*', + b'\xb8' : ',', + b'\xb9' : '1', + b'\xba' : '(th)', + b'\xbb' : '>>', + b'\xbc' : '1/4', + b'\xbd' : '1/2', + b'\xbe' : '3/4', + b'\xbf' : '?', + b'\xc0' : 'A', + b'\xc1' : 'A', + b'\xc2' : 'A', + b'\xc3' : 'A', + b'\xc4' : 'A', + b'\xc5' : 'A', + b'\xc6' : 'AE', + b'\xc7' : 'C', + b'\xc8' : 'E', + b'\xc9' : 'E', + b'\xca' : 'E', + b'\xcb' : 'E', + b'\xcc' : 'I', + b'\xcd' : 'I', + b'\xce' : 'I', + b'\xcf' : 'I', + b'\xd0' : 'D', + b'\xd1' : 'N', + b'\xd2' : 'O', + b'\xd3' : 'O', + b'\xd4' : 'O', + b'\xd5' : 'O', + b'\xd6' : 'O', + b'\xd7' : '*', + b'\xd8' : 'O', + b'\xd9' : 'U', + b'\xda' : 'U', + b'\xdb' : 'U', + b'\xdc' : 'U', + b'\xdd' : 'Y', + b'\xde' : 'b', + b'\xdf' : 'B', + b'\xe0' : 'a', + b'\xe1' : 'a', + b'\xe2' : 'a', + b'\xe3' : 'a', + b'\xe4' : 'a', + b'\xe5' : 'a', + b'\xe6' : 'ae', + b'\xe7' : 'c', + b'\xe8' : 'e', + b'\xe9' : 'e', + b'\xea' : 'e', + b'\xeb' : 'e', + b'\xec' : 'i', + b'\xed' : 'i', + b'\xee' : 'i', + b'\xef' : 'i', + b'\xf0' : 'o', + b'\xf1' : 'n', + b'\xf2' : 'o', + b'\xf3' : 'o', + b'\xf4' : 'o', + b'\xf5' : 'o', + b'\xf6' : 'o', + b'\xf7' : '/', + b'\xf8' : 'o', + b'\xf9' : 'u', + b'\xfa' : 'u', + b'\xfb' : 'u', + b'\xfc' : 'u', + b'\xfd' : 'y', + b'\xfe' : 'b', + b'\xff' : 'y', + } + + # A map used when removing rogue Windows-1252/ISO-8859-1 + # characters in otherwise UTF-8 documents. 
+    #
+    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
+    # Windows-1252.
+    WINDOWS_1252_TO_UTF8 = {
+        0x80 : b'\xe2\x82\xac', # €
+        0x82 : b'\xe2\x80\x9a', # ‚
+        0x83 : b'\xc6\x92', # ƒ
+        0x84 : b'\xe2\x80\x9e', # „
+        0x85 : b'\xe2\x80\xa6', # …
+        0x86 : b'\xe2\x80\xa0', # †
+        0x87 : b'\xe2\x80\xa1', # ‡
+        0x88 : b'\xcb\x86', # ˆ
+        0x89 : b'\xe2\x80\xb0', # ‰
+        0x8a : b'\xc5\xa0', # Š
+        0x8b : b'\xe2\x80\xb9', # ‹
+        0x8c : b'\xc5\x92', # Œ
+        0x8e : b'\xc5\xbd', # Ž
+        0x91 : b'\xe2\x80\x98', # ‘
+        0x92 : b'\xe2\x80\x99', # ’
+        0x93 : b'\xe2\x80\x9c', # “
+        0x94 : b'\xe2\x80\x9d', # ”
+        0x95 : b'\xe2\x80\xa2', # •
+        0x96 : b'\xe2\x80\x93', # –
+        0x97 : b'\xe2\x80\x94', # —
+        0x98 : b'\xcb\x9c', # ˜
+        0x99 : b'\xe2\x84\xa2', # ™
+        0x9a : b'\xc5\xa1', # š
+        0x9b : b'\xe2\x80\xba', # ›
+        0x9c : b'\xc5\x93', # œ
+        0x9e : b'\xc5\xbe', # ž
+        0x9f : b'\xc5\xb8', # Ÿ
+        0xa0 : b'\xc2\xa0', #  
+        0xa1 : b'\xc2\xa1', # ¡
+        0xa2 : b'\xc2\xa2', # ¢
+        0xa3 : b'\xc2\xa3', # £
+        0xa4 : b'\xc2\xa4', # ¤
+        0xa5 : b'\xc2\xa5', # ¥
+        0xa6 : b'\xc2\xa6', # ¦
+        0xa7 : b'\xc2\xa7', # §
+        0xa8 : b'\xc2\xa8', # ¨
+        0xa9 : b'\xc2\xa9', # ©
+        0xaa : b'\xc2\xaa', # ª
+        0xab : b'\xc2\xab', # «
+        0xac : b'\xc2\xac', # ¬
+        0xad : b'\xc2\xad', #  
+        0xae : b'\xc2\xae', # ®
+        0xaf : b'\xc2\xaf', # ¯
+        0xb0 : b'\xc2\xb0', # °
+        0xb1 : b'\xc2\xb1', # ±
+        0xb2 : b'\xc2\xb2', # ²
+        0xb3 : b'\xc2\xb3', # ³
+        0xb4 : b'\xc2\xb4', # ´
+        0xb5 : b'\xc2\xb5', # µ
+        0xb6 : b'\xc2\xb6', # ¶
+        0xb7 : b'\xc2\xb7', # ·
+        0xb8 : b'\xc2\xb8', # ¸
+        0xb9 : b'\xc2\xb9', # ¹
+        0xba : b'\xc2\xba', # º
+        0xbb : b'\xc2\xbb', # »
+        0xbc : b'\xc2\xbc', # ¼
+        0xbd : b'\xc2\xbd', # ½
+        0xbe : b'\xc2\xbe', # ¾
+        0xbf : b'\xc2\xbf', # ¿
+        0xc0 : b'\xc3\x80', # À
+        0xc1 : b'\xc3\x81', # Á
+        0xc2 : b'\xc3\x82', # Â
+        0xc3 : b'\xc3\x83', # Ã
+        0xc4 : b'\xc3\x84', # Ä
+        0xc5 : b'\xc3\x85', # Å
+        0xc6 : b'\xc3\x86', # Æ
+        0xc7 : b'\xc3\x87', # Ç
+        0xc8 : b'\xc3\x88', # È
+        0xc9 : b'\xc3\x89', # É
+        0xca : b'\xc3\x8a', # Ê
+        0xcb : b'\xc3\x8b', # Ë
+        0xcc : b'\xc3\x8c', # Ì
+        0xcd : b'\xc3\x8d', # Í
+        0xce : b'\xc3\x8e', # Î
+        0xcf : b'\xc3\x8f', # Ï
+        0xd0 : b'\xc3\x90', # Ð
+        0xd1 : b'\xc3\x91', # Ñ
+        0xd2 : b'\xc3\x92', # Ò
+        0xd3 : b'\xc3\x93', # Ó
+        0xd4 : b'\xc3\x94', # Ô
+        0xd5 : b'\xc3\x95', # Õ
+        0xd6 : b'\xc3\x96', # Ö
+        0xd7 : b'\xc3\x97', # ×
+        0xd8 : b'\xc3\x98', # Ø
+        0xd9 : b'\xc3\x99', # Ù
+        0xda : b'\xc3\x9a', # Ú
+        0xdb : b'\xc3\x9b', # Û
+        0xdc : b'\xc3\x9c', # Ü
+        0xdd : b'\xc3\x9d', # Ý
+        0xde : b'\xc3\x9e', # Þ
+        0xdf : b'\xc3\x9f', # ß
+        0xe0 : b'\xc3\xa0', # à
+        0xe1 : b'\xc3\xa1', # á
+        0xe2 : b'\xc3\xa2', # â
+        0xe3 : b'\xc3\xa3', # ã
+        0xe4 : b'\xc3\xa4', # ä
+        0xe5 : b'\xc3\xa5', # å
+        0xe6 : b'\xc3\xa6', # æ
+        0xe7 : b'\xc3\xa7', # ç
+        0xe8 : b'\xc3\xa8', # è
+        0xe9 : b'\xc3\xa9', # é
+        0xea : b'\xc3\xaa', # ê
+        0xeb : b'\xc3\xab', # ë
+        0xec : b'\xc3\xac', # ì
+        0xed : b'\xc3\xad', # í
+        0xee : b'\xc3\xae', # î
+        0xef : b'\xc3\xaf', # ï
+        0xf0 : b'\xc3\xb0', # ð
+        0xf1 : b'\xc3\xb1', # ñ
+        0xf2 : b'\xc3\xb2', # ò
+        0xf3 : b'\xc3\xb3', # ó
+        0xf4 : b'\xc3\xb4', # ô
+        0xf5 : b'\xc3\xb5', # õ
+        0xf6 : b'\xc3\xb6', # ö
+        0xf7 : b'\xc3\xb7', # ÷
+        0xf8 : b'\xc3\xb8', # ø
+        0xf9 : b'\xc3\xb9', # ù
+        0xfa : b'\xc3\xba', # ú
+        0xfb : b'\xc3\xbb', # û
+        0xfc : b'\xc3\xbc', # ü
+        0xfd : b'\xc3\xbd', # ý
+        0xfe : b'\xc3\xbe', # þ
+        }
+
+    MULTIBYTE_MARKERS_AND_SIZES = [
+        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
+        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
+        (0xf0, 0xf4, 4), # 4-byte characters start with 
F0-F4 + ] + + FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] + LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] + + @classmethod + def detwingle(cls, in_bytes, main_encoding="utf8", + embedded_encoding="windows-1252"): + """Fix characters from one encoding embedded in some other encoding. + + Currently the only situation supported is Windows-1252 (or its + subset ISO-8859-1), embedded in UTF-8. + + :param in_bytes: A bytestring that you suspect contains + characters from multiple encodings. Note that this _must_ + be a bytestring. If you've already converted the document + to Unicode, you're too late. + :param main_encoding: The primary encoding of `in_bytes`. + :param embedded_encoding: The encoding that was used to embed characters + in the main document. + :return: A bytestring in which `embedded_encoding` + characters have been converted to their `main_encoding` + equivalents. + """ + if embedded_encoding.replace('_', '-').lower() not in ( + 'windows-1252', 'windows_1252'): + raise NotImplementedError( + "Windows-1252 and ISO-8859-1 are the only currently supported " + "embedded encodings.") + + if main_encoding.lower() not in ('utf8', 'utf-8'): + raise NotImplementedError( + "UTF-8 is the only currently supported main encoding.") + + byte_chunks = [] + + chunk_start = 0 + pos = 0 + while pos < len(in_bytes): + byte = in_bytes[pos] + if not isinstance(byte, int): + # Python 2.x + byte = ord(byte) + if (byte >= cls.FIRST_MULTIBYTE_MARKER + and byte <= cls.LAST_MULTIBYTE_MARKER): + # This is the start of a UTF-8 multibyte character. Skip + # to the end. + for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: + if byte >= start and byte <= end: + pos += size + break + elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: + # We found a Windows-1252 character! + # Save the string up to this point as a chunk. + byte_chunks.append(in_bytes[chunk_start:pos]) + + # Now translate the Windows-1252 character into UTF-8 + # and add it as another, one-byte chunk. + byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) + pos += 1 + chunk_start = pos + else: + # Go on to the next character. + pos += 1 + if chunk_start == 0: + # The string is unchanged. + return in_bytes + else: + # Store the final chunk. + byte_chunks.append(in_bytes[chunk_start:]) + return b''.join(byte_chunks) + diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/diagnose.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/diagnose.py new file mode 100644 index 00000000000..500e92df4d3 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/diagnose.py @@ -0,0 +1,242 @@ +"""Diagnostic functions, mainly for use when doing tech support.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import cProfile +from io import StringIO +from html.parser import HTMLParser +import bs4 +from bs4 import BeautifulSoup, __version__ +from bs4.builder import builder_registry + +import os +import pstats +import random +import tempfile +import time +import traceback +import sys +import cProfile + +def diagnose(data): + """Diagnostic suite for isolating common problems. + + :param data: A string containing markup that needs to be explained. + :return: None; diagnostics are printed to standard output. 
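+
+    A typical invocation (sketch):
+
+        from bs4.diagnose import diagnose
+        diagnose("<p>Some <b>bad<p>markup")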
+ """ + print(("Diagnostic running on Beautiful Soup %s" % __version__)) + print(("Python version %s" % sys.version)) + + basic_parsers = ["html.parser", "html5lib", "lxml"] + for name in basic_parsers: + for builder in builder_registry.builders: + if name in builder.features: + break + else: + basic_parsers.remove(name) + print(( + "I noticed that %s is not installed. Installing it may help." % + name)) + + if 'lxml' in basic_parsers: + basic_parsers.append("lxml-xml") + try: + from lxml import etree + print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))) + except ImportError as e: + print( + "lxml is not installed or couldn't be imported.") + + + if 'html5lib' in basic_parsers: + try: + import html5lib + print(("Found html5lib version %s" % html5lib.__version__)) + except ImportError as e: + print( + "html5lib is not installed or couldn't be imported.") + + if hasattr(data, 'read'): + data = data.read() + elif data.startswith("http:") or data.startswith("https:"): + print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") + return + else: + try: + if os.path.exists(data): + print(('"%s" looks like a filename. Reading data from the file.' % data)) + with open(data) as fp: + data = fp.read() + except ValueError: + # This can happen on some platforms when the 'filename' is + # too long. Assume it's data and not a filename. + pass + print("") + + for parser in basic_parsers: + print(("Trying to parse your markup with %s" % parser)) + success = False + try: + soup = BeautifulSoup(data, features=parser) + success = True + except Exception as e: + print(("%s could not parse the markup." % parser)) + traceback.print_exc() + if success: + print(("Here's what %s did with the markup:" % parser)) + print((soup.prettify())) + + print(("-" * 80)) + +def lxml_trace(data, html=True, **kwargs): + """Print out the lxml events that occur during parsing. + + This lets you see how lxml parses a document when no Beautiful + Soup code is running. You can use this to determine whether + an lxml-specific problem is in Beautiful Soup's lxml tree builders + or in lxml itself. + + :param data: Some markup. + :param html: If True, markup will be parsed with lxml's HTML parser. + if False, lxml's XML parser will be used. + """ + from lxml import etree + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): + print(("%s, %4s, %s" % (event, element.tag, element.text))) + +class AnnouncingParser(HTMLParser): + """Subclass of HTMLParser that announces parse events, without doing + anything else. + + You can use this to get a picture of how html.parser sees a given + document. The easiest way to do this is to call `htmlparser_trace`. 
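+
+    For instance, htmlparser_trace("<p>foo</p>") prints something
+    like:
+
+        p START
+        foo DATA
+        p END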
+ """ + + def _p(self, s): + print(s) + + def handle_starttag(self, name, attrs): + self._p("%s START" % name) + + def handle_endtag(self, name): + self._p("%s END" % name) + + def handle_data(self, data): + self._p("%s DATA" % data) + + def handle_charref(self, name): + self._p("%s CHARREF" % name) + + def handle_entityref(self, name): + self._p("%s ENTITYREF" % name) + + def handle_comment(self, data): + self._p("%s COMMENT" % data) + + def handle_decl(self, data): + self._p("%s DECL" % data) + + def unknown_decl(self, data): + self._p("%s UNKNOWN-DECL" % data) + + def handle_pi(self, data): + self._p("%s PI" % data) + +def htmlparser_trace(data): + """Print out the HTMLParser events that occur during parsing. + + This lets you see how HTMLParser parses a document when no + Beautiful Soup code is running. + + :param data: Some markup. + """ + parser = AnnouncingParser() + parser.feed(data) + +_vowels = "aeiou" +_consonants = "bcdfghjklmnpqrstvwxyz" + +def rword(length=5): + "Generate a random word-like string." + s = '' + for i in range(length): + if i % 2 == 0: + t = _consonants + else: + t = _vowels + s += random.choice(t) + return s + +def rsentence(length=4): + "Generate a random sentence-like string." + return " ".join(rword(random.randint(4,9)) for i in range(length)) + +def rdoc(num_elements=1000): + """Randomly generate an invalid HTML document.""" + tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] + elements = [] + for i in range(num_elements): + choice = random.randint(0,3) + if choice == 0: + # New tag. + tag_name = random.choice(tag_names) + elements.append("<%s>" % tag_name) + elif choice == 1: + elements.append(rsentence(random.randint(1,4))) + elif choice == 2: + # Close a tag. + tag_name = random.choice(tag_names) + elements.append("</%s>" % tag_name) + return "<html>" + "\n".join(elements) + "</html>" + +def benchmark_parsers(num_elements=100000): + """Very basic head-to-head performance benchmark.""" + print(("Comparative parser benchmark on Beautiful Soup %s" % __version__)) + data = rdoc(num_elements) + print(("Generated a large invalid HTML document (%d bytes)." % len(data))) + + for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: + success = False + try: + a = time.time() + soup = BeautifulSoup(data, parser) + b = time.time() + success = True + except Exception as e: + print(("%s could not parse the markup." % parser)) + traceback.print_exc() + if success: + print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a))) + + from lxml import etree + a = time.time() + etree.HTML(data) + b = time.time() + print(("Raw lxml parsed the markup in %.2fs." % (b-a))) + + import html5lib + parser = html5lib.HTMLParser() + a = time.time() + parser.parse(data) + b = time.time() + print(("Raw html5lib parsed the markup in %.2fs." % (b-a))) + +def profile(num_elements=100000, parser="lxml"): + """Use Python's profiler on a randomly generated document.""" + filehandle = tempfile.NamedTemporaryFile() + filename = filehandle.name + + data = rdoc(num_elements) + vars = dict(bs4=bs4, data=data, parser=parser) + cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) + + stats = pstats.Stats(filename) + # stats.strip_dirs() + stats.sort_stats("cumulative") + stats.print_stats('_html5lib|bs4', 50) + +# If this file is run as a script, standard input is diagnosed. 
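+# For example (assuming bs4 is on the import path):
+#
+#   python -m bs4.diagnose < suspicious.html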
+if __name__ == '__main__': + diagnose(sys.stdin.read()) diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/element.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/element.py new file mode 100644 index 00000000000..81d9db90f9a --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/element.py @@ -0,0 +1,2175 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable +import re +import sys +import warnings +try: + import soupsieve +except ImportError as e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' + ) + +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) + +DEFAULT_OUTPUT_ENCODING = "utf-8" +PY3K = (sys.version_info[0] > 2) + +nonwhitespace_re = re.compile(r"\S+") + +# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on +# the off chance someone imported it for their own use. +whitespace_re = re.compile(r"\s+") + +def _alias(attr): + """Alias one attribute name to another for backward compatibility""" + @property + def alias(self): + return getattr(self, attr) + + @alias.setter + def alias(self): + return setattr(self, attr) + return alias + + +# These encodings are recognized by Python (so PageElement.encode +# could theoretically support them) but XML and HTML don't recognize +# them (so they should not show up in an XML or HTML document as that +# document's encoding). +# +# If an XML document is encoded in one of these encodings, no encoding +# will be mentioned in the XML declaration. If an HTML document is +# encoded in one of these encodings, and the HTML document has a +# <meta> tag that mentions an encoding, the encoding will be given as +# the empty string. +# +# Source: +# https://docs.python.org/3/library/codecs.html#python-specific-encodings +PYTHON_SPECIFIC_ENCODINGS = set([ + "idna", + "mbcs", + "oem", + "palmos", + "punycode", + "raw_unicode_escape", + "undefined", + "unicode_escape", + "raw-unicode-escape", + "unicode-escape", + "string-escape", + "string_escape", +]) + + +class NamespacedAttribute(str): + """A namespaced string (e.g. 'xml:lang') that remembers the namespace + ('xml') and the name ('lang') that were used to create it. + """ + + def __new__(cls, prefix, name=None, namespace=None): + if not name: + # This is the default namespace. Its name "has no value" + # per https://www.w3.org/TR/xml-names/#defaulting + name = None + + if name is None: + obj = str.__new__(cls, prefix) + elif prefix is None: + # Not really namespaced. + obj = str.__new__(cls, name) + else: + obj = str.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + +class AttributeValueWithCharsetSubstitution(str): + """A stand-in object for a character encoding specified in HTML.""" + +class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'charset' attribute. + + When Beautiful Soup parses the markup '<meta charset="utf8">', the + value of the 'charset' attribute will be one of these objects. 
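+
+    The stand-in matters when re-encoding. As a sketch (assuming an
+    installed builder that performs charset substitution, such as the
+    default html.parser one):
+
+        soup = BeautifulSoup('<meta charset="utf8">', 'html.parser')
+        soup.meta.encode("latin-1")
+
+    should serialize the attribute as charset="latin-1" rather than
+    the original "utf8".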
+ """ + + def __new__(cls, original_value): + obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + """When an HTML document is being encoded to a given encoding, the + value of a meta tag's 'charset' is the name of the encoding. + """ + if encoding in PYTHON_SPECIFIC_ENCODINGS: + return '' + return encoding + + +class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'content' attribute. + + When Beautiful Soup parses the markup: + <meta http-equiv="content-type" content="text/html; charset=utf8"> + + The value of the 'content' attribute will be one of these objects. + """ + + CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. + return str.__new__(str, original_value) + + obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + if encoding in PYTHON_SPECIFIC_ENCODINGS: + return '' + def rewrite(match): + return match.group(1) + encoding + return self.CHARSET_RE.sub(rewrite, self.original_value) + + +class PageElement(object): + """Contains the navigational information for some part of the page: + that is, its current location in the parse tree. + + NavigableString, Tag, etc. are all subclasses of PageElement. + """ + + def setup(self, parent=None, previous_element=None, next_element=None, + previous_sibling=None, next_sibling=None): + """Sets up the initial relations between this element and + other elements. + + :param parent: The parent of this element. + + :param previous_element: The element parsed immediately before + this one. + + :param next_element: The element parsed immediately before + this one. + + :param previous_sibling: The most recently encountered element + on the same level of the parse tree as this one. + + :param previous_sibling: The next element to be encountered + on the same level of the parse tree as this one. + """ + self.parent = parent + + self.previous_element = previous_element + if previous_element is not None: + self.previous_element.next_element = self + + self.next_element = next_element + if self.next_element is not None: + self.next_element.previous_element = self + + self.next_sibling = next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self + + if (previous_sibling is None + and self.parent is not None and self.parent.contents): + previous_sibling = self.parent.contents[-1] + + self.previous_sibling = previous_sibling + if previous_sibling is not None: + self.previous_sibling.next_sibling = self + + def format_string(self, s, formatter): + """Format the given string using the given formatter. + + :param s: A string. + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ + if formatter is None: + return s + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + output = formatter.substitute(s) + return output + + def formatter_for_name(self, formatter): + """Look up or create a Formatter for the given identifier, + if necessary. + + :param formatter: Can be a Formatter object (used as-is), a + function (used as the entity substitution hook for an + XMLFormatter or HTMLFormatter), or a string (used to look + up an XMLFormatter or HTMLFormatter in the appropriate + registry. 
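+
+        For example, the string "minimal" (the default used when
+        serializing) is resolved through the registry, while a
+        callable such as `lambda s: s.upper()` is wrapped in a new
+        formatter that uses it as the entity substitution hook.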
+ """ + if isinstance(formatter, Formatter): + return formatter + if self._is_xml: + c = XMLFormatter + else: + c = HTMLFormatter + if isinstance(formatter, Callable): + return c(entity_substitution=formatter) + return c.REGISTRY[formatter] + + @property + def _is_xml(self): + """Is this element part of an XML tree or an HTML tree? + + This is used in formatter_for_name, when deciding whether an + XMLFormatter or HTMLFormatter is more appropriate. It can be + inefficient, but it should be called very rarely. + """ + if self.known_xml is not None: + # Most of the time we will have determined this when the + # document is parsed. + return self.known_xml + + # Otherwise, it's likely that this element was created by + # direct invocation of the constructor from within the user's + # Python code. + if self.parent is None: + # This is the top-level object. It should have .known_xml set + # from tree creation. If not, take a guess--BS is usually + # used on HTML markup. + return getattr(self, 'is_xml', False) + return self.parent._is_xml + + nextSibling = _alias("next_sibling") # BS3 + previousSibling = _alias("previous_sibling") # BS3 + + def replace_with(self, replace_with): + """Replace this PageElement with another one, keeping the rest of the + tree the same. + + :param replace_with: A PageElement. + :return: `self`, no longer part of the tree. + """ + if self.parent is None: + raise ValueError( + "Cannot replace one element with another when the " + "element to be replaced is not part of a tree.") + if replace_with is self: + return + if replace_with is self.parent: + raise ValueError("Cannot replace a Tag with its parent.") + old_parent = self.parent + my_index = self.parent.index(self) + self.extract(_self_index=my_index) + old_parent.insert(my_index, replace_with) + return self + replaceWith = replace_with # BS3 + + def unwrap(self): + """Replace this PageElement with its contents. + + :return: `self`, no longer part of the tree. + """ + my_parent = self.parent + if self.parent is None: + raise ValueError( + "Cannot replace an element with its contents when that" + "element is not part of a tree.") + my_index = self.parent.index(self) + self.extract(_self_index=my_index) + for child in reversed(self.contents[:]): + my_parent.insert(my_index, child) + return self + replace_with_children = unwrap + replaceWithChildren = unwrap # BS3 + + def wrap(self, wrap_inside): + """Wrap this PageElement inside another one. + + :param wrap_inside: A PageElement. + :return: `wrap_inside`, occupying the position in the tree that used + to be occupied by `self`, and with `self` inside it. + """ + me = self.replace_with(wrap_inside) + wrap_inside.append(me) + return wrap_inside + + def extract(self, _self_index=None): + """Destructively rips this element out of the tree. + + :param _self_index: The location of this element in its parent's + .contents, if known. Passing this in allows for a performance + optimization. + + :return: `self`, no longer part of the tree. + """ + if self.parent is not None: + if _self_index is None: + _self_index = self.parent.index(self) + del self.parent.contents[_self_index] + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. 
+ last_child = self._last_descendant() + next_element = last_child.next_element + + if (self.previous_element is not None and + self.previous_element is not next_element): + self.previous_element.next_element = next_element + if next_element is not None and next_element is not self.previous_element: + next_element.previous_element = self.previous_element + self.previous_element = None + last_child.next_element = None + + self.parent = None + if (self.previous_sibling is not None + and self.previous_sibling is not self.next_sibling): + self.previous_sibling.next_sibling = self.next_sibling + if (self.next_sibling is not None + and self.next_sibling is not self.previous_sibling): + self.next_sibling.previous_sibling = self.previous_sibling + self.previous_sibling = self.next_sibling = None + return self + + def _last_descendant(self, is_initialized=True, accept_self=True): + """Finds the last element beneath this object to be parsed. + + :param is_initialized: Has `setup` been called on this PageElement + yet? + :param accept_self: Is `self` an acceptable answer to the question? + """ + if is_initialized and self.next_sibling is not None: + last_child = self.next_sibling.previous_element + else: + last_child = self + while isinstance(last_child, Tag) and last_child.contents: + last_child = last_child.contents[-1] + if not accept_self and last_child is self: + last_child = None + return last_child + # BS3: Not part of the API! + _lastRecursiveChild = _last_descendant + + def insert(self, position, new_child): + """Insert a new PageElement in the list of this PageElement's children. + + This works the same way as `list.insert`. + + :param position: The numeric position that should be occupied + in `self.children` by the new PageElement. + :param new_child: A PageElement. + """ + if new_child is None: + raise ValueError("Cannot insert None into a tag.") + if new_child is self: + raise ValueError("Cannot insert a tag into itself.") + if (isinstance(new_child, str) + and not isinstance(new_child, NavigableString)): + new_child = NavigableString(new_child) + + from bs4 import BeautifulSoup + if isinstance(new_child, BeautifulSoup): + # We don't want to end up with a situation where one BeautifulSoup + # object contains another. Insert the children one at a time. + for subchild in list(new_child.contents): + self.insert(position, subchild) + position += 1 + return + position = min(position, len(self.contents)) + if hasattr(new_child, 'parent') and new_child.parent is not None: + # We're 'inserting' an element that's already one + # of this object's children. + if new_child.parent is self: + current_index = self.index(new_child) + if current_index < position: + # We're moving this element further down the list + # of this object's children. That means that when + # we extract this element, our target index will + # jump down one. 
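+                    # (Illustrative annotation, not upstream: with
+                    # contents [a, b, c], insert(2, a) extracts `a`
+                    # first, leaving [b, c]; decrementing the target
+                    # index to 1 puts `a` back between `b` and `c`,
+                    # which is where original index 2 pointed.)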
+ position -= 1 + new_child.extract() + + new_child.parent = self + previous_child = None + if position == 0: + new_child.previous_sibling = None + new_child.previous_element = self + else: + previous_child = self.contents[position - 1] + new_child.previous_sibling = previous_child + new_child.previous_sibling.next_sibling = new_child + new_child.previous_element = previous_child._last_descendant(False) + if new_child.previous_element is not None: + new_child.previous_element.next_element = new_child + + new_childs_last_element = new_child._last_descendant(False) + + if position >= len(self.contents): + new_child.next_sibling = None + + parent = self + parents_next_sibling = None + while parents_next_sibling is None and parent is not None: + parents_next_sibling = parent.next_sibling + parent = parent.parent + if parents_next_sibling is not None: + # We found the element that comes next in the document. + break + if parents_next_sibling is not None: + new_childs_last_element.next_element = parents_next_sibling + else: + # The last element of this tag is the last element in + # the document. + new_childs_last_element.next_element = None + else: + next_child = self.contents[position] + new_child.next_sibling = next_child + if new_child.next_sibling is not None: + new_child.next_sibling.previous_sibling = new_child + new_childs_last_element.next_element = next_child + + if new_childs_last_element.next_element is not None: + new_childs_last_element.next_element.previous_element = new_childs_last_element + self.contents.insert(position, new_child) + + def append(self, tag): + """Appends the given PageElement to the contents of this one. + + :param tag: A PageElement. + """ + self.insert(len(self.contents), tag) + + def extend(self, tags): + """Appends the given PageElements to this one's contents. + + :param tags: A list of PageElements. + """ + if isinstance(tags, Tag): + # Calling self.append() on another tag's contents will change + # the list we're iterating over. Make a list that won't + # change. + tags = list(tags.contents) + for tag in tags: + self.append(tag) + + def insert_before(self, *args): + """Makes the given element(s) the immediate predecessor of this one. + + All the elements will have the same parent, and the given elements + will be immediately before this one. + + :param args: One or more PageElements. + """ + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'before' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element before itself.") + for predecessor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, *args): + """Makes the given element(s) the immediate successor of this one. + + The elements will have the same parent, and the given elements + will be immediately after this one. + + :param args: One or more PageElements. + """ + # Do all error checking before modifying the tree. + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'after' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element after itself.") + + offset = 0 + for successor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. 
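+            # (Illustrative annotation, not upstream: if `successor` is
+            # currently a sibling sitting before `self`, extracting it
+            # shifts `self` one slot to the left; parent.index(self) is
+            # therefore recomputed after the extract, keeping the
+            # insertion point correct.)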
+ if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1+offset, successor) + offset += 1 + + def find_next(self, name=None, attrs={}, text=None, **kwargs): + """Find the first PageElement that matches the given criteria and + appears later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one(self.find_all_next, name, attrs, text, **kwargs) + findNext = find_next # BS3 + + def find_all_next(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Find all PageElements that match the given criteria and appear + later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet containing PageElements. + """ + return self._find_all(name, attrs, text, limit, self.next_elements, + **kwargs) + findAllNext = find_all_next # BS3 + + def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Find the closest sibling to this PageElement that matches the + given criteria and appears later in the document. + + All find_* methods take a common set of arguments. See the + online documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one(self.find_next_siblings, name, attrs, text, + **kwargs) + findNextSibling = find_next_sibling # BS3 + + def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Find all siblings of this PageElement that match the given criteria + and appear later in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ + return self._find_all(name, attrs, text, limit, + self.next_siblings, **kwargs) + findNextSiblings = find_next_siblings # BS3 + fetchNextSiblings = find_next_siblings # BS2 + + def find_previous(self, name=None, attrs={}, text=None, **kwargs): + """Look backwards in the document from this PageElement and find the + first PageElement that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. 
+ + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one( + self.find_all_previous, name, attrs, text, **kwargs) + findPrevious = find_previous # BS3 + + def find_all_previous(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Look backwards in the document from this PageElement and find all + PageElements that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ + return self._find_all(name, attrs, text, limit, self.previous_elements, + **kwargs) + findAllPrevious = find_all_previous # BS3 + fetchPrevious = find_all_previous # BS2 + + def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this PageElement that matches the + given criteria and appears earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one(self.find_previous_siblings, name, attrs, text, + **kwargs) + findPreviousSibling = find_previous_sibling # BS3 + + def find_previous_siblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns all siblings to this PageElement that match the + given criteria and appear earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ + return self._find_all(name, attrs, text, limit, + self.previous_siblings, **kwargs) + findPreviousSiblings = find_previous_siblings # BS3 + fetchPreviousSiblings = find_previous_siblings # BS2 + + def find_parent(self, name=None, attrs={}, **kwargs): + """Find the closest parent of this PageElement that matches the given + criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :kwargs: A dictionary of filters on attribute values. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + # NOTE: We can't use _find_one because findParents takes a different + # set of arguments. 
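+        # (Annotation, not upstream: find_parents() takes no `text`
+        # argument -- a parent is always a Tag, never a string -- so its
+        # signature doesn't line up with what _find_one expects.)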
+        r = None
+        l = self.find_parents(name, attrs, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+    findParent = find_parent # BS3
+
+    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
+        """Find all parents of this PageElement that match the given criteria.
+
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param limit: Stop looking after finding this many results.
+        :kwargs: A dictionary of filters on attribute values.
+
+        :return: A ResultSet of PageElements.
+        :rtype: bs4.element.ResultSet
+        """
+        return self._find_all(name, attrs, None, limit, self.parents,
+                              **kwargs)
+    findParents = find_parents # BS3
+    fetchParents = find_parents # BS2
+
+    @property
+    def next(self):
+        """The PageElement, if any, that was parsed just after this one.
+
+        :return: A PageElement.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
+        return self.next_element
+
+    @property
+    def previous(self):
+        """The PageElement, if any, that was parsed just before this one.
+
+        :return: A PageElement.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
+        return self.previous_element
+
+    # These methods do the real heavy lifting.
+
+    def _find_one(self, method, name, attrs, text, **kwargs):
+        r = None
+        l = method(name, attrs, text, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+
+    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
+        "Iterates over a generator looking for things that match."
+
+        if text is None and 'string' in kwargs:
+            text = kwargs['string']
+            del kwargs['string']
+
+        if isinstance(name, SoupStrainer):
+            strainer = name
+        else:
+            strainer = SoupStrainer(name, attrs, text, **kwargs)
+
+        if text is None and not limit and not attrs and not kwargs:
+            if name is True or name is None:
+                # Optimization to find all tags.
+                result = (element for element in generator
+                          if isinstance(element, Tag))
+                return ResultSet(strainer, result)
+            elif isinstance(name, str):
+                # Optimization to find all tags with a given name.
+                if name.count(':') == 1:
+                    # This is a name with a prefix. If this is a namespace-aware document,
+                    # we need to match the local name against tag.name. If not,
+                    # we need to match the fully-qualified name against tag.name.
+                    prefix, local_name = name.split(':', 1)
+                else:
+                    prefix = None
+                    local_name = name
+                # Note the grouping: only Tags can match, and a Tag matches
+                # if its fully-qualified name matches, or if its local name
+                # and prefix both match.
+                result = (element for element in generator
+                          if isinstance(element, Tag)
+                          and (
+                              element.name == name
+                              or (
+                                  element.name == local_name
+                                  and (prefix is None or element.prefix == prefix)
+                              )
+                          )
+                )
+                return ResultSet(strainer, result)
+        results = ResultSet(strainer)
+        while True:
+            try:
+                i = next(generator)
+            except StopIteration:
+                break
+            if i:
+                found = strainer.search(i)
+                if found:
+                    results.append(found)
+                    if limit and len(results) >= limit:
+                        break
+        return results
+
+    # These generators can be used to navigate starting from both
+    # NavigableStrings and Tags.
+    @property
+    def next_elements(self):
+        """All PageElements that were parsed after this one.
+
+        :yield: A sequence of PageElements.
+        """
+        i = self.next_element
+        while i is not None:
+            yield i
+            i = i.next_element
+
+    @property
+    def next_siblings(self):
+        """All PageElements that are siblings of this one but were parsed
+        later.
+
+        :yield: A sequence of PageElements.
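+
+        For example (illustrative, not part of the vendored file):
+
+            >>> from bs4 import BeautifulSoup
+            >>> soup = BeautifulSoup('<a>1</a><b>2</b><c>3</c>', 'html.parser')
+            >>> [tag.name for tag in soup.a.next_siblings]
+            ['b', 'c']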
+ """ + i = self.next_sibling + while i is not None: + yield i + i = i.next_sibling + + @property + def previous_elements(self): + """All PageElements that were parsed before this one. + + :yield: A sequence of PageElements. + """ + i = self.previous_element + while i is not None: + yield i + i = i.previous_element + + @property + def previous_siblings(self): + """All PageElements that are siblings of this one but were parsed + earlier. + + :yield: A sequence of PageElements. + """ + i = self.previous_sibling + while i is not None: + yield i + i = i.previous_sibling + + @property + def parents(self): + """All PageElements that are parents of this PageElement. + + :yield: A sequence of PageElements. + """ + i = self.parent + while i is not None: + yield i + i = i.parent + + @property + def decomposed(self): + """Check whether a PageElement has been decomposed. + + :rtype: bool + """ + return getattr(self, '_decomposed', False) or False + + # Old non-property versions of the generators, for backwards + # compatibility with BS3. + def nextGenerator(self): + return self.next_elements + + def nextSiblingGenerator(self): + return self.next_siblings + + def previousGenerator(self): + return self.previous_elements + + def previousSiblingGenerator(self): + return self.previous_siblings + + def parentGenerator(self): + return self.parents + + +class NavigableString(str, PageElement): + """A Python Unicode string that is part of a parse tree. + + When Beautiful Soup parses the markup <b>penguin</b>, it will + create a NavigableString for the string "penguin". + """ + + PREFIX = '' + SUFFIX = '' + + # We can't tell just by looking at a string whether it's contained + # in an XML document or an HTML document. + + known_xml = None + + def __new__(cls, value): + """Create a new NavigableString. + + When unpickling a NavigableString, this method is called with + the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be + passed in to the superclass's __new__ or the superclass won't know + how to handle non-ASCII characters. + """ + if isinstance(value, str): + u = str.__new__(cls, value) + else: + u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u.setup() + return u + + def __copy__(self): + """A copy of a NavigableString has the same contents and class + as the original, but it is not connected to the parse tree. + """ + return type(self)(self) + + def __getnewargs__(self): + return (str(self),) + + def __getattr__(self, attr): + """text.string gives you text. This is for backwards + compatibility for Navigable*String, but for CData* it lets you + get the string without the CData wrapper.""" + if attr == 'string': + return self + else: + raise AttributeError( + "'%s' object has no attribute '%s'" % ( + self.__class__.__name__, attr)) + + def output_ready(self, formatter="minimal"): + """Run the string through the provided formatter. + + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ + output = self.format_string(self, formatter) + return self.PREFIX + output + self.SUFFIX + + @property + def name(self): + """Since a NavigableString is not a Tag, it has no .name. 
+
+        This property is implemented so that code like this doesn't crash
+        when run on a mixture of Tag and NavigableString objects:
+            [x.name for x in tag.children]
+        """
+        return None
+
+    @name.setter
+    def name(self, name):
+        """Prevent NavigableString.name from ever being set."""
+        raise AttributeError("A NavigableString cannot be given a name.")
+
+
+class PreformattedString(NavigableString):
+    """A NavigableString not subject to the normal formatting rules.
+
+    This is an abstract class used for special kinds of strings such
+    as comments (the Comment class) and CDATA blocks (the CData
+    class).
+    """
+
+    PREFIX = ''
+    SUFFIX = ''
+
+    def output_ready(self, formatter=None):
+        """Make this string ready for output by adding any subclass-specific
+        prefix or suffix.
+
+        :param formatter: A Formatter object, or a string naming one
+            of the standard formatters. The string will be passed into the
+            Formatter, but only to trigger any side effects: the return
+            value is ignored.
+
+        :return: The string, with any subclass-specific prefix and
+           suffix added on.
+        """
+        if formatter is not None:
+            ignore = self.format_string(self, formatter)
+        return self.PREFIX + self + self.SUFFIX
+
+class CData(PreformattedString):
+    """A CDATA block."""
+    PREFIX = '<![CDATA['
+    SUFFIX = ']]>'
+
+class ProcessingInstruction(PreformattedString):
+    """An SGML processing instruction."""
+
+    PREFIX = '<?'
+    SUFFIX = '>'
+
+class XMLProcessingInstruction(ProcessingInstruction):
+    """An XML processing instruction."""
+    PREFIX = '<?'
+    SUFFIX = '?>'
+
+class Comment(PreformattedString):
+    """An HTML or XML comment."""
+    PREFIX = '<!--'
+    SUFFIX = '-->'
+
+
+class Declaration(PreformattedString):
+    """An XML declaration."""
+    PREFIX = '<?'
+    SUFFIX = '?>'
+
+
+class Doctype(PreformattedString):
+    """A document type declaration."""
+    @classmethod
+    def for_name_and_ids(cls, name, pub_id, system_id):
+        """Generate an appropriate document type declaration for a given
+        public ID and system ID.
+
+        :param name: The name of the document's root element, e.g. 'html'.
+        :param pub_id: The Formal Public Identifier for this document type,
+            e.g. '-//W3C//DTD XHTML 1.1//EN'
+        :param system_id: The system identifier for this document type,
+            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
+
+        :return: A Doctype.
+        """
+        value = name or ''
+        if pub_id is not None:
+            value += ' PUBLIC "%s"' % pub_id
+            if system_id is not None:
+                value += ' "%s"' % system_id
+        elif system_id is not None:
+            value += ' SYSTEM "%s"' % system_id
+
+        return Doctype(value)
+
+    PREFIX = '<!DOCTYPE '
+    SUFFIX = '>\n'
+
+
+class Stylesheet(NavigableString):
+    """A NavigableString representing a stylesheet (probably
+    CSS).
+
+    Used to distinguish embedded stylesheets from textual content.
+    """
+    pass
+
+
+class Script(NavigableString):
+    """A NavigableString representing an executable script (probably
+    JavaScript).
+
+    Used to distinguish executable code from textual content.
+    """
+    pass
+
+
+class TemplateString(NavigableString):
+    """A NavigableString representing a string found inside an HTML
+    template embedded in a larger document.
+
+    Used to distinguish such strings from the main body of the document.
+    """
+    pass
+
+
+class Tag(PageElement):
+    """Represents an HTML or XML tag that is part of a parse tree, along
+    with its attributes and contents.
+
+    When Beautiful Soup parses the markup <b>penguin</b>, it will
+    create a Tag object representing the <b> tag.
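+
+    For example (illustrative, not part of the vendored file):
+
+        >>> from bs4 import BeautifulSoup
+        >>> soup = BeautifulSoup('<b class="x">penguin</b>', 'html.parser')
+        >>> soup.b.name, soup.b['class'], soup.b.string
+        ('b', ['x'], 'penguin')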
+ """ + + def __init__(self, parser=None, builder=None, name=None, namespace=None, + prefix=None, attrs=None, parent=None, previous=None, + is_xml=None, sourceline=None, sourcepos=None, + can_be_empty_element=None, cdata_list_attributes=None, + preserve_whitespace_tags=None + ): + """Basic constructor. + + :param parser: A BeautifulSoup object. + :param builder: A TreeBuilder. + :param name: The name of the tag. + :param namespace: The URI of this Tag's XML namespace, if any. + :param prefix: The prefix for this Tag's XML namespace, if any. + :param attrs: A dictionary of this Tag's attribute values. + :param parent: The PageElement to use as this Tag's parent. + :param previous: The PageElement that was parsed immediately before + this tag. + :param is_xml: If True, this is an XML tag. Otherwise, this is an + HTML tag. + :param sourceline: The line number where this tag was found in its + source document. + :param sourcepos: The character position within `sourceline` where this + tag was found. + :param can_be_empty_element: If True, this tag should be + represented as <tag/>. If False, this tag should be represented + as <tag></tag>. + :param cdata_list_attributes: A list of attributes whose values should + be treated as CDATA if they ever show up on this tag. + :param preserve_whitespace_tags: A list of tag names whose contents + should have their whitespace preserved. + """ + if parser is None: + self.parser_class = None + else: + # We don't actually store the parser object: that lets extracted + # chunks be garbage-collected. + self.parser_class = parser.__class__ + if name is None: + raise ValueError("No value provided for new tag's name.") + self.name = name + self.namespace = namespace + self.prefix = prefix + if ((not builder or builder.store_line_numbers) + and (sourceline is not None or sourcepos is not None)): + self.sourceline = sourceline + self.sourcepos = sourcepos + if attrs is None: + attrs = {} + elif attrs: + if builder is not None and builder.cdata_list_attributes: + attrs = builder._replace_cdata_list_attribute_values( + self.name, attrs) + else: + attrs = dict(attrs) + else: + attrs = dict(attrs) + + # If possible, determine ahead of time whether this tag is an + # XML tag. + if builder: + self.known_xml = builder.is_xml + else: + self.known_xml = is_xml + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + if builder is None: + # In the absence of a TreeBuilder, use whatever values were + # passed in here. They're probably None, unless this is a copy of some + # other tag. + self.can_be_empty_element = can_be_empty_element + self.cdata_list_attributes = cdata_list_attributes + self.preserve_whitespace_tags = preserve_whitespace_tags + else: + # Set up any substitutions for this tag, such as the charset in a META tag. + builder.set_up_substitutions(self) + + # Ask the TreeBuilder whether this tag might be an empty-element tag. + self.can_be_empty_element = builder.can_be_empty_element(name) + + # Keep track of the list of attributes of this tag that + # might need to be treated as a list. + # + # For performance reasons, we store the whole data structure + # rather than asking the question of every tag. Asking would + # require building a new data structure every time, and + # (unlike can_be_empty_element), we almost never need + # to check this. + self.cdata_list_attributes = builder.cdata_list_attributes + + # Keep track of the names that might cause this tag to be treated as a + # whitespace-preserved tag. 
+ self.preserve_whitespace_tags = builder.preserve_whitespace_tags + + parserClass = _alias("parser_class") # BS3 + + def __copy__(self): + """A copy of a Tag is a new Tag, unconnected to the parse tree. + Its contents are a copy of the old Tag's contents. + """ + clone = type(self)( + None, self.builder, self.name, self.namespace, + self.prefix, self.attrs, is_xml=self._is_xml, + sourceline=self.sourceline, sourcepos=self.sourcepos, + can_be_empty_element=self.can_be_empty_element, + cdata_list_attributes=self.cdata_list_attributes, + preserve_whitespace_tags=self.preserve_whitespace_tags + ) + for attr in ('can_be_empty_element', 'hidden'): + setattr(clone, attr, getattr(self, attr)) + for child in self.contents: + clone.append(child.__copy__()) + return clone + + @property + def is_empty_element(self): + """Is this tag an empty-element tag? (aka a self-closing tag) + + A tag that has contents is never an empty-element tag. + + A tag that has no contents may or may not be an empty-element + tag. It depends on the builder used to create the tag. If the + builder has a designated list of empty-element tags, then only + a tag whose name shows up in that list is considered an + empty-element tag. + + If the builder has no designated list of empty-element tags, + then any tag with no contents is an empty-element tag. + """ + return len(self.contents) == 0 and self.can_be_empty_element + isSelfClosing = is_empty_element # BS3 + + @property + def string(self): + """Convenience property to get the single string within this + PageElement. + + TODO It might make sense to have NavigableString.string return + itself. + + :return: If this element has a single string child, return + value is that string. If this element has one child tag, + return value is the 'string' attribute of the child tag, + recursively. If this element is itself a string, has no + children, or has more than one child, return value is None. + """ + if len(self.contents) != 1: + return None + child = self.contents[0] + if isinstance(child, NavigableString): + return child + return child.string + + @string.setter + def string(self, string): + """Replace this PageElement's contents with `string`.""" + self.clear() + self.append(string.__class__(string)) + + def _all_strings(self, strip=False, types=(NavigableString, CData)): + """Yield all strings of certain classes, possibly stripping them. + + :param strip: If True, all strings will be stripped before being + yielded. + + :types: A tuple of NavigableString subclasses. Any strings of + a subclass not found in this list will be ignored. By + default, this means only NavigableString and CData objects + will be considered. So no comments, processing instructions, + etc. + + :yield: A sequence of strings. + """ + for descendant in self.descendants: + if ( + (types is None and not isinstance(descendant, NavigableString)) + or + (types is not None and type(descendant) not in types)): + continue + if strip: + descendant = descendant.strip() + if len(descendant) == 0: + continue + yield descendant + + strings = property(_all_strings) + + @property + def stripped_strings(self): + """Yield all strings in the document, stripping them first. + + :yield: A sequence of stripped strings. + """ + for string in self._all_strings(True): + yield string + + def get_text(self, separator="", strip=False, + types=(NavigableString, CData)): + """Get all child strings, concatenated using the given separator. + + :param separator: Strings will be concatenated using this separator. 
+ + :param strip: If True, strings will be stripped before being + concatenated. + + :types: A tuple of NavigableString subclasses. Any strings of + a subclass not found in this list will be ignored. By + default, this means only NavigableString and CData objects + will be considered. So no comments, processing instructions, + stylesheets, etc. + + :return: A string. + """ + return separator.join([s for s in self._all_strings( + strip, types=types)]) + getText = get_text + text = property(get_text) + + def decompose(self): + """Recursively destroys this PageElement and its children. + + This element will be removed from the tree and wiped out; so + will everything beneath it. + + The behavior of a decomposed PageElement is undefined and you + should never use one for anything, but if you need to _check_ + whether an element has been decomposed, you can use the + `decomposed` property. + """ + self.extract() + i = self + while i is not None: + n = i.next_element + i.__dict__.clear() + i.contents = [] + i._decomposed = True + i = n + + def clear(self, decompose=False): + """Wipe out all children of this PageElement by calling extract() + on them. + + :param decompose: If this is True, decompose() (a more + destructive method) will be called instead of extract(). + """ + if decompose: + for element in self.contents[:]: + if isinstance(element, Tag): + element.decompose() + else: + element.extract() + else: + for element in self.contents[:]: + element.extract() + + def smooth(self): + """Smooth out this element's children by consolidating consecutive + strings. + + This makes pretty-printed output look more natural following a + lot of operations that modified the tree. + """ + # Mark the first position of every pair of children that need + # to be consolidated. Do this rather than making a copy of + # self.contents, since in most cases very few strings will be + # affected. + marked = [] + for i, a in enumerate(self.contents): + if isinstance(a, Tag): + # Recursively smooth children. + a.smooth() + if i == len(self.contents)-1: + # This is the last item in .contents, and it's not a + # tag. There's no chance it needs any work. + continue + b = self.contents[i+1] + if (isinstance(a, NavigableString) + and isinstance(b, NavigableString) + and not isinstance(a, PreformattedString) + and not isinstance(b, PreformattedString) + ): + marked.append(i) + + # Go over the marked positions in reverse order, so that + # removing items from .contents won't affect the remaining + # positions. + for i in reversed(marked): + a = self.contents[i] + b = self.contents[i+1] + b.extract() + n = NavigableString(a+b) + a.replace_with(n) + + def index(self, element): + """Find the index of a child by identity, not value. + + Avoids issues with tag.contents.index(element) getting the + index of equal elements. + + :param element: Look for this PageElement in `self.contents`. + """ + for i, child in enumerate(self.contents): + if child is element: + return i + raise ValueError("Tag.index: element not in tag") + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self.attrs.get(key, default) + + def get_attribute_list(self, key, default=None): + """The same as get(), but always returns a list. + + :param key: The attribute to look for. + :param default: Use this value if the attribute is not present + on this PageElement. + :return: A list of values, probably containing only a single + value. 
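+
+        For example (illustrative, not part of the vendored file;
+        assumes `from bs4 import BeautifulSoup`):
+
+            >>> soup = BeautifulSoup('<p id="a" class="b c"></p>', 'html.parser')
+            >>> soup.p.get_attribute_list('id')
+            ['a']
+            >>> soup.p.get_attribute_list('class')
+            ['b', 'c']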
+ """ + value = self.get(key, default) + if not isinstance(value, list): + value = [value] + return value + + def has_attr(self, key): + """Does this PageElement have an attribute with the given name?""" + return key in self.attrs + + def __hash__(self): + return str(self).__hash__() + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the Tag, + and throws an exception if it's not there.""" + return self.attrs[key] + + def __iter__(self): + "Iterating over a Tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a Tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __bool__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self.attrs[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + self.attrs.pop(key, None) + + def __call__(self, *args, **kwargs): + """Calling a Tag like a function is the same as calling its + find_all() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return self.find_all(*args, **kwargs) + + def __getattr__(self, tag): + """Calling tag.subtag is the same as calling tag.find(name="subtag")""" + #print("Getattr %s.%s" % (self.__class__, tag)) + if len(tag) > 3 and tag.endswith('Tag'): + # BS3: soup.aTag -> "soup.find("a") + tag_name = tag[:-3] + warnings.warn( + '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( + name=tag_name + ) + ) + return self.find(tag_name) + # We special case contents to avoid recursion. + elif not tag.startswith("__") and not tag == "contents": + return self.find(tag) + raise AttributeError( + "'%s' object has no attribute '%s'" % (self.__class__, tag)) + + def __eq__(self, other): + """Returns true iff this Tag has the same name, the same attributes, + and the same contents (recursively) as `other`.""" + if self is other: + return True + if (not hasattr(other, 'name') or + not hasattr(other, 'attrs') or + not hasattr(other, 'contents') or + self.name != other.name or + self.attrs != other.attrs or + len(self) != len(other)): + return False + for i, my_child in enumerate(self.contents): + if my_child != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this Tag is not identical to `other`, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding="unicode-escape"): + """Renders this PageElement as a string. + + :param encoding: The encoding to use (Python 2 only). + :return: Under Python 2, a bytestring; under Python 3, + a Unicode string. + """ + if PY3K: + # "The return value must be a string object", i.e. Unicode + return self.decode() + else: + # "The return value must be a string object", i.e. a bytestring. + # By convention, the return value of __repr__ should also be + # an ASCII string. + return self.encode(encoding) + + def __unicode__(self): + """Renders this PageElement as a Unicode string.""" + return self.decode() + + def __str__(self): + """Renders this PageElement as a generic string. + + :return: Under Python 2, a UTF-8 bytestring; under Python 3, + a Unicode string. 
+ """ + if PY3K: + return self.decode() + else: + return self.encode() + + if PY3K: + __str__ = __repr__ = __unicode__ + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, + indent_level=None, formatter="minimal", + errors="xmlcharrefreplace"): + """Render a bytestring representation of this PageElement and its + contents. + + :param encoding: The destination encoding. + :param indent_level: Each line of the rendering will be + indented this many spaces. Used internally in + recursive calls while pretty-printing. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + :param errors: An error handling strategy such as + 'xmlcharrefreplace'. This value is passed along into + encode() and its value should be one of the constants + defined by Python. + :return: A bytestring. + + """ + # Turn the data structure into Unicode, then encode the + # Unicode. + u = self.decode(indent_level, encoding, formatter) + return u.encode(encoding, errors) + + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Render a Unicode representation of this PageElement and its + contents. + + :param indent_level: Each line of the rendering will be + indented this many spaces. Used internally in + recursive calls while pretty-printing. + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + """ + + # First off, turn a non-Formatter `formatter` into a Formatter + # object. This will stop the lookup from happening over and + # over again. + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + attributes = formatter.attributes(self) + attrs = [] + for key, val in attributes: + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, str): + val = str(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None + ): + val = val.encode(eventual_encoding) + + text = formatter.attribute_value(val) + decoded = ( + str(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) + close = '' + closeTag = '' + + prefix = '' + if self.prefix: + prefix = self.prefix + ":" + + if self.is_empty_element: + close = formatter.void_element_close_prefix or '' + else: + closeTag = '</%s%s>' % (prefix, self.name) + + pretty_print = self._should_pretty_print(indent_level) + space = '' + indent_space = '' + if indent_level is not None: + indent_space = (' ' * (indent_level - 1)) + if pretty_print: + space = indent_space + indent_contents = indent_level + 1 + else: + indent_contents = None + contents = self.decode_contents( + indent_contents, eventual_encoding, formatter + ) + + if self.hidden: + # This is the 'document root' object. + s = contents + else: + s = [] + attribute_string = '' + if attrs: + attribute_string = ' ' + ' '.join(attrs) + if indent_level is not None: + # Even if this particular tag is not pretty-printed, + # we should indent up to the start of the tag. 
+ s.append(indent_space) + s.append('<%s%s%s%s>' % ( + prefix, self.name, attribute_string, close)) + if pretty_print: + s.append("\n") + s.append(contents) + if pretty_print and contents and contents[-1] != "\n": + s.append("\n") + if pretty_print and closeTag: + s.append(space) + s.append(closeTag) + if indent_level is not None and closeTag and self.next_sibling: + # Even if this particular tag is not pretty-printed, + # we're now done with the tag, and we should add a + # newline if appropriate. + s.append("\n") + s = ''.join(s) + return s + + def _should_pretty_print(self, indent_level): + """Should this tag be pretty-printed? + + Most of them should, but some (such as <pre> in HTML + documents) should not. + """ + return ( + indent_level is not None + and ( + not self.preserve_whitespace_tags + or self.name not in self.preserve_whitespace_tags + ) + ) + + def prettify(self, encoding=None, formatter="minimal"): + """Pretty-print this PageElement as a string. + + :param encoding: The eventual encoding of the string. If this is None, + a Unicode string will be returned. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + :return: A Unicode string (if encoding==None) or a bytestring + (otherwise). + """ + if encoding is None: + return self.decode(True, formatter=formatter) + else: + return self.encode(encoding, True, formatter=formatter) + + def decode_contents(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a Unicode string. + + :param indent_level: Each line of the rendering will be + indented this many spaces. Used internally in + recursive calls while pretty-printing. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. decode_contents() is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + + :param formatter: A Formatter object, or a string naming one of + the standard Formatters. + """ + # First off, turn a string formatter into a Formatter object. This + # will stop the lookup from happening over and over again. + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + + pretty_print = (indent_level is not None) + s = [] + for c in self: + text = None + if isinstance(c, NavigableString): + text = c.output_ready(formatter) + elif isinstance(c, Tag): + s.append(c.decode(indent_level, eventual_encoding, + formatter)) + preserve_whitespace = ( + self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags + ) + if text and indent_level and not preserve_whitespace: + text = text.strip() + if text: + if pretty_print and not preserve_whitespace: + s.append(" " * (indent_level - 1)) + s.append(text) + if pretty_print and not preserve_whitespace: + s.append("\n") + return ''.join(s) + + def encode_contents( + self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this PageElement as a bytestring. + + :param indent_level: Each line of the rendering will be + indented this many spaces. Used internally in + recursive calls while pretty-printing. + + :param eventual_encoding: The bytestring will be in this encoding. + + :param formatter: A Formatter object, or a string naming one of + the standard Formatters. + + :return: A bytestring. 
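+
+        For example (illustrative, not part of the vendored file;
+        assumes `from bs4 import BeautifulSoup`):
+
+            >>> soup = BeautifulSoup('<p>one<b>two</b></p>', 'html.parser')
+            >>> soup.p.encode_contents()
+            b'one<b>two</b>'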
+ """ + contents = self.decode_contents(indent_level, encoding, formatter) + return contents.encode(encoding) + + # Old method for BS3 compatibility + def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + """Deprecated method for BS3 compatibility.""" + if not prettyPrint: + indentLevel = None + return self.encode_contents( + indent_level=indentLevel, encoding=encoding) + + #Soup methods + + def find(self, name=None, attrs={}, recursive=True, text=None, + **kwargs): + """Look in the children of this PageElement and find the first + PageElement that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param recursive: If this is True, find() will perform a + recursive search of this PageElement's children. Otherwise, + only the direct children will be considered. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + r = None + l = self.find_all(name, attrs, recursive, text, 1, **kwargs) + if l: + r = l[0] + return r + findChild = find #BS2 + + def find_all(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): + """Look in the children of this PageElement and find all + PageElements that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param recursive: If this is True, find_all() will perform a + recursive search of this PageElement's children. Otherwise, + only the direct children will be considered. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ + generator = self.descendants + if not recursive: + generator = self.children + return self._find_all(name, attrs, text, limit, generator, **kwargs) + findAll = find_all # BS3 + findChildren = find_all # BS2 + + #Generator methods + @property + def children(self): + """Iterate over all direct children of this PageElement. + + :yield: A sequence of PageElements. + """ + # return iter() to make the purpose of the method clear + return iter(self.contents) # XXX This seems to be untested. + + @property + def descendants(self): + """Iterate over all children of this PageElement in a + breadth-first sequence. + + :yield: A sequence of PageElements. + """ + if not len(self.contents): + return + stopNode = self._last_descendant().next_element + current = self.contents[0] + while current is not stopNode: + yield current + current = current.next_element + + # CSS selector code + def select_one(self, selector, namespaces=None, **kwargs): + """Perform a CSS selection operation on the current element. + + :param selector: A CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.select() method. + + :return: A Tag. 
+ :rtype: bs4.element.Tag + """ + value = self.select(selector, namespaces, 1, **kwargs) + if value: + return value[0] + return None + + def select(self, selector, namespaces=None, limit=None, **kwargs): + """Perform a CSS selection operation on the current element. + + This uses the SoupSieve library. + + :param selector: A string containing a CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. + + :param limit: After finding this number of results, stop looking. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.select() method. + + :return: A ResultSet of Tags. + :rtype: bs4.element.ResultSet + """ + if namespaces is None: + namespaces = self._namespaces + + if limit is None: + limit = 0 + if soupsieve is None: + raise NotImplementedError( + "Cannot execute CSS selectors because the soupsieve package is not installed." + ) + + results = soupsieve.select(selector, self, namespaces, limit, **kwargs) + + # We do this because it's more consistent and because + # ResultSet.__getattr__ has a helpful error message. + return ResultSet(None, results) + + # Old names for backwards compatibility + def childGenerator(self): + """Deprecated generator.""" + return self.children + + def recursiveChildGenerator(self): + """Deprecated generator.""" + return self.descendants + + def has_key(self, key): + """Deprecated method. This was kind of misleading because has_key() + (attributes) was different from __in__ (contents). + + has_key() is gone in Python 3, anyway. + """ + warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( + key)) + return self.has_attr(key) + +# Next, a couple classes to represent queries and their results. +class SoupStrainer(object): + """Encapsulates a number of ways of matching a markup element (tag or + string). + + This is primarily used to underpin the find_* methods, but you can + create one yourself and pass it in as `parse_only` to the + `BeautifulSoup` constructor, to parse a subset of a large + document. + """ + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + """Constructor. + + The SoupStrainer constructor takes the same arguments passed + into the find_* methods. See the online documentation for + detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + """ + self.name = self._normalize_search_value(name) + if not isinstance(attrs, dict): + # Treat a non-dict value for attrs as a search for the 'class' + # attribute. + kwargs['class'] = attrs + attrs = None + + if 'class_' in kwargs: + # Treat class_="foo" as a search for the 'class' + # attribute, overriding any non-dict value for attrs. + kwargs['class'] = kwargs['class_'] + del kwargs['class_'] + + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + normalized_attrs = {} + for key, value in list(attrs.items()): + normalized_attrs[key] = self._normalize_search_value(value) + + self.attrs = normalized_attrs + self.text = self._normalize_search_value(text) + + def _normalize_search_value(self, value): + # Leave it alone if it's a Unicode string, a callable, a + # regular expression, a boolean, or None. 
+ if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') + or isinstance(value, bool) or value is None): + return value + + # If it's a bytestring, convert it to Unicode, treating it as UTF-8. + if isinstance(value, bytes): + return value.decode("utf8") + + # If it's listlike, convert it into a list of strings. + if hasattr(value, '__iter__'): + new_value = [] + for v in value: + if (hasattr(v, '__iter__') and not isinstance(v, bytes) + and not isinstance(v, str)): + # This is almost certainly the user's mistake. In the + # interests of avoiding infinite loops, we'll let + # it through as-is rather than doing a recursive call. + new_value.append(v) + else: + new_value.append(self._normalize_search_value(v)) + return new_value + + # Otherwise, convert it into a Unicode string. + # The unicode(str()) thing is so this will do the same thing on Python 2 + # and Python 3. + return str(str(value)) + + def __str__(self): + """A human-readable representation of this SoupStrainer.""" + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def search_tag(self, markup_name=None, markup_attrs={}): + """Check whether a Tag with the given name and attributes would + match this SoupStrainer. + + Used prospectively to decide whether to even bother creating a Tag + object. + + :param markup_name: A tag name as found in some markup. + :param markup_attrs: A dictionary of attributes as found in some markup. + + :return: True if the prospective tag would match this SoupStrainer; + False otherwise. + """ + found = None + markup = None + if isinstance(markup_name, Tag): + markup = markup_name + markup_attrs = markup + + if isinstance(self.name, str): + # Optimization for a very common case where the user is + # searching for a tag with one specific name, and we're + # looking at a tag with a different name. + if markup and not markup.prefix and self.name != markup.name: + return False + + call_function_with_tag_data = ( + isinstance(self.name, Callable) + and not isinstance(markup_name, Tag)) + + if ((not self.name) + or call_function_with_tag_data + or (markup and self._matches(markup, self.name)) + or (not markup and self._matches(markup_name, self.name))): + if call_function_with_tag_data: + match = self.name(markup_name, markup_attrs) + else: + match = True + markup_attr_map = None + for attr, match_against in list(self.attrs.items()): + if not markup_attr_map: + if hasattr(markup_attrs, 'get'): + markup_attr_map = markup_attrs + else: + markup_attr_map = {} + for k, v in markup_attrs: + markup_attr_map[k] = v + attr_value = markup_attr_map.get(attr) + if not self._matches(attr_value, match_against): + match = False + break + if match: + if markup: + found = markup + else: + found = markup_name + if found and self.text and not self._matches(found.string, self.text): + found = None + return found + + # For BS3 compatibility. + searchTag = search_tag + + def search(self, markup): + """Find all items in `markup` that match this SoupStrainer. + + Used by the core _find_all() method, which is ultimately + called by all find_* methods. + + :param markup: A PageElement or a list of them. + """ + # print('looking for %s in %s' % (self, markup)) + found = None + # If given a list of items, scan it for a text element that + # matches. 
+        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
+            for element in markup:
+                if isinstance(element, NavigableString) \
+                       and self.search(element):
+                    found = element
+                    break
+        # If it's a Tag, make sure its name or attributes match.
+        # Don't bother with Tags if we're searching for text.
+        elif isinstance(markup, Tag):
+            if not self.text or self.name or self.attrs:
+                found = self.search_tag(markup)
+        # If it's text, make sure the text matches.
+        elif isinstance(markup, NavigableString) or \
+                 isinstance(markup, str):
+            if not self.name and not self.attrs and self._matches(markup, self.text):
+                found = markup
+        else:
+            raise Exception(
+                "I don't know how to match against a %s" % markup.__class__)
+        return found
+
+    def _matches(self, markup, match_against, already_tried=None):
+        # print(u"Matching %s against %s" % (markup, match_against))
+        result = False
+        if isinstance(markup, list) or isinstance(markup, tuple):
+            # This should only happen when searching a multi-valued attribute
+            # like 'class'.
+            for item in markup:
+                if self._matches(item, match_against):
+                    return True
+            # We didn't match any particular value of the multivalue
+            # attribute, but maybe we match the attribute value when
+            # considered as a string.
+            if self._matches(' '.join(markup), match_against):
+                return True
+            return False
+
+        if match_against is True:
+            # True matches any non-None value.
+            return markup is not None
+
+        if isinstance(match_against, Callable):
+            return match_against(markup)
+
+        # Custom callables take the tag as an argument, but all
+        # other ways of matching match the tag name as a string.
+        original_markup = markup
+        if isinstance(markup, Tag):
+            markup = markup.name
+
+        # Ensure that `markup` is either a Unicode string, or None.
+        markup = self._normalize_search_value(markup)
+
+        if markup is None:
+            # None matches None, False, an empty string, an empty list, and so on.
+            return not match_against
+
+        if (hasattr(match_against, '__iter__')
+            and not isinstance(match_against, str)):
+            # We're asked to match against an iterable of items.
+            # The markup must match at least one item in the
+            # iterable. We'll try each one in turn.
+            #
+            # To avoid infinite recursion we need to keep track of
+            # items we've already seen.
+            if not already_tried:
+                already_tried = set()
+            for item in match_against:
+                if item.__hash__:
+                    key = item
+                else:
+                    key = id(item)
+                if key in already_tried:
+                    continue
+                else:
+                    already_tried.add(key)
+                    if self._matches(original_markup, item, already_tried):
+                        return True
+            else:
+                return False
+
+        # Beyond this point we might need to run the test twice: once against
+        # the tag's name and once against its prefixed name.
+        match = False
+
+        if not match and isinstance(match_against, str):
+            # Exact string match
+            match = markup == match_against
+
+        if not match and hasattr(match_against, 'search'):
+            # Regexp match
+            return match_against.search(markup)
+
+        if (not match
+            and isinstance(original_markup, Tag)
+            and original_markup.prefix):
+            # Try the whole thing again with the prefixed tag name.
+            return self._matches(
+                original_markup.prefix + ':' + original_markup.name, match_against
+            )
+
+        return match
+
+
+class ResultSet(list):
+    """A ResultSet is just a list that keeps track of the SoupStrainer
+    that created it."""
+    def __init__(self, source, result=()):
+        """Constructor.
+
+        :param source: A SoupStrainer.
+        :param result: A list of PageElements.
+ """ + super(ResultSet, self).__init__(result) + self.source = source + + def __getattr__(self, key): + """Raise a helpful exception to explain a common code fix.""" + raise AttributeError( + "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key + ) diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/formatter.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/formatter.py new file mode 100644 index 00000000000..2cbab4c713a --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/formatter.py @@ -0,0 +1,152 @@ +from bs4.dammit import EntitySubstitution + +class Formatter(EntitySubstitution): + """Describes a strategy to use when outputting a parse tree to a string. + + Some parts of this strategy come from the distinction between + HTML4, HTML5, and XML. Others are configurable by the user. + + Formatters are passed in as the `formatter` argument to methods + like `PageElement.encode`. Most people won't need to think about + formatters, and most people who need to think about them can pass + in one of these predefined strings as `formatter` rather than + making a new Formatter object: + + For HTML documents: + * 'html' - HTML entity substitution for generic HTML documents. (default) + * 'html5' - HTML entity substitution for HTML5 documents. + * 'minimal' - Only make the substitutions necessary to guarantee + valid HTML. + * None - Do not perform any substitution. This will be faster + but may result in invalid markup. + + For XML documents: + * 'html' - Entity substitution for XHTML documents. + * 'minimal' - Only make the substitutions necessary to guarantee + valid XML. (default) + * None - Do not perform any substitution. This will be faster + but may result in invalid markup. + """ + # Registries of XML and HTML formatters. + XML_FORMATTERS = {} + HTML_FORMATTERS = {} + + HTML = 'html' + XML = 'xml' + + HTML_DEFAULTS = dict( + cdata_containing_tags=set(["script", "style"]), + ) + + def _default(self, language, value, kwarg): + if value is not None: + return value + if language == self.XML: + return set() + return self.HTML_DEFAULTS[kwarg] + + def __init__( + self, language=None, entity_substitution=None, + void_element_close_prefix='/', cdata_containing_tags=None, + ): + """Constructor. + + :param language: This should be Formatter.XML if you are formatting + XML markup and Formatter.HTML if you are formatting HTML markup. + + :param entity_substitution: A function to call to replace special + characters with XML/HTML entities. For examples, see + bs4.dammit.EntitySubstitution.substitute_html and substitute_xml. + :param void_element_close_prefix: By default, void elements + are represented as <tag/> (XML rules) rather than <tag> + (HTML rules). To get <tag>, pass in the empty string. + :param cdata_containing_tags: The list of tags that are defined + as containing CDATA in this dialect. For example, in HTML, + <script> and <style> tags are defined as containing CDATA, + and their contents should not be formatted. + """ + self.language = language + self.entity_substitution = entity_substitution + self.void_element_close_prefix = void_element_close_prefix + self.cdata_containing_tags = self._default( + language, cdata_containing_tags, 'cdata_containing_tags' + ) + + def substitute(self, ns): + """Process a string that needs to undergo entity substitution. 
+        This may be a string encountered in an attribute value or as
+        text.
+
+        :param ns: A string.
+        :return: A string with certain characters replaced by named
+           or numeric entities.
+        """
+        if not self.entity_substitution:
+            return ns
+        from .element import NavigableString
+        if (isinstance(ns, NavigableString)
+            and ns.parent is not None
+            and ns.parent.name in self.cdata_containing_tags):
+            # Do nothing.
+            return ns
+        # Substitute.
+        return self.entity_substitution(ns)
+
+    def attribute_value(self, value):
+        """Process the value of an attribute.
+
+        :param value: A string.
+        :return: A string with certain characters replaced by named
+           or numeric entities.
+        """
+        return self.substitute(value)
+
+    def attributes(self, tag):
+        """Reorder a tag's attributes however you want.
+
+        By default, attributes are sorted alphabetically. This makes
+        behavior consistent between Python 2 and Python 3, and preserves
+        backwards compatibility with older versions of Beautiful Soup.
+        """
+        if tag.attrs is None:
+            return []
+        return sorted(tag.attrs.items())
+
+
+class HTMLFormatter(Formatter):
+    """A generic Formatter for HTML."""
+    REGISTRY = {}
+    def __init__(self, *args, **kwargs):
+        return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
+
+
+class XMLFormatter(Formatter):
+    """A generic Formatter for XML."""
+    REGISTRY = {}
+    def __init__(self, *args, **kwargs):
+        return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
+
+
+# Set up aliases for the default formatters.
+HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html,
+    void_element_close_prefix=None
+)
+HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+HTMLFormatter.REGISTRY[None] = HTMLFormatter(
+    entity_substitution=None
+)
+XMLFormatter.REGISTRY["html"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+XMLFormatter.REGISTRY[None] = XMLFormatter(
+    entity_substitution=None
+)
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/testing.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/testing.py
new file mode 100644
index 00000000000..9ca507bccd3
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/py3k/bs4/testing.py
@@ -0,0 +1,1101 @@
+# encoding: utf-8
+"""Helper classes for tests."""
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT" + +import pickle +import copy +import functools +import unittest +from unittest import TestCase +from bs4 import BeautifulSoup +from bs4.element import ( + CharsetMetaAttributeValue, + Comment, + ContentMetaAttributeValue, + Doctype, + PYTHON_SPECIFIC_ENCODINGS, + SoupStrainer, + Script, + Stylesheet, + Tag +) + +from bs4.builder import HTMLParserTreeBuilder +default_builder = HTMLParserTreeBuilder + +BAD_DOCUMENT = """A bare string +<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"> +<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd"> +<div><![CDATA[A CDATA section where it doesn't belong]]></div> +<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div> +<div>A <meta> tag</div> +<div>A <br> tag that supposedly has contents.</br></div> +<div>AT&T</div> +<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div> +<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div> +<div>This numeric entity is missing the final semicolon: <x t="piñata"></div> +<div><a href="http://example.com/</a> that attribute value never got closed</div> +<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div> +<! This document starts with a bogus declaration ><div>a</div> +<div>This document contains <!an incomplete declaration <div>(do you see it?)</div> +<div>This document ends with <!an incomplete declaration +<div><a style={height:21px;}>That attribute value was bogus</a></div> +<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace +<div><table><td nowrap>That boolean attribute had no value</td></table></div> +<div>Here's a nonexistent entity: &#foo; (do you see it?)</div> +<div>This document ends before the entity finishes: > +<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p> +<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b> +<div><table><tr><td>Here's a table</td></tr></table></div> +<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div> +<div>This tag contains nothing but whitespace: <b> </b></div> +<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div> +<div><table><div>This table contains bare markup</div></table></div> +<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div> +<div>This document contains a <!DOCTYPE surprise>surprise doctype</div> +<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div> +<div><our\u2603>Tag name contains Unicode characters</our\u2603></div> +<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +""" + + +class SoupTest(unittest.TestCase): + + @property + def default_builder(self): + return default_builder + + def soup(self, markup, **kwargs): + """Build a Beautiful Soup object from markup.""" + builder = kwargs.pop('builder', self.default_builder) + return BeautifulSoup(markup, builder=builder, **kwargs) + + def document_for(self, markup, **kwargs): + """Turn an HTML fragment into a document. + + The details depend on the builder. 
+ """ + return self.default_builder(**kwargs).test_fragment_to_document(markup) + + def assertSoupEquals(self, to_parse, compare_parsed_to=None): + builder = self.default_builder + obj = BeautifulSoup(to_parse, builder=builder) + if compare_parsed_to is None: + compare_parsed_to = to_parse + + # Verify that the documents come out the same. + self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) + + # Also run some checks on the BeautifulSoup object itself: + + # Verify that every tag that was opened was eventually closed. + + # There are no tags in the open tag counter. + assert all(v==0 for v in list(obj.open_tag_counter.values())) + + # The only tag in the tag stack is the one for the root + # document. + self.assertEqual( + [obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack] + ) + + def assertConnectedness(self, element): + """Ensure that next_element and previous_element are properly + set for all descendants of the given element. + """ + earlier = None + for e in element.descendants: + if earlier: + self.assertEqual(e, earlier.next_element) + self.assertEqual(earlier, e.previous_element) + earlier = e + + def linkage_validator(self, el, _recursive_call=False): + """Ensure proper linkage throughout the document.""" + descendant = None + # Document element should have no previous element or previous sibling. + # It also shouldn't have a next sibling. + if el.parent is None: + assert el.previous_element is None,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_element, None + ) + assert el.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_sibling, None + ) + assert el.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_sibling, None + ) + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + assert el.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_element, child + ) + assert child.previous_element is el,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + child, child.previous_element, el + ) + assert child.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( + child, child.previous_sibling, None + ) + + # If not the first child, previous index should link as sibling to this index + # Previous element should match the last index or the last bubbled up descendant + else: + assert child.previous_sibling is el.contents[idx - 1],\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( + child, child.previous_sibling, el.contents[idx - 1] + ) + assert el.contents[idx - 1].next_sibling is child,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + el.contents[idx - 1], el.contents[idx - 1].next_sibling, child + ) + + if last_child is not None: + assert child.previous_element is last_child,\ + "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( + child, child.previous_element, last_child, child.parent.contents + ) + assert last_child.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + last_child, last_child.next_element, child + ) + + if isinstance(child, Tag) and child.contents: + descendant = self.linkage_validator(child, 
True)
+                # A bubbled up descendant should have no next siblings
+                assert descendant.next_sibling is None,\
+                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                        descendant, descendant.next_sibling, None
+                    )
+
+            # Mark last child as either the bubbled up descendant or the current child
+            if descendant is not None:
+                last_child = descendant
+            else:
+                last_child = child
+
+            # If this is the last child, it should have no next siblings
+            if idx == last_idx:
+                assert child.next_sibling is None,\
+                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                        child, child.next_sibling, None
+                    )
+            idx += 1
+
+        child = descendant if descendant is not None else child
+        if child is None:
+            child = el
+
+        if not _recursive_call and child is not None:
+            target = el
+            while True:
+                if target is None:
+                    assert child.next_element is None, \
+                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                            child, child.next_element, None
+                        )
+                    break
+                elif target.next_sibling is not None:
+                    assert child.next_element is target.next_sibling, \
+                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                            child, child.next_element, target.next_sibling
+                        )
+                    break
+                target = target.parent
+
+            # We are done, so nothing to return
+            return None
+        else:
+            # Return the child to the recursive caller
+            return child
+
+
+class HTMLTreeBuilderSmokeTest(object):
+
+    """A basic test of a treebuilder's competence.
+
+    Any HTML treebuilder, present or future, should be able to pass
+    these tests. With invalid markup, there's room for interpretation,
+    and different parsers can handle it differently. But with the
+    markup in these tests, there's not much room for interpretation.
+    """
+
+    def test_empty_element_tags(self):
+        """Verify that all HTML4 and HTML5 empty element (aka void element) tags
+        are handled correctly.
+        """
+        for name in [
+            'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+            'spacer', 'frame'
+        ]:
+            soup = self.soup("")
+            new_tag = soup.new_tag(name)
+            self.assertEqual(True, new_tag.is_empty_element)
+
+    def test_special_string_containers(self):
+        soup = self.soup(
+            "<style>Some CSS</style><script>Some Javascript</script>"
+        )
+        assert isinstance(soup.style.string, Stylesheet)
+        assert isinstance(soup.script.string, Script)
+
+        soup = self.soup(
+            "<style><!--Some CSS--></style>"
+        )
+        assert isinstance(soup.style.string, Stylesheet)
+        # The contents of the style tag resemble an HTML comment, but
+        # they're not treated as one.
+        self.assertEqual("<!--Some CSS-->", soup.style.string)
+        assert isinstance(soup.style.string, Stylesheet)
+
+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertEqual(loaded.__class__, BeautifulSoup)
+        self.assertEqual(loaded.decode(), tree.decode())
+
+    def assertDoctypeHandled(self, doctype_fragment):
+        """Assert that a given doctype string is handled correctly."""
+        doctype_str, soup = self._document_with_doctype(doctype_fragment)
+
+        # Make sure a Doctype object was created.
+        doctype = soup.contents[0]
+        self.assertEqual(doctype.__class__, Doctype)
+        self.assertEqual(doctype, doctype_fragment)
+        self.assertEqual(
+            soup.encode("utf8")[:len(doctype_str)],
+            doctype_str
+        )
+
+        # Make sure that the doctype was correctly associated with the
+        # parse tree and that the rest of the document parsed.
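+        # (_document_with_doctype() appends '<p>foo</p>' after the
+        # doctype, so the <p> tag should have survived parsing.)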
+ self.assertEqual(soup.p.contents[0], 'foo') + + def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"): + """Generate and parse a document with the given doctype.""" + doctype = '<!%s %s>' % (doctype_string, doctype_fragment) + markup = doctype + '\n<p>foo</p>' + soup = self.soup(markup) + return doctype.encode("utf8"), soup + + def test_normal_doctypes(self): + """Make sure normal, everyday HTML doctypes are handled correctly.""" + self.assertDoctypeHandled("html") + self.assertDoctypeHandled( + 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + + def test_empty_doctype(self): + soup = self.soup("<!DOCTYPE>") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + + def test_mixed_case_doctype(self): + # A lowercase or mixed-case doctype becomes a Doctype. + for doctype_fragment in ("doctype", "DocType"): + doctype_str, soup = self._document_with_doctype( + "html", doctype_fragment + ) + + # Make sure a Doctype object was created and that the DOCTYPE + # is uppercase. + doctype = soup.contents[0] + self.assertEqual(doctype.__class__, Doctype) + self.assertEqual(doctype, "html") + self.assertEqual( + soup.encode("utf8")[:len(doctype_str)], + b"<!DOCTYPE html>" + ) + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. + self.assertEqual(soup.p.contents[0], 'foo') + + def test_public_doctype_with_url(self): + doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' + self.assertDoctypeHandled(doctype) + + def test_system_doctype(self): + self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') + + def test_namespaced_system_doctype(self): + # We can handle a namespaced doctype with a system ID. + self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + # Test a namespaced doctype with a public id. + self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') + + def test_real_xhtml_document(self): + """A real XHTML document should come out more or less the same as it went in.""" + markup = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head><title>Hello.</title></head> +<body>Goodbye.</body> +</html>""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8").replace(b"\n", b""), + markup.replace(b"\n", b"")) + + def test_namespaced_html(self): + """When a namespaced XML document is parsed as HTML it should + be treated as HTML with weird tag names. + """ + markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>""" + soup = self.soup(markup) + self.assertEqual(2, len(soup.find_all("ns1:foo"))) + + def test_processing_instruction(self): + # We test both Unicode and bytestring to verify that + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. + markup = """<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.decode()) + + markup = b"""<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_deepcopy(self): + """Make sure you can copy the tree builder. + + This is important because the builder is part of a + BeautifulSoup object, and we want to be able to copy that. 
+ """ + copy.deepcopy(self.default_builder) + + def test_p_tag_is_never_empty_element(self): + """A <p> tag is never designated as an empty-element tag. + + Even if the markup shows it as an empty-element tag, it + shouldn't be presented that way. + """ + soup = self.soup("<p/>") + self.assertFalse(soup.p.is_empty_element) + self.assertEqual(str(soup.p), "<p></p>") + + def test_unclosed_tags_get_closed(self): + """A tag that's not closed by the end of the document should be closed. + + This applies to all tags except empty-element tags. + """ + self.assertSoupEquals("<p>", "<p></p>") + self.assertSoupEquals("<b>", "<b></b>") + + self.assertSoupEquals("<br>", "<br/>") + + def test_br_is_always_empty_element_tag(self): + """A <br> tag is designated as an empty-element tag. + + Some parsers treat <br></br> as one <br/> tag, some parsers as + two tags, but it should always be an empty-element tag. + """ + soup = self.soup("<br></br>") + self.assertTrue(soup.br.is_empty_element) + self.assertEqual(str(soup.br), "<br/>") + + def test_nested_formatting_elements(self): + self.assertSoupEquals("<em><em></em></em>") + + def test_double_head(self): + html = '''<!DOCTYPE html> +<html> +<head> +<title>Ordinary HEAD element test</title> +</head> +<script type="text/javascript"> +alert("Help!"); +</script> +<body> +Hello, world! +</body> +</html> +''' + soup = self.soup(html) + self.assertEqual("text/javascript", soup.find('script')['type']) + + def test_comment(self): + # Comments are represented as Comment objects. + markup = "<p>foo<!--foobar-->baz</p>" + self.assertSoupEquals(markup) + + soup = self.soup(markup) + comment = soup.find(text="foobar") + self.assertEqual(comment.__class__, Comment) + + # The comment is properly integrated into the tree. + foo = soup.find(text="foo") + self.assertEqual(comment, foo.next_element) + baz = soup.find(text="baz") + self.assertEqual(comment, baz.previous_element) + + def test_preserved_whitespace_in_pre_and_textarea(self): + """Whitespace must be preserved in <pre> and <textarea> tags, + even if that would mean not prettifying the markup. 
+ """ + pre_markup = "<pre> </pre>" + textarea_markup = "<textarea> woo\nwoo </textarea>" + self.assertSoupEquals(pre_markup) + self.assertSoupEquals(textarea_markup) + + soup = self.soup(pre_markup) + self.assertEqual(soup.pre.prettify(), pre_markup) + + soup = self.soup(textarea_markup) + self.assertEqual(soup.textarea.prettify(), textarea_markup) + + soup = self.soup("<textarea></textarea>") + self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>") + + def test_nested_inline_elements(self): + """Inline elements can be nested indefinitely.""" + b_tag = "<b>Inside a B tag</b>" + self.assertSoupEquals(b_tag) + + nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>" + self.assertSoupEquals(nested_b_tag) + + double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>" + self.assertSoupEquals(nested_b_tag) + + def test_nested_block_level_elements(self): + """Block elements can be nested.""" + soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>') + blockquote = soup.blockquote + self.assertEqual(blockquote.p.b.string, 'Foo') + self.assertEqual(blockquote.b.string, 'Foo') + + def test_correctly_nested_tables(self): + """One table can go inside another one.""" + markup = ('<table id="1">' + '<tr>' + "<td>Here's another table:" + '<table id="2">' + '<tr><td>foo</td></tr>' + '</table></td>') + + self.assertSoupEquals( + markup, + '<table id="1"><tr><td>Here\'s another table:' + '<table id="2"><tr><td>foo</td></tr></table>' + '</td></tr></table>') + + self.assertSoupEquals( + "<table><thead><tr><td>Foo</td></tr></thead>" + "<tbody><tr><td>Bar</td></tr></tbody>" + "<tfoot><tr><td>Baz</td></tr></tfoot></table>") + + def test_multivalued_attribute_with_whitespace(self): + # Whitespace separating the values of a multi-valued attribute + # should be ignored. + + markup = '<div class=" foo bar "></a>' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.div['class']) + + # If you search by the literal name of the class it's like the whitespace + # wasn't there. + self.assertEqual(soup.div, soup.find('div', class_="foo bar")) + + def test_deeply_nested_multivalued_attribute(self): + # html5lib can set the attributes of the same tag many times + # as it rearranges the tree. This has caused problems with + # multivalued attributes. + markup = '<table><div><div class="css"></div></div></table>' + soup = self.soup(markup) + self.assertEqual(["css"], soup.div.div['class']) + + def test_multivalued_attribute_on_html(self): + # html5lib uses a different API to set the attributes ot the + # <html> tag. This has caused problems with multivalued + # attributes. + markup = '<html class="a b"></html>' + soup = self.soup(markup) + self.assertEqual(["a", "b"], soup.html['class']) + + def test_angle_brackets_in_attribute_values_are_escaped(self): + self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>') + + def test_strings_resembling_character_entity_references(self): + # "&T" and "&p" look like incomplete character entities, but they are + # not. + self.assertSoupEquals( + "<p>• AT&T is in the s&p 500</p>", + "<p>\u2022 AT&T is in the s&p 500</p>" + ) + + def test_apos_entity(self): + self.assertSoupEquals( + "<p>Bob's Bar</p>", + "<p>Bob's Bar</p>", + ) + + def test_entities_in_foreign_document_encoding(self): + # “ and ” are invalid numeric entities referencing + # Windows-1252 characters. - references a character common + # to Windows-1252 and Unicode, and ☃ references a + # character only found in Unicode. 
+        #
+        # All of these entities should be converted to Unicode
+        # characters.
+        markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
+        soup = self.soup(markup)
+        self.assertEqual("“Hello” -☃", soup.p.string)
+
+    def test_entities_in_attributes_converted_to_unicode(self):
+        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
+        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
+
+    def test_entities_in_text_converted_to_unicode(self):
+        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
+        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
+
+    def test_quot_entity_converted_to_quotation_mark(self):
+        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
+                              '<p>I said "good day!"</p>')
+
+    def test_out_of_range_entity(self):
+        expect = "\N{REPLACEMENT CHARACTER}"
+        self.assertSoupEquals("&#10000000000000;", expect)
+        self.assertSoupEquals("&#x10000000000000;", expect)
+        self.assertSoupEquals("&#1000000000;", expect)
+
+    def test_multipart_strings(self):
+        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
+        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
+        self.assertEqual("p", soup.h2.string.next_element.name)
+        self.assertEqual("p", soup.p.name)
+        self.assertConnectedness(soup)
+
+    def test_empty_element_tags(self):
+        """Verify consistent handling of empty-element tags,
+        no matter how they come in through the markup.
+        """
+        self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
+        self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
+
+    def test_head_tag_between_head_and_body(self):
+        "Prevent recurrence of a bug in the html5lib treebuilder."
+        content = """<html><head></head>
+ <link></link>
+ <body>foo</body>
+</html>
+"""
+        soup = self.soup(content)
+        self.assertNotEqual(None, soup.html.body)
+        self.assertConnectedness(soup)
+
+    def test_multiple_copies_of_a_tag(self):
+        "Prevent recurrence of a bug in the html5lib treebuilder."
+        content = """<!DOCTYPE html>
+<html>
+ <body>
+  <article id="a" >
+   <div><a href="1"></div>
+   <footer>
+    <a href="2"></a>
+   </footer>
+  </article>
+ </body>
+</html>
+"""
+        soup = self.soup(content)
+        self.assertConnectedness(soup.article)
+
+    def test_basic_namespaces(self):
+        """Parsers don't need to *understand* namespaces, but at the
+        very least they should not choke on namespaces or lose
+        data."""
+
+        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode())
+        html = soup.html
+        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
+        self.assertEqual(
+            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
+        self.assertEqual(
+            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
+
+    def test_multivalued_attribute_value_becomes_list(self):
+        markup = b'<a class="foo bar">'
+        soup = self.soup(markup)
+        self.assertEqual(['foo', 'bar'], soup.a['class'])
+
+    #
+    # Generally speaking, tests below this point are more tests of
+    # Beautiful Soup than tests of the tree builders. But parsers are
+    # weird, so we run these tests separately for every tree builder
+    # to detect any differences between them.
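+    # (Their handling of the invalid markup in BAD_DOCUMENT, for
+    # example, is exercised per builder by test_worst_case below.)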
+    #
+
+    def test_can_parse_unicode_document(self):
+        # A seemingly innocuous document... but it's in Unicode! And
+        # it contains characters that can't be represented in the
+        # encoding found in the declaration! The horror!
+        markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+        soup = self.soup(markup)
+        self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
+
+    def test_soupstrainer(self):
+        """Parsers should be able to work with SoupStrainers."""
+        strainer = SoupStrainer("b")
+        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
+                         parse_only=strainer)
+        self.assertEqual(soup.decode(), "<b>bold</b>")
+
+    def test_single_quote_attribute_values_become_double_quotes(self):
+        self.assertSoupEquals("<foo attr='bar'></foo>",
+                              '<foo attr="bar"></foo>')
+
+    def test_attribute_values_with_nested_quotes_are_left_alone(self):
+        text = """<foo attr='bar "brawls" happen'>a</foo>"""
+        self.assertSoupEquals(text)
+
+    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
+        text = """<foo attr='bar "brawls" happen'>a</foo>"""
+        soup = self.soup(text)
+        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
+        self.assertSoupEquals(
+            soup.foo.decode(),
+            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
+
+    def test_ampersand_in_attribute_value_gets_escaped(self):
+        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
+                              '<this is="really messed up &amp; stuff"></this>')
+
+        self.assertSoupEquals(
+            '<a href="http://example.org?a=1&b=2;3">foo</a>',
+            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
+
+    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
+        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
+
+    def test_entities_in_strings_converted_during_parsing(self):
+        # Both XML and HTML entities are converted to Unicode characters
+        # during parsing.
+        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
+        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
+        self.assertSoupEquals(text, expected)
+
+    def test_smart_quotes_converted_on_the_way_in(self):
+        # Microsoft smart quotes are converted to Unicode characters during
+        # parsing.
+        quote = b"<p>\x91Foo\x92</p>"
+        soup = self.soup(quote)
+        self.assertEqual(
+            soup.p.string,
+            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+
+    def test_non_breaking_spaces_converted_on_the_way_in(self):
+        soup = self.soup("<a>&nbsp;&nbsp;</a>")
+        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
+
+    def test_entities_converted_on_the_way_out(self):
+        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
+        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
+        soup = self.soup(text)
+        self.assertEqual(soup.p.encode("utf-8"), expected)
+
+    def test_real_iso_latin_document(self):
+        # Smoke test of interrelated functionality, using an
+        # easy-to-understand document.
+
+        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
+        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+
+        # That's because we're going to encode it into ISO-Latin-1, and use
+        # that to test.
+        iso_latin_html = unicode_html.encode("iso-8859-1")
+
+        # Parse the ISO-Latin-1 HTML.
+        soup = self.soup(iso_latin_html)
+        # Encode it to UTF-8.
+        result = soup.encode("utf-8")
+
+        # What do we expect the result to look like? Well, it would
+        # look like unicode_html, except that the META tag would say
+        # UTF-8 instead of ISO-Latin-1.
+        expected = unicode_html.replace("ISO-Latin-1", "utf-8")
+
+        # And, of course, it would be in UTF-8, not Unicode.
+        expected = expected.encode("utf-8")
+
+        # Ta-da!
+        self.assertEqual(result, expected)
+
+    def test_real_shift_jis_document(self):
+        # Smoke test to make sure the parser can handle a document in
+        # Shift-JIS encoding, without choking.
+        shift_jis_html = (
+            b'<html><head></head><body><pre>'
+            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
+            b'</pre></body></html>')
+        unicode_html = shift_jis_html.decode("shift-jis")
+        soup = self.soup(unicode_html)
+
+        # Make sure the parse tree is correctly encoded to various
+        # encodings.
+        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
+        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
+
+    def test_real_hebrew_document(self):
+        # A real-world test to make sure we can convert ISO-8859-8 (a
+        # Hebrew encoding) to UTF-8.
+        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
+        soup = self.soup(
+            hebrew_document, from_encoding="iso8859-8")
+        # Some tree builders call it iso8859-8; others call it iso-8859-8.
+        # That's not a difference we really care about.
+        assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
+        self.assertEqual(
+            soup.encode('utf-8'),
+            hebrew_document.decode("iso8859-8").encode("utf-8"))
+
+    def test_meta_tag_reflects_current_encoding(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type"/>')
+
+        # Here's a document incorporating that meta tag.
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja"/>'
+            '</head><body>Shift-JIS markup goes here.') % meta_tag
+        soup = self.soup(shift_jis_html)
+
+        # Parse the document, and the charset is seemingly unaffected.
+        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
+        content = parsed_meta['content']
+        self.assertEqual('text/html; charset=x-sjis', content)
+
+        # But that value is actually a ContentMetaAttributeValue object.
+        self.assertTrue(isinstance(content, ContentMetaAttributeValue))
+
+        # And it will take on a value that reflects its current
+        # encoding.
+        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
+
+        # For the rest of the story, see TestSubstitutions in
+        # test_tree.py.
+
+    def test_html5_style_meta_tag_reflects_current_encoding(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta id="encoding" charset="x-sjis" />')
+
+        # Here's a document incorporating that meta tag.
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja"/>'
+            '</head><body>Shift-JIS markup goes here.') % meta_tag
+        soup = self.soup(shift_jis_html)
+
+        # Parse the document, and the charset is seemingly unaffected.
+        parsed_meta = soup.find('meta', id="encoding")
+        charset = parsed_meta['charset']
+        self.assertEqual('x-sjis', charset)
+
+        # But that value is actually a CharsetMetaAttributeValue object.
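+        # (CharsetMetaAttributeValue overrides encode() to return whatever
+        # encoding the document is currently being encoded to.)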
+        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
+
+        # And it will take on a value that reflects its current
+        # encoding.
+        self.assertEqual('utf8', charset.encode("utf8"))
+
+    def test_python_specific_encodings_not_used_in_charset(self):
+        # You can encode an HTML document using a Python-specific
+        # encoding, but that encoding won't be mentioned _inside_ the
+        # resulting document. Instead, the document will appear to
+        # have no encoding.
+        for markup in [
+            b'<meta charset="utf8"></head>',
+            b'<meta id="encoding" charset="utf-8" />'
+        ]:
+            soup = self.soup(markup)
+            for encoding in PYTHON_SPECIFIC_ENCODINGS:
+                if encoding in (
+                    'idna', 'mbcs', 'oem', 'undefined',
+                    'string_escape', 'string-escape'
+                ):
+                    # For one reason or another, these will raise an
+                    # exception if we actually try to use them, so don't
+                    # bother.
+                    continue
+                encoded = soup.encode(encoding)
+                assert b'meta charset=""' in encoded
+                assert encoding.encode("ascii") not in encoded
+
+    def test_tag_with_no_attributes_can_have_attributes_added(self):
+        data = self.soup("<a>text</a>")
+        data.a['foo'] = 'bar'
+        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
+
+    def test_closing_tag_with_no_opening_tag(self):
+        # Without BeautifulSoup.open_tag_counter, the </span> tag will
+        # cause _popToTag to be called over and over again as we look
+        # for a <span> tag that wasn't there. The result is that 'text2'
+        # will show up outside the body of the document.
+        soup = self.soup("<body><div><p>text1</p></span>text2</div></body>")
+        self.assertEqual(
+            "<body><div><p>text1</p>text2</div></body>", soup.body.decode()
+        )
+
+    def test_worst_case(self):
+        """Test the worst case (currently) for linking issues."""
+
+        soup = self.soup(BAD_DOCUMENT)
+        self.linkage_validator(soup)
+
+
+class XMLTreeBuilderSmokeTest(object):
+
+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertEqual(loaded.__class__, BeautifulSoup)
+        self.assertEqual(loaded.decode(), tree.decode())
+
+    def test_docstring_generated(self):
+        soup = self.soup("<root/>")
+        self.assertEqual(
+            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
+
+    def test_xml_declaration(self):
+        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
+    def test_python_specific_encodings_not_used_in_xml_declaration(self):
+        # You can encode an XML document using a Python-specific
+        # encoding, but that encoding won't be mentioned _inside_ the
+        # resulting document.
+        markup = b"""<?xml version="1.0"?>\n<foo/>"""
+        soup = self.soup(markup)
+        for encoding in PYTHON_SPECIFIC_ENCODINGS:
+            if encoding in (
+                'idna', 'mbcs', 'oem', 'undefined',
+                'string_escape', 'string-escape'
+            ):
+                # For one reason or another, these will raise an
+                # exception if we actually try to use them, so don't
+                # bother.
+                continue
+            encoded = soup.encode(encoding)
+            assert b'<?xml version="1.0"?>' in encoded
+            assert encoding.encode("ascii") not in encoded
+
+    def test_processing_instruction(self):
+        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
+    def test_real_xhtml_document(self):
+        """A real XHTML document should come out *exactly* the same as it went in."""
+        markup = b"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Hello.</title></head>
+<body>Goodbye.</body>
+</html>"""
+        soup = self.soup(markup)
+        self.assertEqual(
+            soup.encode("utf-8"), markup)
+
+    def test_nested_namespaces(self):
+        doc = b"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<parent xmlns="http://ns1/">
+<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
+<grandchild ns3:attr="value" xmlns="http://ns4/"/>
+</child>
+</parent>"""
+        soup = self.soup(doc)
+        self.assertEqual(doc, soup.encode())
+
+    def test_formatter_processes_script_tag_for_xml_documents(self):
+        doc = """
+  <script type="text/javascript">
+  </script>
+"""
+        soup = BeautifulSoup(doc, "lxml-xml")
+        # lxml would have stripped this while parsing, but we can add
+        # it later.
+        soup.script.string = 'console.log("< < hey > > ");'
+        encoded = soup.encode()
+        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
+
+    def test_can_parse_unicode_document(self):
+        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        soup = self.soup(markup)
+        self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
+
+    def test_popping_namespaced_tag(self):
+        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
+        soup = self.soup(markup)
+        self.assertEqual(
+            str(soup.rss), markup)
+
+    def test_docstring_includes_correct_encoding(self):
+        soup = self.soup("<root/>")
+        self.assertEqual(
+            soup.encode("latin1"),
+            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
+
+    def test_large_xml_document(self):
+        """A large XML document should come out the same as it went in."""
+        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+                  + b'0' * (2**12)
+                  + b'</root>')
+        soup = self.soup(markup)
+        self.assertEqual(soup.encode("utf-8"), markup)
+
+
+    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
+        self.assertSoupEquals("<p>", "<p/>")
+        self.assertSoupEquals("<p>foo</p>")
+
+    def test_namespaces_are_preserved(self):
+        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
+        soup = self.soup(markup)
+        root = soup.root
+        self.assertEqual("http://example.com/", root['xmlns:a'])
+        self.assertEqual("http://example.net/", root['xmlns:b'])
+
+    def test_closing_namespaced_tag(self):
+        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
+        soup = self.soup(markup)
+        self.assertEqual(str(soup.p), markup)
+
+    def test_namespaced_attributes(self):
+        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
+        soup = self.soup(markup)
+        self.assertEqual(str(soup.foo), markup)
+
+    def 
test_namespaced_attributes_xml_namespace(self): + markup = '<foo xml:lang="fr">bar</foo>' + soup = self.soup(markup) + self.assertEqual(str(soup.foo), markup) + + def test_find_by_prefixed_name(self): + doc = """<?xml version="1.0" encoding="utf-8"?> +<Document xmlns="http://example.com/ns0" + xmlns:ns1="http://example.com/ns1" + xmlns:ns2="http://example.com/ns2" + <ns1:tag>foo</ns1:tag> + <ns1:tag>bar</ns1:tag> + <ns2:tag key="value">baz</ns2:tag> +</Document> +""" + soup = self.soup(doc) + + # There are three <tag> tags. + self.assertEqual(3, len(soup.find_all('tag'))) + + # But two of them are ns1:tag and one of them is ns2:tag. + self.assertEqual(2, len(soup.find_all('ns1:tag'))) + self.assertEqual(1, len(soup.find_all('ns2:tag'))) + + self.assertEqual(1, len(soup.find_all('ns2:tag', key='value'))) + self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag']))) + + def test_copy_tag_preserves_namespace(self): + xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?> +<w:document xmlns:w="http://example.com/ns0"/>""" + + soup = self.soup(xml) + tag = soup.document + duplicate = copy.copy(tag) + + # The two tags have the same namespace prefix. + self.assertEqual(tag.prefix, duplicate.prefix) + + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + +class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): + """Smoke test for a tree builder that supports HTML5.""" + + def test_real_xhtml_document(self): + # Since XHTML is not HTML5, HTML5 parsers are not tested to handle + # XHTML documents in any particular way. + pass + + def test_html_tags_have_namespace(self): + markup = "<a>" + soup = self.soup(markup) + self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) + + def test_svg_tags_have_namespace(self): + markup = '<svg><circle/></svg>' + soup = self.soup(markup) + namespace = "http://www.w3.org/2000/svg" + self.assertEqual(namespace, soup.svg.namespace) + self.assertEqual(namespace, soup.circle.namespace) + + + def test_mathml_tags_have_namespace(self): + markup = '<math><msqrt>5</msqrt></math>' + soup = self.soup(markup) + namespace = 'http://www.w3.org/1998/Math/MathML' + self.assertEqual(namespace, soup.math.namespace) + self.assertEqual(namespace, soup.msqrt.namespace) + + def test_xml_declaration_becomes_comment(self): + markup = '<?xml version="1.0" encoding="utf-8"?><html></html>' + soup = self.soup(markup) + self.assertTrue(isinstance(soup.contents[0], Comment)) + self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') + self.assertEqual("html", soup.contents[0].next_element.name) + +def skipIf(condition, reason): + def nothing(test, *args, **kwargs): + return None + + def decorator(test_item): + if condition: + return nothing + else: + return test_item + + return decorator diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/scripts/demonstrate_parser_differences.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/scripts/demonstrate_parser_differences.py new file mode 100644 index 00000000000..d84670a53a6 --- /dev/null +++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/scripts/demonstrate_parser_differences.py @@ -0,0 +1,95 @@ +"""Demonstrate how different parsers parse the same markup. + +Beautiful Soup can use any of a number of different parsers. 
Every
+parser should behave more or less the same on valid markup, and
+Beautiful Soup's unit tests make sure this is the case. But every
+parser handles invalid markup differently. Even different versions of
+the same parser handle invalid markup differently. So instead of unit
+tests I've created this educational demonstration script.
+
+The file demonstration_markup.txt contains many lines of HTML. This
+script tests each line of markup against every parser you have
+installed, and prints out how each parser sees that markup. This may
+help you choose a parser, or understand why Beautiful Soup presents
+your document the way it does.
+"""
+
+import os
+import sys
+from bs4 import BeautifulSoup
+parsers = ['html.parser']
+
+try:
+    from bs4.builder import _lxml
+    parsers.append('lxml')
+except ImportError, e:
+    pass
+
+try:
+    from bs4.builder import _html5lib
+    parsers.append('html5lib')
+except ImportError, e:
+    pass
+
+class Demonstration(object):
+    def __init__(self, markup):
+        self.results = {}
+        self.markup = markup
+
+    def run_against(self, *parser_names):
+        uniform_results = True
+        previous_output = None
+        for parser in parser_names:
+            try:
+                soup = BeautifulSoup(self.markup, parser)
+                if self.markup.startswith("<div>"):
+                    # Extract the interesting part
+                    output = soup.div
+                else:
+                    output = soup
+            except Exception, e:
+                output = "[EXCEPTION] %s" % str(e)
+            self.results[parser] = output
+            if previous_output is None:
+                previous_output = output
+            elif previous_output != output:
+                uniform_results = False
+        return uniform_results
+
+    def dump(self):
+        print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8"))
+        for parser, output in self.results.items():
+            print "%s: %s" % (parser.rjust(13), output.encode("utf8"))
+
+different_results = []
+uniform_results = []
+
+print "= Testing the following parsers: %s =" % ", ".join(parsers)
+print
+
+input_file = sys.stdin
+if sys.stdin.isatty():
+    for filename in [
+        "demonstration_markup.txt",
+        os.path.join("scripts", "demonstration_markup.txt")]:
+        if os.path.exists(filename):
+            input_file = open(filename)
+
+for markup in input_file:
+    demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n"))
+    is_uniform = demo.run_against(*parsers)
+    if is_uniform:
+        uniform_results.append(demo)
+    else:
+        different_results.append(demo)
+
+print "== Markup that's handled the same in every parser =="
+print
+for demo in uniform_results:
+    demo.dump()
+    print
+print "== Markup that's not handled the same in every parser =="
+print
+for demo in different_results:
+    demo.dump()
+    print
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/scripts/demonstration_markup.txt b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/scripts/demonstration_markup.txt
new file mode 100644
index 00000000000..a7914a0b04a
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/scripts/demonstration_markup.txt
@@ -0,0 +1,34 @@
+A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&amp;T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
+<div>This document ends before the entity finishes: &gt;
+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
+<div><table><tr><td>Here's a table</td></tr></table></div>
+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
+<div>This tag contains nothing but whitespace: <b> </b></div>
+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
+<div><table><div>This table contains bare markup</div></table></div>
+<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
+<div><our☃>Tag name contains Unicode characters</our☃></div>
+<div><a ☃="snowman">Attribute name contains Unicode characters</a></div>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/setup.cfg b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/setup.cfg
new file mode 100644
index 00000000000..8bfd5a12f85
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/setup.cfg
@@ -0,0 +1,4 @@
+[egg_info]
+tag_build = 
+tag_date = 0
+
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/setup.py b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/setup.py
new file mode 100644
index 00000000000..7b4b3931c3d
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/setup.py
@@ -0,0 +1,45 @@
+from setuptools import (
+    setup,
+    find_packages,
+)
+import sys
+
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
+setup(
+    name="beautifulsoup4",
+    # NOTE: We can't import __version__ from bs4 because bs4/__init__.py is Python 2 code,
+    # and converting it to Python 3 means going through this code to run 2to3.
+    # So we have to specify it twice for the time being.
+    version = '4.9.3',
+    author="Leonard Richardson",
+    author_email='leonardr@segfault.org',
+    url="http://www.crummy.com/software/BeautifulSoup/bs4/",
+    download_url = "http://www.crummy.com/software/BeautifulSoup/bs4/download/",
+    description="Screen-scraping library",
+    install_requires=[
+        "soupsieve >1.2; python_version>='3.0'",
+        "soupsieve >1.2, <2.0; python_version<'3.0'",
+    ],
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    license="MIT",
+    packages=find_packages(exclude=['tests*']),
+    extras_require = {
+        'lxml' : [ 'lxml'],
+        'html5lib' : ['html5lib'],
+    },
+    use_2to3 = True,
+    classifiers=["Development Status :: 5 - Production/Stable",
+                 "Intended Audience :: Developers",
+                 "License :: OSI Approved :: MIT License",
+                 "Programming Language :: Python",
+                 "Programming Language :: Python :: 2.7",
+                 'Programming Language :: Python :: 3',
+                 "Topic :: Text Processing :: Markup :: HTML",
+                 "Topic :: Text Processing :: Markup :: XML",
+                 "Topic :: Text Processing :: Markup :: SGML",
+                 "Topic :: Software Development :: Libraries :: Python Modules",
+                 ],
+)
diff --git a/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/test-all-versions b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/test-all-versions
new file mode 100755
index 00000000000..01e436b10ac
--- /dev/null
+++ b/chromium/third_party/catapult/third_party/beautifulsoup4-4.9.3/test-all-versions
@@ -0,0 +1 @@
+python2.7 -m unittest discover -s bs4 && ./convert-py3k