forked from pool/python-beautifulsoup4
Compare commits
13 Commits
| Author | SHA256 | Date | |
|---|---|---|---|
| b186ae5eb2 | |||
| acf4a24269 | |||
| a5ca8aee55 | |||
| 9851dc5e98 | |||
| ee642d2d2b | |||
| 28f35ad36c | |||
| 1333d09e56 | |||
| b4f8e272ce | |||
| 8076d806c7 | |||
| e0fde13fec | |||
| 89dfc2f1db | |||
| 9b0a4c4b8c | |||
| 1c3cfc65ef |
@@ -1,3 +0,0 @@
|
|||||||
version https://git-lfs.github.com/spec/v1
|
|
||||||
oid sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195
|
|
||||||
size 621067
|
|
||||||
3
beautifulsoup4-4.14.3.tar.gz
Normal file
3
beautifulsoup4-4.14.3.tar.gz
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86
|
||||||
|
size 627737
|
||||||
148
htmlparser.patch
Normal file
148
htmlparser.patch
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
From 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Mon Sep 17 00:00:00 2001
|
||||||
|
From: Leonard Richardson <leonardr@segfault.org>
|
||||||
|
Date: Mon, 8 Dec 2025 19:34:16 -0500
|
||||||
|
Subject: * Change the html.parser tree builder's code for handling numeric
|
||||||
|
character references, to avoid a crash when using Python versions that
|
||||||
|
include the fix to Python issue https://bugs.python.org/issue13633 (e.g.
|
||||||
|
Python 3.11.13). [bug=2134393]
|
||||||
|
|
||||||
|
---
|
||||||
|
CHANGELOG | 5 +++
|
||||||
|
bs4/builder/_htmlparser.py | 78 ++++++++++++++++++++++++++++++++++++--------
|
||||||
|
bs4/tests/test_htmlparser.py | 17 ++++++++++
|
||||||
|
3 files changed, 86 insertions(+), 14 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
|
||||||
|
index 165a3d8..ead800f 100644
|
||||||
|
--- a/bs4/builder/_htmlparser.py
|
||||||
|
+++ b/bs4/builder/_htmlparser.py
|
||||||
|
@@ -10,6 +10,7 @@ __all__ = [
|
||||||
|
]
|
||||||
|
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
+import re
|
||||||
|
|
||||||
|
from typing import (
|
||||||
|
Any,
|
||||||
|
@@ -223,6 +224,64 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
||||||
|
"""Handle some textual data that shows up between tags."""
|
||||||
|
self.soup.handle_data(data)
|
||||||
|
|
||||||
|
+ _DECIMAL_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9]+)(.*)")
|
||||||
|
+ _HEX_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9a-f]+)(.*)")
|
||||||
|
+
|
||||||
|
+ @classmethod
|
||||||
|
+ def _dereference_numeric_character_reference(cls, name:str) -> Tuple[str, bool, str]:
|
||||||
|
+ """Convert a numeric character reference into an actual character.
|
||||||
|
+
|
||||||
|
+ :param name: The number of the character reference, as
|
||||||
|
+ obtained by html.parser
|
||||||
|
+
|
||||||
|
+ :return: A 3-tuple (dereferenced, replacement_added,
|
||||||
|
+ extra_data). `dereferenced` is the dereferenced character
|
||||||
|
+ reference, or the empty string if there was no
|
||||||
|
+ reference. `replacement_added` is True if the reference
|
||||||
|
+ could only be dereferenced by replacing content with U+FFFD
|
||||||
|
+ REPLACEMENT CHARACTER. `extra_data` is a portion of data
|
||||||
|
+ following the character reference, which was deemed to be
|
||||||
|
+ normal data and not part of the reference at all.
|
||||||
|
+ """
|
||||||
|
+ dereferenced:str = ""
|
||||||
|
+ replacement_added:bool = False
|
||||||
|
+ extra_data:str = ""
|
||||||
|
+
|
||||||
|
+ base:int = 10
|
||||||
|
+ reg = cls._DECIMAL_REFERENCE_WITH_FOLLOWING_DATA
|
||||||
|
+ if name.startswith("x") or name.startswith("X"):
|
||||||
|
+ # Hex reference
|
||||||
|
+ name = name[1:]
|
||||||
|
+ base = 16
|
||||||
|
+ reg = cls._HEX_REFERENCE_WITH_FOLLOWING_DATA
|
||||||
|
+
|
||||||
|
+ real_name:Optional[int] = None
|
||||||
|
+ try:
|
||||||
|
+ real_name = int(name, base)
|
||||||
|
+ except ValueError:
|
||||||
|
+ # This is either bad data that starts with what looks like
|
||||||
|
+ # a numeric character reference, or a real numeric
|
||||||
|
+ # reference that wasn't terminated by a semicolon.
|
||||||
|
+ #
|
||||||
|
+ # The fix to https://bugs.python.org/issue13633 made it
|
||||||
|
+ # our responsibility to handle the extra data.
|
||||||
|
+ #
|
||||||
|
+ # To preserve the old behavior, we extract the numeric
|
||||||
|
+ # portion of the incoming "reference" and treat that as a
|
||||||
|
+ # numeric reference. All subsequent data will be processed
|
||||||
|
+ # as string data.
|
||||||
|
+ match = reg.search(name)
|
||||||
|
+ if match is not None:
|
||||||
|
+ real_name = int(match.groups()[0], base)
|
||||||
|
+ extra_data = match.groups()[1]
|
||||||
|
+
|
||||||
|
+ if real_name is None:
|
||||||
|
+ dereferenced = ""
|
||||||
|
+ extra_data = name
|
||||||
|
+ else:
|
||||||
|
+ dereferenced, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
|
||||||
|
+ return dereferenced, replacement_added, extra_data
|
||||||
|
+
|
||||||
|
def handle_charref(self, name: str) -> None:
|
||||||
|
"""Handle a numeric character reference by converting it to the
|
||||||
|
corresponding Unicode character and treating it as textual
|
||||||
|
@@ -230,22 +289,13 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
||||||
|
|
||||||
|
:param name: Character number, possibly in hexadecimal.
|
||||||
|
"""
|
||||||
|
- # TODO: This was originally a workaround for a bug in
|
||||||
|
- # HTMLParser. (http://bugs.python.org/issue13633) The bug has
|
||||||
|
- # been fixed, but removing this code still makes some
|
||||||
|
- # Beautiful Soup tests fail. This needs investigation.
|
||||||
|
- real_name:int
|
||||||
|
- if name.startswith("x"):
|
||||||
|
- real_name = int(name.lstrip("x"), 16)
|
||||||
|
- elif name.startswith("X"):
|
||||||
|
- real_name = int(name.lstrip("X"), 16)
|
||||||
|
- else:
|
||||||
|
- real_name = int(name)
|
||||||
|
-
|
||||||
|
- data, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
|
||||||
|
+ dereferenced, replacement_added, extra_data = self._dereference_numeric_character_reference(name)
|
||||||
|
if replacement_added:
|
||||||
|
self.soup.contains_replacement_characters = True
|
||||||
|
- self.handle_data(data)
|
||||||
|
+ if dereferenced is not None:
|
||||||
|
+ self.handle_data(dereferenced)
|
||||||
|
+ if extra_data is not None:
|
||||||
|
+ self.handle_data(extra_data)
|
||||||
|
|
||||||
|
def handle_entityref(self, name: str) -> None:
|
||||||
|
"""Handle a named entity reference by converting it to the
|
||||||
|
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
|
||||||
|
index 0086a9d..cb85b53 100644
|
||||||
|
--- a/bs4/tests/test_htmlparser.py
|
||||||
|
+++ b/bs4/tests/test_htmlparser.py
|
||||||
|
@@ -162,3 +162,20 @@ class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
|
||||||
|
# Since we do the replacement ourselves, we can set contains_replacement_characters appropriately.
|
||||||
|
# lxml and html5lib do the replacement so all we ever see is REPLACEMENT CHARACTER.
|
||||||
|
assert soup.contains_replacement_characters == True
|
||||||
|
+
|
||||||
|
+class TestBeautifulSoupHTMLParser:
|
||||||
|
+ def test_dereference_numeric_character_reference(self):
|
||||||
|
+ m = BeautifulSoupHTMLParser._dereference_numeric_character_reference
|
||||||
|
+ assert m("64") == ("@", False, "")
|
||||||
|
+ assert m("x64") == ("d", False, "")
|
||||||
|
+ assert m("X64") == ("d", False, "")
|
||||||
|
+ assert m("64andsomeextra") == ("@", False, "andsomeextra")
|
||||||
|
+ assert m("") == ("", False, "")
|
||||||
|
+ assert m("00whee") == ("<22>", True, "whee")
|
||||||
|
+ assert m("xfffdthatsit") == ("<22>", False, "thatsit")
|
||||||
|
+ assert m("xabcdplussomeextra") == ("ꯍ", False, "plussomeextra")
|
||||||
|
+ assert m("obviouslynotnumeric") == ("", False, "obviouslynotnumeric")
|
||||||
|
+
|
||||||
|
+ # These are almost certainly wrong but at least it doesn't crash.
|
||||||
|
+ assert m("xabcdandsomeextra") == ("\U000abcda", False, "ndsomeextra")
|
||||||
|
+ assert m("xffffffffffffffffffffffbeep") == ("<22>", True, "p")
|
||||||
|
--
|
||||||
|
cgit v1.2.3
|
||||||
|
|
||||||
|
|
||||||
@@ -1,3 +1,84 @@
|
|||||||
|
-------------------------------------------------------------------
|
||||||
|
Mon Dec 29 09:58:48 UTC 2025 - Markéta Machová <mmachova@suse.com>
|
||||||
|
|
||||||
|
- update to 4.14.3
|
||||||
|
* When using one of the lxml tree builders, you can pass in
|
||||||
|
huge_tree=True to disable lxml's security restrictions and process
|
||||||
|
files that include huge text nodes. ("huge" means more than
|
||||||
|
10,000,000 bytes of text in a single node). Without this, lxml may
|
||||||
|
silently stop processing the file after encountering a huge text
|
||||||
|
node.
|
||||||
|
* The html.parser tree builder processes numeric character entities
|
||||||
|
using the algorithm described in the HTML spec. If this means
|
||||||
|
replacing some other character with REPLACEMENT CHARACTER, it will
|
||||||
|
set BeautifulSoup.contains_replacement_characters.
|
||||||
|
* Added a general test of the html.parser tree builder's ability to
|
||||||
|
turn any parsing exception from html.parser into a
|
||||||
|
ParserRejectedMarkup exception. This makes it possible to remove
|
||||||
|
version-dependent tests that depended on the existence of specific
|
||||||
|
bugs in html.parser.
|
||||||
|
- Add htmlparser.patch to fix behaviour with cpython interpreters
|
||||||
|
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
Mon Oct 13 09:11:52 UTC 2025 - Dirk Müller <dmueller@suse.com>
|
||||||
|
|
||||||
|
- update to 4.14.2:
|
||||||
|
* Making ResultSet inherit from MutableSequence still resulted
|
||||||
|
in too many breaking changes in users of the library,
|
||||||
|
so I reverted the ResultSet code back to where it was in 4.13.5
|
||||||
|
and added tests of all known breaking behavior. [bug=2125906]
|
||||||
|
* Made ResultSet inherit from MutableSequence instead of
|
||||||
|
Sequence, since lots of existing code treats ResultSet as a
|
||||||
|
mutable list.
|
||||||
|
* This version adds function overloading to the find_* methods
|
||||||
|
to make it easier to write type-safe Python.
|
||||||
|
* The typing for find_parent() and find_parents() was improved
|
||||||
|
without any overloading. Casts should never be necessary,
|
||||||
|
since those methods only ever return Tag and ResultSet[Tag],
|
||||||
|
respectively.
|
||||||
|
* ResultSet now inherits from Sequence. This should make it
|
||||||
|
easier to incorporate ResultSet objects into your type system
|
||||||
|
without needing to handle ResultSet specially.
|
||||||
|
* Fixed an unhandled exception when creating the string
|
||||||
|
representation of a decomposed element.
|
||||||
|
* The default value for the 'attrs' attribute in find* methods
|
||||||
|
is now None, not the empty dictionary. This should have no visible
|
||||||
|
effect on anything.
|
||||||
|
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
Wed Sep 10 07:16:20 UTC 2025 - John Paul Adrian Glaubitz <adrian.glaubitz@suse.com>
|
||||||
|
|
||||||
|
- Update to 4.13.5
|
||||||
|
* Fixed an unhandled exception when parsing invalid markup that contains the { character
|
||||||
|
when using lxml==6.0.0. [bug=2116306]
|
||||||
|
* Fixed a regression when matching a multi-valued attribute against the
|
||||||
|
empty string. [bug=2115352]
|
||||||
|
* Unit tests and test case data are no longer packaged with the wheel. [bug=2107495]
|
||||||
|
* Fixed a bug that gave the wrong result when parsing the empty bytestring. [bug=2110492]
|
||||||
|
* Brought the Spanish translation of the documentation up to date with
|
||||||
|
4.13.4. Courtesy of Carlos Romero.
|
||||||
|
* For Python 3.13 and above, disabled tests that verify Beautiful Soup's handling of htmlparser's
|
||||||
|
exceptions when given very bad markup. The bug in htmlparser that caused
|
||||||
|
this behavior has been fixed. Patch courtesy of Stefano Rivera.
|
||||||
|
* Used overloading to improve type hints for prettify().
|
||||||
|
* Updated the SoupStrainer documentation to clarify that during initial
|
||||||
|
parsing, attribute values are always passed into the SoupStrainer as raw strings. [bug=2111651]
|
||||||
|
* Fixed all type checking errors issued by pyright. (Previously only mypy
|
||||||
|
was used for type checking.)
|
||||||
|
* Improved the type hints for PageElement.replace_with. [bug=2114746]
|
||||||
|
* Improved the type hint for the arguments of the lambda function that can
|
||||||
|
be used to match a tag's attribute. [bug=2110401]
|
||||||
|
* Modified some of the lxml tests to accommodate behavioral changes in libxml2
|
||||||
|
2.14.3. Specifically:
|
||||||
|
|
||||||
|
1. XML declarations and processing instructions in HTML documents
|
||||||
|
are rewritten as comments. Note that this means XHTML documents will
|
||||||
|
now turn into regular HTML documents if run through the 'lxml'
|
||||||
|
parser. The 'xml' parser is unaffected.
|
||||||
|
|
||||||
|
2. Out-of-range numeric entities are replaced with REPLACEMENT
|
||||||
|
CHARACTER rather than omitted entirely. [bug=2112242]
|
||||||
|
|
||||||
-------------------------------------------------------------------
|
-------------------------------------------------------------------
|
||||||
Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
||||||
|
|
||||||
@@ -26,17 +107,17 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
excludes every item in the parse tree. This is used internally
|
excludes every item in the parse tree. This is used internally
|
||||||
in situations where the provided filters are logically
|
in situations where the provided filters are logically
|
||||||
inconsistent or match a value against the null set.
|
inconsistent or match a value against the null set.
|
||||||
|
|
||||||
Without this, it's not always possible to distinguish between a
|
Without this, it's not always possible to distinguish between a
|
||||||
SoupStrainer that excludes everything and one that excludes
|
SoupStrainer that excludes everything and one that excludes
|
||||||
nothing.
|
nothing.
|
||||||
|
|
||||||
This fixes a bug where calls to find_* methods with no
|
This fixes a bug where calls to find_* methods with no
|
||||||
arguments returned None, instead of the first item out of the
|
arguments returned None, instead of the first item out of the
|
||||||
iterator. [bug=2097333]
|
iterator. [bug=2097333]
|
||||||
|
|
||||||
Things added to the API to support this:
|
Things added to the API to support this:
|
||||||
|
|
||||||
- The ElementFilter.includes_everything property
|
- The ElementFilter.includes_everything property
|
||||||
- The MatchRule.exclude_everything member
|
- The MatchRule.exclude_everything member
|
||||||
- The _known_rules argument to ElementFilter.match. This is an
|
- The _known_rules argument to ElementFilter.match. This is an
|
||||||
@@ -59,9 +140,9 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
code, which I've fixed, but the result is a larger-than-usual
|
code, which I've fixed, but the result is a larger-than-usual
|
||||||
number of deprecations and changes that may break backwards
|
number of deprecations and changes that may break backwards
|
||||||
compatibility.
|
compatibility.
|
||||||
|
|
||||||
Chris Papademetrious deserves a special thanks for his work on
|
Chris Papademetrious deserves a special thanks for his work on
|
||||||
this release through its long beta process.
|
this release through its long beta process.
|
||||||
## Deprecation notices
|
## Deprecation notices
|
||||||
* These things now give DeprecationWarnings when you try to use
|
* These things now give DeprecationWarnings when you try to use
|
||||||
them, and are scheduled to be removed in Beautiful Soup 4.15.0.
|
them, and are scheduled to be removed in Beautiful Soup 4.15.0.
|
||||||
@@ -70,7 +151,7 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
deprecated for a very long time, but they didn't issue
|
deprecated for a very long time, but they didn't issue
|
||||||
DeprecationWarning when you tried to use them. Now they do, and
|
DeprecationWarning when you tried to use them. Now they do, and
|
||||||
they're all going away soon.
|
they're all going away soon.
|
||||||
|
|
||||||
This mainly refers to methods and attributes with camelCase
|
This mainly refers to methods and attributes with camelCase
|
||||||
names, for example: renderContents, replaceWith,
|
names, for example: renderContents, replaceWith,
|
||||||
replaceWithChildren, findAll, findAllNext, findAllPrevious,
|
replaceWithChildren, findAll, findAllNext, findAllPrevious,
|
||||||
@@ -83,7 +164,7 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
nextSiblingGenerator, previousGenerator,
|
nextSiblingGenerator, previousGenerator,
|
||||||
previousSiblingGenerator, recursiveChildGenerator, and
|
previousSiblingGenerator, recursiveChildGenerator, and
|
||||||
parentGenerator.
|
parentGenerator.
|
||||||
|
|
||||||
This also includes the BeautifulStoneSoup class.
|
This also includes the BeautifulStoneSoup class.
|
||||||
* The SAXTreeBuilder class, which was never officially supported
|
* The SAXTreeBuilder class, which was never officially supported
|
||||||
or tested.
|
or tested.
|
||||||
@@ -116,36 +197,36 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
that if you set an attribute value to a number, it will be
|
that if you set an attribute value to a number, it will be
|
||||||
converted to a string immediately, rather than being converted
|
converted to a string immediately, rather than being converted
|
||||||
when you output the document. [bug=2065525]
|
when you output the document. [bug=2065525]
|
||||||
|
|
||||||
More importantly for backwards compatibility, setting an HTML
|
More importantly for backwards compatibility, setting an HTML
|
||||||
attribute value to True will set the attribute's value to the
|
attribute value to True will set the attribute's value to the
|
||||||
appropriate string per the HTML spec. Setting an attribute
|
appropriate string per the HTML spec. Setting an attribute
|
||||||
value to False or None will remove the attribute value from the
|
value to False or None will remove the attribute value from the
|
||||||
tag altogether, rather than (effectively, as before) setting
|
tag altogether, rather than (effectively, as before) setting
|
||||||
the value to the string "False" or the string "None".
|
the value to the string "False" or the string "None".
|
||||||
|
|
||||||
This means that some programs that modify documents will
|
This means that some programs that modify documents will
|
||||||
generate different output than they would in earlier versions
|
generate different output than they would in earlier versions
|
||||||
of Beautiful Soup, but the new documents are more likely to
|
of Beautiful Soup, but the new documents are more likely to
|
||||||
represent the intent behind the modifications.
|
represent the intent behind the modifications.
|
||||||
|
|
||||||
To give a specific example, if you have code that looks
|
To give a specific example, if you have code that looks
|
||||||
something like this:
|
something like this:
|
||||||
|
|
||||||
checkbox1['checked'] = True checkbox2['checked'] = False
|
checkbox1['checked'] = True checkbox2['checked'] = False
|
||||||
|
|
||||||
Then a document that used to look like this (with most browsers
|
Then a document that used to look like this (with most browsers
|
||||||
treating both boxes as checked):
|
treating both boxes as checked):
|
||||||
|
|
||||||
<input type="checkbox" checked="True"/> <input type="checkbox"
|
<input type="checkbox" checked="True"/> <input type="checkbox"
|
||||||
checked="False"/>
|
checked="False"/>
|
||||||
|
|
||||||
Will now look like this (with browsers treating only the first
|
Will now look like this (with browsers treating only the first
|
||||||
box as checked):
|
box as checked):
|
||||||
|
|
||||||
<input type="checkbox" checked="checked"/> <input
|
<input type="checkbox" checked="checked"/> <input
|
||||||
type="checkbox"/>
|
type="checkbox"/>
|
||||||
|
|
||||||
You can get the old behavior back by instantiating a
|
You can get the old behavior back by instantiating a
|
||||||
TreeBuilder with `attribute_dict_class=dict`, or you can
|
TreeBuilder with `attribute_dict_class=dict`, or you can
|
||||||
customize how Beautiful Soup treats attribute values by
|
customize how Beautiful Soup treats attribute values by
|
||||||
@@ -183,15 +264,15 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
This is the sort of change that might break your unit test
|
This is the sort of change that might break your unit test
|
||||||
suite, but the resulting markup will be much more readable and
|
suite, but the resulting markup will be much more readable and
|
||||||
more HTML5-ish.
|
more HTML5-ish.
|
||||||
|
|
||||||
To quickly get the old behavior back, change code like this:
|
To quickly get the old behavior back, change code like this:
|
||||||
|
|
||||||
tag.encode(formatter='html5')
|
tag.encode(formatter='html5')
|
||||||
|
|
||||||
to this:
|
to this:
|
||||||
|
|
||||||
tag.encode(formatter='html5-4.12')
|
tag.encode(formatter='html5-4.12')
|
||||||
|
|
||||||
In the future, the 'html5' formatter may become the default
|
In the future, the 'html5' formatter may become the default
|
||||||
HTML formatter, which will change Beautiful Soup's default
|
HTML formatter, which will change Beautiful Soup's default
|
||||||
output. This will break a lot of test suites so it's not going
|
output. This will break a lot of test suites so it's not going
|
||||||
@@ -201,22 +282,22 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
sometimes an Optional[int], and sometimes they were
|
sometimes an Optional[int], and sometimes they were
|
||||||
Optional[Tag], the result of searching for a child tag called
|
Optional[Tag], the result of searching for a child tag called
|
||||||
<sourceline> or <sourcepos>. [bug=2065904]
|
<sourceline> or <sourcepos>. [bug=2065904]
|
||||||
|
|
||||||
If your code does search for a tag called <sourceline> or
|
If your code does search for a tag called <sourceline> or
|
||||||
<sourcepos>, it may stop finding that tag when you upgrade to
|
<sourcepos>, it may stop finding that tag when you upgrade to
|
||||||
Beautiful Soup 4.13. If this happens, you'll need to replace
|
Beautiful Soup 4.13. If this happens, you'll need to replace
|
||||||
code that treats "sourceline" or "sourcepos" as tag names:
|
code that treats "sourceline" or "sourcepos" as tag names:
|
||||||
|
|
||||||
tag.sourceline
|
tag.sourceline
|
||||||
|
|
||||||
with code that explicitly calls the find() method:
|
with code that explicitly calls the find() method:
|
||||||
|
|
||||||
tag.find("sourceline").name
|
tag.find("sourceline").name
|
||||||
|
|
||||||
Making the behavior of sourceline and sourcepos consistent has
|
Making the behavior of sourceline and sourcepos consistent has
|
||||||
the side effect of fixing a major performance problem when a
|
the side effect of fixing a major performance problem when a
|
||||||
Tag is copied.
|
Tag is copied.
|
||||||
|
|
||||||
With this change, the store_line_numbers argument to the
|
With this change, the store_line_numbers argument to the
|
||||||
BeautifulSoup constructor becomes much less useful, and its use
|
BeautifulSoup constructor becomes much less useful, and its use
|
||||||
is now discouraged, though I'm not deprecating it yet. Please
|
is now discouraged, though I'm not deprecating it yet. Please
|
||||||
@@ -235,7 +316,7 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
html5lib's test suite, and Beautiful Soup isn't integrated into
|
html5lib's test suite, and Beautiful Soup isn't integrated into
|
||||||
that test suite, so this code was long since unused and
|
that test suite, so this code was long since unused and
|
||||||
untested.
|
untested.
|
||||||
|
|
||||||
These methods are _not_ deprecated, since they are methods
|
These methods are _not_ deprecated, since they are methods
|
||||||
defined by html5lib. They may one day have real
|
defined by html5lib. They may one day have real
|
||||||
implementations, as part of a future effort to integrate
|
implementations, as part of a future effort to integrate
|
||||||
@@ -283,14 +364,14 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
* Defined a number of new iterators which are the same as
|
* Defined a number of new iterators which are the same as
|
||||||
existing iterators, but which yield the element itself before
|
existing iterators, but which yield the element itself before
|
||||||
beginning to traverse the tree. [bug=2052936] [bug=2067634]
|
beginning to traverse the tree. [bug=2052936] [bug=2067634]
|
||||||
|
|
||||||
- PageElement.self_and_parents
|
- PageElement.self_and_parents
|
||||||
- PageElement.self_and_descendants
|
- PageElement.self_and_descendants
|
||||||
- PageElement.self_and_next_elements
|
- PageElement.self_and_next_elements
|
||||||
- PageElement.self_and_next_siblings
|
- PageElement.self_and_next_siblings
|
||||||
- PageElement.self_and_previous_elements
|
- PageElement.self_and_previous_elements
|
||||||
- PageElement.self_and_previous_siblings
|
- PageElement.self_and_previous_siblings
|
||||||
|
|
||||||
self_and_parents yields the element you call it on and then all
|
self_and_parents yields the element you call it on and then all
|
||||||
of its parents. self_and_next_elements yields the element you
|
of its parents. self_and_next_elements yields the element you
|
||||||
call it on and then every element parsed afterwards; and so on.
|
call it on and then every element parsed afterwards; and so on.
|
||||||
@@ -299,7 +380,7 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
a mixed list of Tag and NavigableString objects. [bug=2044794]
|
a mixed list of Tag and NavigableString objects. [bug=2044794]
|
||||||
* Defined a new method, Tag.copy_self(), which creates a copy of
|
* Defined a new method, Tag.copy_self(), which creates a copy of
|
||||||
a Tag with the same attributes but no contents. [bug=2065120]
|
a Tag with the same attributes but no contents. [bug=2065120]
|
||||||
|
|
||||||
Note that this method used to be a private method named
|
Note that this method used to be a private method named
|
||||||
_clone(). The _clone() method has been removed, so if you were
|
_clone(). The _clone() method has been removed, so if you were
|
||||||
using it, change your code to call copy_self() instead.
|
using it, change your code to call copy_self() instead.
|
||||||
@@ -320,12 +401,12 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
notices something unusual but not guaranteed to be wrong, like
|
notices something unusual but not guaranteed to be wrong, like
|
||||||
markup that looks like a URL (MarkupResemblesLocatorWarning) or
|
markup that looks like a URL (MarkupResemblesLocatorWarning) or
|
||||||
XML being run through an HTML parser (XMLParsedAsHTMLWarning).
|
XML being run through an HTML parser (XMLParsedAsHTMLWarning).
|
||||||
|
|
||||||
The text of these warnings has been revamped to explain in more
|
The text of these warnings has been revamped to explain in more
|
||||||
detail what is going on, how to check if you've made a mistake,
|
detail what is going on, how to check if you've made a mistake,
|
||||||
and how to make the warning go away if you are acting
|
and how to make the warning go away if you are acting
|
||||||
deliberately.
|
deliberately.
|
||||||
|
|
||||||
If these warnings are interfering with your workflow, or simply
|
If these warnings are interfering with your workflow, or simply
|
||||||
annoying you, you can filter all of them by filtering
|
annoying you, you can filter all of them by filtering
|
||||||
UnusualUsageWarning, without worrying about losing the warnings
|
UnusualUsageWarning, without worrying about losing the warnings
|
||||||
@@ -388,14 +469,14 @@ Sun Jul 13 14:04:39 UTC 2025 - Ben Greiner <code@bnavigator.de>
|
|||||||
(e.g. .descendants, which powers the find* methods) should now
|
(e.g. .descendants, which powers the find* methods) should now
|
||||||
work correctly in those cases, or at least not raise
|
work correctly in those cases, or at least not raise
|
||||||
exceptions.
|
exceptions.
|
||||||
|
|
||||||
As part of this work, I changed when the list comprehension
|
As part of this work, I changed when the list comprehension
|
||||||
determines the next element. Previously it was done after the
|
determines the next element. Previously it was done after the
|
||||||
yield statement; now it's done before the yield statement. This
|
yield statement; now it's done before the yield statement. This
|
||||||
lets you remove the yielded element in calling code, or modify
|
lets you remove the yielded element in calling code, or modify
|
||||||
it in a way that would break this calculation, without causing
|
it in a way that would break this calculation, without causing
|
||||||
an exception.
|
an exception.
|
||||||
|
|
||||||
So if your code relies on modifying the tree in a way that
|
So if your code relies on modifying the tree in a way that
|
||||||
'steers' a list comprehension, rather than using the list
|
'steers' a list comprehension, rather than using the list
|
||||||
comprehension to decide which bits of the tree to modify, it will
|
comprehension to decide which bits of the tree to modify, it will
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
#
|
#
|
||||||
# spec file for package python-beautifulsoup4
|
# spec file for package python-beautifulsoup4
|
||||||
#
|
#
|
||||||
# Copyright (c) 2025 SUSE LLC
|
# Copyright (c) 2025 SUSE LLC and contributors
|
||||||
#
|
#
|
||||||
# All modifications and additions to the file contributed by third parties
|
# All modifications and additions to the file contributed by third parties
|
||||||
# remain the property of their copyright owners, unless otherwise agreed
|
# remain the property of their copyright owners, unless otherwise agreed
|
||||||
@@ -18,12 +18,14 @@
|
|||||||
|
|
||||||
%{?sle15_python_module_pythons}
|
%{?sle15_python_module_pythons}
|
||||||
Name: python-beautifulsoup4
|
Name: python-beautifulsoup4
|
||||||
Version: 4.13.4
|
Version: 4.14.3
|
||||||
Release: 0
|
Release: 0
|
||||||
Summary: HTML/XML Parser for Quick-Turnaround Applications Like Screen-Scraping
|
Summary: HTML/XML Parser for Quick-Turnaround Applications Like Screen-Scraping
|
||||||
License: MIT
|
License: MIT
|
||||||
URL: https://www.crummy.com/software/BeautifulSoup/
|
URL: https://www.crummy.com/software/BeautifulSoup/
|
||||||
Source: https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-%{version}.tar.gz
|
Source: https://files.pythonhosted.org/packages/source/b/beautifulsoup4/beautifulsoup4-%{version}.tar.gz
|
||||||
|
# PATCH-FIX-UPSTREAM 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Change the html.parser tree builder's code for handling numeric character references
|
||||||
|
Patch0: htmlparser.patch
|
||||||
BuildRequires: %{python_module base >= 3.7}
|
BuildRequires: %{python_module base >= 3.7}
|
||||||
BuildRequires: %{python_module hatchling}
|
BuildRequires: %{python_module hatchling}
|
||||||
BuildRequires: %{python_module pip}
|
BuildRequires: %{python_module pip}
|
||||||
|
|||||||
Reference in New Issue
Block a user