Version in base suite: 0.4.2-1 Base version: lxml-html-clean_0.4.2-1 Target version: lxml-html-clean_0.4.4-1~deb13u1 Base file: /srv/ftp-master.debian.org/ftp/pool/main/l/lxml-html-clean/lxml-html-clean_0.4.2-1.dsc Target file: /srv/ftp-master.debian.org/policy/pool/main/l/lxml-html-clean/lxml-html-clean_0.4.4-1~deb13u1.dsc .github/workflows/main.yml | 4 CHANGES.rst | 25 +++++ README.md | 2 debian/changelog | 25 +++++ debian/control | 2 docs/usage.rst | 5 + lxml_html_clean/clean.py | 27 ++++++ setup.cfg | 5 - tests/test_clean.py | 198 ++++++++++++++++++++++++++++++++++++++++++++- tests/test_clean.txt | 2 tox.ini | 2 11 files changed, 284 insertions(+), 13 deletions(-) dpkg-source: warning: cannot verify inline signature for /srv/release.debian.org/tmp/tmpeqmmcsjy/lxml-html-clean_0.4.2-1.dsc: no acceptable signature found dpkg-source: warning: cannot verify inline signature for /srv/release.debian.org/tmp/tmpeqmmcsjy/lxml-html-clean_0.4.4-1~deb13u1.dsc: no acceptable signature found diff -Nru lxml-html-clean-0.4.2/.github/workflows/main.yml lxml-html-clean-0.4.4/.github/workflows/main.yml --- lxml-html-clean-0.4.2/.github/workflows/main.yml 2025-04-09 11:14:25.000000000 +0000 +++ lxml-html-clean-0.4.4/.github/workflows/main.yml 2026-02-27 09:32:37.000000000 +0000 @@ -19,17 +19,15 @@ uses: fedora-python/tox-github-action@main with: tox_env: ${{ matrix.tox_env }} - dnf_install: gcc libxml2-devel libxslt-devel strategy: matrix: tox_env: - - py36 - - py38 - py39 - py310 - py311 - py312 - py313 + - py314 - mypy # Use GitHub's Linux Docker host diff -Nru lxml-html-clean-0.4.2/CHANGES.rst lxml-html-clean-0.4.4/CHANGES.rst --- lxml-html-clean-0.4.2/CHANGES.rst 2025-04-09 11:14:25.000000000 +0000 +++ lxml-html-clean-0.4.4/CHANGES.rst 2026-02-27 09:32:37.000000000 +0000 @@ -6,6 +6,31 @@ Unreleased ========== +0.4.4 (2026-02-26) +================== + +Bugs fixed +---------- + +* Fixed a bug where Unicode escapes in CSS were not properly decoded + before security checks. 
This prevents attackers from bypassing filters + using escape sequences. +* Fixed a security issue where ``<base>`` tags could be used for URL + hijacking attacks. The ``<base>`` tag is now automatically removed + whenever the ``<head>`` tag is removed (via ``page_structure=True`` + or manual configuration), as ``<base>`` must be inside ``<head>`` + according to HTML specifications. + +0.4.3 (2025-10-02) +================== + +Maintenance +----------- + +* Tests updated to work correctly with new lxml and libxml2 releases. +* Python 3.6 and 3.7 are no longer tested. +* Improved documentation about CSS removal behavior. + 0.4.2 (2025-04-09) ================== diff -Nru lxml-html-clean-0.4.2/README.md lxml-html-clean-0.4.4/README.md --- lxml-html-clean-0.4.2/README.md 2025-04-09 11:14:25.000000000 +0000 +++ lxml-html-clean-0.4.4/README.md 2026-02-27 09:32:37.000000000 +0000 @@ -4,7 +4,7 @@ This project was initially a part of [lxml](https://github.com/lxml/lxml). Because HTML cleaner is designed as blocklist-based, many reports about possible security vulnerabilities were filed for lxml and that make the project problematic for security-sensitive environments. Therefore we decided to extract the problematic part to a separate project. -**Important**: the HTML Cleaner in ``lxml_html_clean`` is **not** considered appropriate **for security sensitive environments**. See e.g. [bleach](https://pypi.org/project/bleach/) for an alternative. +**Important**: the HTML Cleaner in ``lxml_html_clean`` is **not** considered appropriate **for security sensitive environments**. See e.g. [nh3](https://pypi.org/project/nh3/) for an alternative. This project uses functions from Python's `urllib.parse` for URL parsing which **do not validate inputs**. For more information on potential security risks, refer to the [URL parsing security](https://docs.python.org/3/library/urllib.parse.html#url-parsing-security) documentation. A maliciously crafted URL could potentially bypass the allowed hosts check in `Cleaner`. 
diff -Nru lxml-html-clean-0.4.2/debian/changelog lxml-html-clean-0.4.4/debian/changelog --- lxml-html-clean-0.4.2/debian/changelog 2025-04-23 05:47:04.000000000 +0000 +++ lxml-html-clean-0.4.4/debian/changelog 2026-06-18 17:48:59.000000000 +0000 @@ -1,3 +1,28 @@ +lxml-html-clean (0.4.4-1~deb13u1) trixie; urgency=medium + + * Non-maintainer upload. + * Rebuild for trixie. + - CVE-2026-28348: CSS @import Filter Bypass via Unicode Escapes + - CVE-2026-28350: <base> tag injection through default Cleaner + configuration + + + -- Adrian Bunk Thu, 18 Jun 2026 20:48:59 +0300 + +lxml-html-clean (0.4.4-1) unstable; urgency=medium + + * New upstream version. + * Bump standards version. + + -- Matthias Klose Fri, 06 Mar 2026 08:37:31 +0100 + +lxml-html-clean (0.4.3-1) unstable; urgency=medium + + * New upstream version. Closes: #1114193. + * Bump standards version. + + -- Matthias Klose Sun, 05 Oct 2025 11:15:18 +0200 + lxml-html-clean (0.4.2-1) unstable; urgency=medium * New upstream version. diff -Nru lxml-html-clean-0.4.2/debian/control lxml-html-clean-0.4.4/debian/control --- lxml-html-clean-0.4.2/debian/control 2024-08-02 00:41:19.000000000 +0000 +++ lxml-html-clean-0.4.4/debian/control 2026-03-06 07:37:31.000000000 +0000 @@ -9,7 +9,7 @@ python3-setuptools, python3-all, python3-lxml, -Standards-Version: 4.7.0 +Standards-Version: 4.7.3 Homepage: https://github.com/fedora-python/lxml_html_clean Package: python3-lxml-html-clean diff -Nru lxml-html-clean-0.4.2/docs/usage.rst lxml-html-clean-0.4.4/docs/usage.rst --- lxml-html-clean-0.4.2/docs/usage.rst 2025-04-09 11:14:25.000000000 +0000 +++ lxml-html-clean-0.4.4/docs/usage.rst 2026-02-27 09:32:37.000000000 +0000 @@ -109,6 +109,11 @@ +To control the removal of CSS styles, set the ``style`` and/or ``inline_style`` +keyword arguments to ``True`` when creating a ``Cleaner`` instance. +If neither option is enabled, only ``@import`` rules are automatically removed +from CSS content. 
+ You can also whitelist some otherwise dangerous content with ``Cleaner(host_whitelist=['www.youtube.com'])``, which would allow embedded media from YouTube, while still filtering out embedded media diff -Nru lxml-html-clean-0.4.2/lxml_html_clean/clean.py lxml-html-clean-0.4.4/lxml_html_clean/clean.py --- lxml-html-clean-0.4.2/lxml_html_clean/clean.py 2025-04-09 11:14:25.000000000 +0000 +++ lxml-html-clean-0.4.4/lxml_html_clean/clean.py 2026-02-27 09:32:37.000000000 +0000 @@ -422,6 +422,12 @@ if self.annoying_tags: remove_tags.update(('blink', 'marquee')) + # Remove <base> tags whenever <head> is being removed. + # According to HTML spec, <base> must be in <head>, but browsers + # may interpret it even when misplaced, allowing URL hijacking attacks. + if 'head' in kill_tags or 'head' in remove_tags: + kill_tags.add('base') + _remove = deque() _kill = deque() for el in doc.iter(): @@ -578,6 +584,26 @@ _comments_re = re.compile(r'/\*.*?\*/', re.S) _find_comments = _comments_re.finditer _substitute_comments = _comments_re.sub + _css_unicode_escape_re = re.compile(r'\\([0-9a-fA-F]{1,6})\s?') + + def _decode_css_unicode_escapes(self, style): + """ + Decode CSS Unicode escape sequences like \\69 or \\000069 to their + actual character values. This prevents bypassing security checks + using CSS escape sequences. + + CSS escape syntax: backslash followed by 1-6 hex digits, + optionally followed by a whitespace character. + """ + def replace_escape(match): + hex_value = match.group(1) + try: + return chr(int(hex_value, 16)) + except (ValueError, OverflowError): + # Invalid unicode codepoint, keep original + return match.group(0) + + return self._css_unicode_escape_re.sub(replace_escape, style) def _has_sneaky_javascript(self, style): """ @@ -591,6 +617,7 @@ more sneaky attempts. 
""" style = self._substitute_comments('', style) + style = self._decode_css_unicode_escapes(style) style = style.replace('\\', '') style = _substitute_whitespace('', style) style = style.lower() diff -Nru lxml-html-clean-0.4.2/setup.cfg lxml-html-clean-0.4.4/setup.cfg --- lxml-html-clean-0.4.2/setup.cfg 2025-04-09 11:14:25.000000000 +0000 +++ lxml-html-clean-0.4.4/setup.cfg 2026-02-27 09:32:37.000000000 +0000 @@ -1,6 +1,6 @@ [metadata] name = lxml_html_clean -version = 0.4.2 +version = 0.4.4 description = HTML cleaner from lxml project long_description = file:README.md long_description_content_type = text/markdown @@ -13,14 +13,13 @@ license_files = LICENSE.txt classifiers = Programming Language :: Python :: 3 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 [options] packages = diff -Nru lxml-html-clean-0.4.2/tests/test_clean.py lxml-html-clean-0.4.4/tests/test_clean.py --- lxml-html-clean-0.4.2/tests/test_clean.py 2025-04-09 11:14:25.000000000 +0000 +++ lxml-html-clean-0.4.4/tests/test_clean.py 2026-02-27 09:32:37.000000000 +0000 @@ -331,20 +331,20 @@ def test_host_whitelist_valid(self): # Frame with valid hostname in src is allowed. - html = '

' expected = '

' cleaner = Cleaner(frames=False, host_whitelist=["example.com"]) self.assertEqual(expected, cleaner.clean_html(html)) def test_host_whitelist_invalid(self): - html = '

' expected = '

' cleaner = Cleaner(frames=False, host_whitelist=["example.com"]) self.assertEqual(expected, cleaner.clean_html(html)) def test_host_whitelist_sneaky_userinfo(self): # Regression test: Don't be fooled by hostname and colon in userinfo. - html = '

' expected = '

' cleaner = Cleaner(frames=False, host_whitelist=["example.com"]) self.assertEqual(expected, cleaner.clean_html(html)) @@ -393,3 +393,195 @@ self.assertEqual(len(w), 0) self.assertNotIn("google.com", result) self.assertNotIn("example.com", result) + + def test_base_tag_removed_with_page_structure(self): + # Test that <base> tags are removed when page_structure=True (default) + # This prevents URL hijacking attacks where <base> redirects all relative URLs + + test_cases = [ + # <base> in proper location (inside <head>) + 'link', + # <base> outside <head> + '

link

', + # Multiple <base> tags + '

', + # <base> with target attribute + '

content

', + # at various positions + 'test', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + # Verify tag is completely removed + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + self.assertNotIn('evil2.com', cleaned) + + def test_base_tag_kept_when_page_structure_false(self): + # When page_structure=False and head is not removed, should be kept + cleaner = Cleaner(page_structure=False) + html = 'test' + cleaned = cleaner.clean_html(html) + self.assertIn('', cleaned) + + def test_base_tag_removed_when_head_in_remove_tags(self): + # Even with page_structure=False, should be removed if head is manually removed + cleaner = Cleaner(page_structure=False, remove_tags=['head']) + html = 'test' + cleaned = cleaner.clean_html(html) + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + + def test_base_tag_removed_when_head_in_kill_tags(self): + # Even with page_structure=False, should be removed if head is in kill_tags + cleaner = Cleaner(page_structure=False, kill_tags=['head']) + html = 'test' + cleaned = cleaner.clean_html(html) + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + + def test_unicode_escape_in_style(self): + # Test that CSS Unicode escapes are properly decoded before security checks + # This prevents attackers from bypassing filters using escape sequences + # CSS escape syntax: \HHHHHH where H is a hex digit (1-6 digits) + + # Test inline style attributes (requires safe_attrs_only=False) + cleaner = Cleaner(safe_attrs_only=False) + inline_style_cases = [ + # \6a\61\76\61\73\63\72\69\70\74 = "javascript" + ('

test

', '

test

'), + # \69 = 'i', so \69mport = "import" + ('

test

', '

test

'), + # \69 with space after = 'i', space consumed as part of escape + ('

test

', '

test

'), + # \65\78\70\72\65\73\73\69\6f\6e = "expression" + ('

test

', '

test

'), + ] + + for html, expected in inline_style_cases: + with self.subTest(html=html): + cleaned = cleaner.clean_html(html) + self.assertEqual(expected, cleaned) + + # Test ', + # Unicode-escaped "javascript:" without url() + '', + # Unicode-escaped "expression" + '', + # Unicode-escaped @import with 'i' + '', + # Unicode-escaped "data:" scheme + '', + # Space after escape is consumed: \69 mport = "import" + '', + # 6-digit escape: \000069 = 'i' + '', + # 6-digit escape with space + '', + ] + + for html in style_tag_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) + + def test_unicode_escape_mixed_with_comments(self): + # Unicode escapes mixed with CSS comments should still be caught + test_cases = [ + # \69 = 'i' with comment before + '', + # \69 = 'i' with comment after + '', + # Multiple escapes with comments + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) + + def test_unicode_escape_case_insensitive(self): + # CSS hex escapes should work with both uppercase and lowercase hex digits + # \69 = 'i', \6D = 'm', etc. + test_cases = [ + # @import with uppercase hex digits: \69\6D\70\6F\72\74 + '', + # @import with some uppercase + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) + + def test_unicode_escape_various_schemes(self): + # Test Unicode escapes for various malicious schemes + test_cases = [ + # \76\62\73\63\72\69\70\74 = "vbscript" + '', + # \6a\73\63\72\69\70\74 = "jscript" + '', + # \6c\69\76\65\73\63\72\69\70\74 = "livescript" + '', + # \6d\6f\63\68\61 = "mocha" + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) + + def test_unicode_escape_with_whitespace_variations(self): + # Test different whitespace characters after Unicode escapes + cleaner = Cleaner(safe_attrs_only=False) + test_cases = [ + # Tab after escape + ('

test

', '

test

'), + # Newline after escape (note: actual newline, not \n) + ('

test

', '

test

'), + # Form feed after escape + ('

test

', '

test

'), + ] + + for html, expected in test_cases: + with self.subTest(html=html): + cleaned = cleaner.clean_html(html) + self.assertEqual(expected, cleaned) + + def test_backslash_removal_after_unicode_decode(self): + # After decoding Unicode escapes, remaining backslashes are removed + # This ensures double-obfuscation (unicode + backslashes) is caught + test_cases = [ + # Step 1: \69 → 'i', Step 2: remove \, Result: @import + '', + # Multiple unicode escapes with backslashes mixed in + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) + + def test_backslash_obfuscation_without_unicode(self): + # Test that patterns using ONLY backslash obfuscation (no unicode) are caught + # Step 1: No unicode escapes, Step 2: remove \, Result: malicious pattern + test_cases = [ + # @\i\m\p\o\r\t → @import (caught by '@import' check) + '', + # Can also test combinations that create javascript schemes + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) diff -Nru lxml-html-clean-0.4.2/tests/test_clean.txt lxml-html-clean-0.4.4/tests/test_clean.txt --- lxml-html-clean-0.4.2/tests/test_clean.txt 2025-04-09 11:14:25.000000000 +0000 +++ lxml-html-clean-0.4.4/tests/test_clean.txt 2026-02-27 09:32:37.000000000 +0000 @@ -84,7 +84,7 @@ a link - a control char link + a control char link data another link

a paragraph

diff -Nru lxml-html-clean-0.4.2/tox.ini lxml-html-clean-0.4.4/tox.ini --- lxml-html-clean-0.4.2/tox.ini 2025-04-09 11:14:25.000000000 +0000 +++ lxml-html-clean-0.4.4/tox.ini 2026-02-27 09:32:37.000000000 +0000 @@ -1,5 +1,5 @@ [tox] -envlist = py36,py38,py39,py310,py311,py312,py313,mypy +envlist = py39,py310,py311,py312,py313,py314,mypy skipsdist = True [testenv]