Version in base suite: 2.4.1-2

Base version: python-scrapy_2.4.1-2
Target version: python-scrapy_2.4.1-2+deb11u1
Base file: /srv/ftp-master.debian.org/ftp/pool/main/p/python-scrapy/python-scrapy_2.4.1-2.dsc
Target file: /srv/ftp-master.debian.org/policy/pool/main/p/python-scrapy/python-scrapy_2.4.1-2+deb11u1.dsc

 changelog                    |  12 +
 patches/CVE-2021-41125.patch | 206 +++++++++++++++++++++++++++++++++
 patches/CVE-2022-0577.patch  | 266 +++++++++++++++++++++++++++++++++++++++++++
 patches/series               |   2
 4 files changed, 486 insertions(+)

diff -Nru python-scrapy-2.4.1/debian/changelog python-scrapy-2.4.1/debian/changelog
--- python-scrapy-2.4.1/debian/changelog	2021-02-28 13:55:45.000000000 +0000
+++ python-scrapy-2.4.1/debian/changelog	2022-05-20 20:11:00.000000000 +0000
@@ -1,3 +1,15 @@
+python-scrapy (2.4.1-2+deb11u1) bullseye; urgency=medium
+
+  * Team upload.
+  * Security fix for CVE-2021-41125: Don't send authentication data with all
+    requests. Provide a http_auth_domain spider attribute to control which
+    domains are allowed to receive the configured HTTP authentication
+    credentials.
+  * Security Fix CVE-2022-0577: Don't expose cookies cross-domain when
+    redirected. (Closes: #1008234)
+
+ -- Stefano Rivera <stefanor@debian.org>  Fri, 20 May 2022 16:11:00 -0400
+
 python-scrapy (2.4.1-2) unstable; urgency=medium
 
   * Skip tests that require network access (Closes: #980901).
diff -Nru python-scrapy-2.4.1/debian/patches/CVE-2021-41125.patch python-scrapy-2.4.1/debian/patches/CVE-2021-41125.patch
--- python-scrapy-2.4.1/debian/patches/CVE-2021-41125.patch	1970-01-01 00:00:00.000000000 +0000
+++ python-scrapy-2.4.1/debian/patches/CVE-2021-41125.patch	2022-05-20 20:11:00.000000000 +0000
@@ -0,0 +1,206 @@
+From: Andrey Rakhmatullin
+Date: Fri, 16 Aug 2019 14:53:42 +0500
+Subject: Add http_auth_domain to HttpAuthMiddleware.
+
+Fixes CVE-2021-41125
+Origin: upstream, https://github.com/scrapy/scrapy/commit/b01d69a1bf48060daec8f751368622352d8b85a6
+---
+ docs/topics/downloader-middleware.rst       | 18 +++++-
+ scrapy/downloadermiddlewares/httpauth.py    | 21 ++++++-
+ tests/test_downloadermiddleware_httpauth.py | 85 ++++++++++++++++++++++++++++-
+ 3 files changed, 118 insertions(+), 6 deletions(-)
+
+diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst
+index 6801adc..e0a3205 100644
+--- a/docs/topics/downloader-middleware.rst
++++ b/docs/topics/downloader-middleware.rst
+@@ -323,8 +323,21 @@ HttpAuthMiddleware
+     This middleware authenticates all requests generated from certain spiders
+     using `Basic access authentication`_ (aka. HTTP auth).
+ 
+-    To enable HTTP authentication from certain spiders, set the ``http_user``
+-    and ``http_pass`` attributes of those spiders.
++    To enable HTTP authentication for a spider, set the ``http_user`` and
++    ``http_pass`` spider attributes to the authentication data and the
++    ``http_auth_domain`` spider attribute to the domain which requires this
++    authentication (its subdomains will be also handled in the same way).
++    You can set ``http_auth_domain`` to ``None`` to enable the
++    authentication for all requests but usually this is not needed.
++
++    .. warning::
++        In the previous Scrapy versions HttpAuthMiddleware sent the
++        authentication data with all requests, which is a security problem if
++        the spider makes requests to several different domains. Currently if
++        the ``http_auth_domain`` attribute is not set, the middleware will use
++        the domain of the first request, which will work for some spider but
++        not for others. In the future the middleware will produce an error
++        instead.
+ 
+     Example::
+ 
+@@ -334,6 +347,7 @@ HttpAuthMiddleware
+ 
+             http_user = 'someuser'
+             http_pass = 'somepass'
++            http_auth_domain = 'intranet.example.com'
+             name = 'intranet.example.com'
+ 
+             # .. rest of the spider code omitted ...
+diff --git a/scrapy/downloadermiddlewares/httpauth.py b/scrapy/downloadermiddlewares/httpauth.py
+index 089bf0d..1bee3e2 100644
+--- a/scrapy/downloadermiddlewares/httpauth.py
++++ b/scrapy/downloadermiddlewares/httpauth.py
+@@ -3,10 +3,14 @@ HTTP basic auth downloader middleware
+ 
+ See documentation in docs/topics/downloader-middleware.rst
+ """
++import warnings
+ 
+ from w3lib.http import basic_auth_header
+ 
+ from scrapy import signals
++from scrapy.exceptions import ScrapyDeprecationWarning
++from scrapy.utils.httpobj import urlparse_cached
++from scrapy.utils.url import url_is_from_any_domain
+ 
+ 
+ class HttpAuthMiddleware:
+@@ -24,8 +28,23 @@ class HttpAuthMiddleware:
+         pwd = getattr(spider, 'http_pass', '')
+         if usr or pwd:
+             self.auth = basic_auth_header(usr, pwd)
++            if not hasattr(spider, 'http_auth_domain'):
++                warnings.warn('Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security '
++                              'problems if the spider makes requests to several different domains. http_auth_domain '
++                              'will be set to the domain of the first request, please set it to the correct value '
++                              'explicitly.',
++                              category=ScrapyDeprecationWarning)
++                self.domain_unset = True
++            else:
++                self.domain = spider.http_auth_domain
++                self.domain_unset = False
+ 
+     def process_request(self, request, spider):
+         auth = getattr(self, 'auth', None)
+         if auth and b'Authorization' not in request.headers:
+-            request.headers[b'Authorization'] = auth
++            domain = urlparse_cached(request).hostname
++            if self.domain_unset:
++                self.domain = domain
++                self.domain_unset = False
++            if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
++                request.headers[b'Authorization'] = auth
+diff --git a/tests/test_downloadermiddleware_httpauth.py b/tests/test_downloadermiddleware_httpauth.py
+index 3381632..0362e20 100644
+--- a/tests/test_downloadermiddleware_httpauth.py
++++ b/tests/test_downloadermiddleware_httpauth.py
+@@ -1,13 +1,60 @@
+ import unittest
+ 
++from w3lib.http import basic_auth_header
++
+ from scrapy.http import Request
+ from scrapy.downloadermiddlewares.httpauth import HttpAuthMiddleware
+ from scrapy.spiders import Spider
+ 
+ 
++class TestSpiderLegacy(Spider):
++    http_user = 'foo'
++    http_pass = 'bar'
++
++
+ class TestSpider(Spider):
+     http_user = 'foo'
+     http_pass = 'bar'
++    http_auth_domain = 'example.com'
++
++
++class TestSpiderAny(Spider):
++    http_user = 'foo'
++    http_pass = 'bar'
++    http_auth_domain = None
++
++
++class HttpAuthMiddlewareLegacyTest(unittest.TestCase):
++
++    def setUp(self):
++        self.spider = TestSpiderLegacy('foo')
++
++    def test_auth(self):
++        mw = HttpAuthMiddleware()
++        mw.spider_opened(self.spider)
++
++        # initial request, sets the domain and sends the header
++        req = Request('http://example.com/')
++        assert mw.process_request(req, self.spider) is None
++        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
++
++        # subsequent request to the same domain, should send the header
++        req = Request('http://example.com/')
++        assert mw.process_request(req, self.spider) is None
++        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
++
++        # subsequent request to a different domain, shouldn't send the header
++        req = Request('http://example-noauth.com/')
++        assert mw.process_request(req, self.spider) is None
++        self.assertNotIn('Authorization', req.headers)
++
++    def test_auth_already_set(self):
++        mw = HttpAuthMiddleware()
++        mw.spider_opened(self.spider)
++        req = Request('http://example.com/',
++                      headers=dict(Authorization='Digest 123'))
++        assert mw.process_request(req, self.spider) is None
++        self.assertEqual(req.headers['Authorization'], b'Digest 123')
+ 
+ 
+ class HttpAuthMiddlewareTest(unittest.TestCase):
+@@ -20,13 +67,45 @@ class HttpAuthMiddlewareTest(unittest.TestCase):
+     def tearDown(self):
+         del self.mw
+ 
++    def test_no_auth(self):
++        req = Request('http://example-noauth.com/')
++        assert self.mw.process_request(req, self.spider) is None
++        self.assertNotIn('Authorization', req.headers)
++
++    def test_auth_domain(self):
++        req = Request('http://example.com/')
++        assert self.mw.process_request(req, self.spider) is None
++        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
++
++    def test_auth_subdomain(self):
++        req = Request('http://foo.example.com/')
++        assert self.mw.process_request(req, self.spider) is None
++        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
++
++    def test_auth_already_set(self):
++        req = Request('http://example.com/',
++                      headers=dict(Authorization='Digest 123'))
++        assert self.mw.process_request(req, self.spider) is None
++        self.assertEqual(req.headers['Authorization'], b'Digest 123')
++
++
++class HttpAuthAnyMiddlewareTest(unittest.TestCase):
++
++    def setUp(self):
++        self.mw = HttpAuthMiddleware()
++        self.spider = TestSpiderAny('foo')
++        self.mw.spider_opened(self.spider)
++
++    def tearDown(self):
++        del self.mw
++
+     def test_auth(self):
+-        req = Request('http://scrapytest.org/')
++        req = Request('http://example.com/')
+         assert self.mw.process_request(req, self.spider) is None
+-        self.assertEqual(req.headers['Authorization'], b'Basic Zm9vOmJhcg==')
++        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+ 
+     def test_auth_already_set(self):
+-        req = Request('http://scrapytest.org/',
++        req = Request('http://example.com/',
+                       headers=dict(Authorization='Digest 123'))
+         assert self.mw.process_request(req, self.spider) is None
+         self.assertEqual(req.headers['Authorization'], b'Digest 123')
diff -Nru python-scrapy-2.4.1/debian/patches/CVE-2022-0577.patch python-scrapy-2.4.1/debian/patches/CVE-2022-0577.patch
--- python-scrapy-2.4.1/debian/patches/CVE-2022-0577.patch	1970-01-01 00:00:00.000000000 +0000
+++ python-scrapy-2.4.1/debian/patches/CVE-2022-0577.patch	2022-05-20 20:11:00.000000000 +0000
@@ -0,0 +1,266 @@
+From: =?utf-8?q?Adri=C3=A1n_Chaves?=
+Date: Tue, 1 Mar 2022 12:26:05 +0100
+Subject: Merge pull request from GHSA-cjvr-mfj7-j4j8
+
+* Do not carry over cookies to a different domain on redirect
+
+* Cover the cookie-domain redirect fix in the release notes
+
+* Cover 1.8.2 in the release notes
+
+* Fix redirect Cookie handling when the cookie middleware is disabled
+
+* Update the 1.8.2 release date
+
+Fixes CVE-2022-0577
+Origin: upstream, https://github.com/scrapy/scrapy/commit/8ce01b3b76d4634f55067d6cfdf632ec70ba304a
+Bug-Debian: https://bugs.debian.org/1008234
+---
+ scrapy/downloadermiddlewares/redirect.py   |  30 +++++-
+ tests/test_downloadermiddleware_cookies.py | 155 +++++++++++++++++++++++++++++
+ 2 files changed, 180 insertions(+), 5 deletions(-)
+
+diff --git a/scrapy/downloadermiddlewares/redirect.py b/scrapy/downloadermiddlewares/redirect.py
+index 4053fec..c8c84ff 100644
+--- a/scrapy/downloadermiddlewares/redirect.py
++++ b/scrapy/downloadermiddlewares/redirect.py
+@@ -4,6 +4,7 @@ from urllib.parse import urljoin, urlparse
+ from w3lib.url import safe_url_string
+ 
+ from scrapy.http import HtmlResponse
++from scrapy.utils.httpobj import urlparse_cached
+ from scrapy.utils.response import get_meta_refresh
+ from scrapy.exceptions import IgnoreRequest, NotConfigured
+ 
+@@ -11,6 +12,20 @@ from scrapy.exceptions import IgnoreRequest, NotConfigured
+ logger = logging.getLogger(__name__)
+ 
+ 
++def _build_redirect_request(source_request, *, url, **kwargs):
++    redirect_request = source_request.replace(
++        url=url,
++        **kwargs,
++        cookies=None,
++    )
++    if 'Cookie' in redirect_request.headers:
++        source_request_netloc = urlparse_cached(source_request).netloc
++        redirect_request_netloc = urlparse_cached(redirect_request).netloc
++        if source_request_netloc != redirect_request_netloc:
++            del redirect_request.headers['Cookie']
++    return redirect_request
++
++
+ class BaseRedirectMiddleware:
+ 
+     enabled_setting = 'REDIRECT_ENABLED'
+@@ -47,10 +62,15 @@ class BaseRedirectMiddleware:
+             raise IgnoreRequest("max redirections reached")
+ 
+     def _redirect_request_using_get(self, request, redirect_url):
+-        redirected = request.replace(url=redirect_url, method='GET', body='')
+-        redirected.headers.pop('Content-Type', None)
+-        redirected.headers.pop('Content-Length', None)
+-        return redirected
++        redirect_request = _build_redirect_request(
++            request,
++            url=redirect_url,
++            method='GET',
++            body='',
++        )
++        redirect_request.headers.pop('Content-Type', None)
++        redirect_request.headers.pop('Content-Length', None)
++        return redirect_request
+ 
+ 
+ class RedirectMiddleware(BaseRedirectMiddleware):
+@@ -80,7 +100,7 @@ class RedirectMiddleware(BaseRedirectMiddleware):
+         redirected_url = urljoin(request.url, location)
+ 
+         if response.status in (301, 307, 308) or request.method == 'HEAD':
+-            redirected = request.replace(url=redirected_url)
++            redirected = _build_redirect_request(request, url=redirected_url)
+             return self._redirect(redirected, request, spider, response.status)
+ 
+         redirected = self._redirect_request_using_get(request, redirected_url)
+diff --git a/tests/test_downloadermiddleware_cookies.py b/tests/test_downloadermiddleware_cookies.py
+index aff8542..5263f63 100644
+--- a/tests/test_downloadermiddleware_cookies.py
++++ b/tests/test_downloadermiddleware_cookies.py
+@@ -6,8 +6,10 @@ import pytest
+ 
+ from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
+ from scrapy.downloadermiddlewares.defaultheaders import DefaultHeadersMiddleware
++from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
+ from scrapy.exceptions import NotConfigured
+ from scrapy.http import Response, Request
++from scrapy.settings import Settings
+ from scrapy.spiders import Spider
+ from scrapy.utils.python import to_bytes
+ from scrapy.utils.test import get_crawler
+@@ -23,9 +25,11 @@ class CookiesMiddlewareTest(TestCase):
+     def setUp(self):
+         self.spider = Spider('foo')
+         self.mw = CookiesMiddleware()
++        self.redirect_middleware = RedirectMiddleware(settings=Settings())
+ 
+     def tearDown(self):
+         del self.mw
++        del self.redirect_middleware
+ 
+     def test_basic(self):
+         req = Request('http://scrapytest.org/')
+@@ -347,3 +351,154 @@ class CookiesMiddlewareTest(TestCase):
+         self.assertCookieValEqual(req1.headers['Cookie'], 'key=value1')
+         self.assertCookieValEqual(req2.headers['Cookie'], 'key=value2')
+         self.assertCookieValEqual(req3.headers['Cookie'], 'key=')
++
++    def _test_cookie_redirect(
++        self,
++        source,
++        target,
++        *,
++        cookies1,
++        cookies2,
++    ):
++        input_cookies = {'a': 'b'}
++
++        if not isinstance(source, dict):
++            source = {'url': source}
++        if not isinstance(target, dict):
++            target = {'url': target}
++        target.setdefault('status', 301)
++
++        request1 = Request(cookies=input_cookies, **source)
++        self.mw.process_request(request1, self.spider)
++        cookies = request1.headers.get('Cookie')
++        self.assertEqual(cookies, b"a=b" if cookies1 else None)
++
++        response = Response(
++            headers={
++                'Location': target['url'],
++            },
++            **target,
++        )
++        self.assertEqual(
++            self.mw.process_response(request1, response, self.spider),
++            response,
++        )
++
++        request2 = self.redirect_middleware.process_response(
++            request1,
++            response,
++            self.spider,
++        )
++        self.assertIsInstance(request2, Request)
++
++        self.mw.process_request(request2, self.spider)
++        cookies = request2.headers.get('Cookie')
++        self.assertEqual(cookies, b"a=b" if cookies2 else None)
++
++    def test_cookie_redirect_same_domain(self):
++        self._test_cookie_redirect(
++            'https://toscrape.com',
++            'https://toscrape.com',
++            cookies1=True,
++            cookies2=True,
++        )
++
++    def test_cookie_redirect_same_domain_forcing_get(self):
++        self._test_cookie_redirect(
++            'https://toscrape.com',
++            {'url': 'https://toscrape.com', 'status': 302},
++            cookies1=True,
++            cookies2=True,
++        )
++
++    def test_cookie_redirect_different_domain(self):
++        self._test_cookie_redirect(
++            'https://toscrape.com',
++            'https://example.com',
++            cookies1=True,
++            cookies2=False,
++        )
++
++    def test_cookie_redirect_different_domain_forcing_get(self):
++        self._test_cookie_redirect(
++            'https://toscrape.com',
++            {'url': 'https://example.com', 'status': 302},
++            cookies1=True,
++            cookies2=False,
++        )
++
++    def _test_cookie_header_redirect(
++        self,
++        source,
++        target,
++        *,
++        cookies2,
++    ):
++        """Test the handling of a user-defined Cookie header when building a
++        redirect follow-up request.
++
++        We follow RFC 6265 for cookie handling. The Cookie header can only
++        contain a list of key-value pairs (i.e. no additional cookie
++        parameters like Domain or Path). Because of that, we follow the same
++        rules that we would follow for the handling of the Set-Cookie response
++        header when the Domain is not set: the cookies must be limited to the
++        target URL domain (not even subdomains can receive those cookies).
++
++        .. note:: This method tests the scenario where the cookie middleware is
++                  disabled. Because of known issue #1992, when the cookies
++                  middleware is enabled we do not need to be concerned about
++                  the Cookie header getting leaked to unintended domains,
++                  because the middleware empties the header from every request.
++        """
++        if not isinstance(source, dict):
++            source = {'url': source}
++        if not isinstance(target, dict):
++            target = {'url': target}
++        target.setdefault('status', 301)
++
++        request1 = Request(headers={'Cookie': b'a=b'}, **source)
++
++        response = Response(
++            headers={
++                'Location': target['url'],
++            },
++            **target,
++        )
++
++        request2 = self.redirect_middleware.process_response(
++            request1,
++            response,
++            self.spider,
++        )
++        self.assertIsInstance(request2, Request)
++
++        cookies = request2.headers.get('Cookie')
++        self.assertEqual(cookies, b"a=b" if cookies2 else None)
++
++    def test_cookie_header_redirect_same_domain(self):
++        self._test_cookie_header_redirect(
++            'https://toscrape.com',
++            'https://toscrape.com',
++            cookies2=True,
++        )
++
++    def test_cookie_header_redirect_same_domain_forcing_get(self):
++        self._test_cookie_header_redirect(
++            'https://toscrape.com',
++            {'url': 'https://toscrape.com', 'status': 302},
++            cookies2=True,
++        )
++
++    def test_cookie_header_redirect_different_domain(self):
++        self._test_cookie_header_redirect(
++            'https://toscrape.com',
++            'https://example.com',
++            cookies2=False,
++        )
++
++    def test_cookie_header_redirect_different_domain_forcing_get(self):
++        self._test_cookie_header_redirect(
++            'https://toscrape.com',
++            {'url': 'https://example.com', 'status': 302},
++            cookies2=False,
++        )
diff -Nru python-scrapy-2.4.1/debian/patches/series python-scrapy-2.4.1/debian/patches/series
--- python-scrapy-2.4.1/debian/patches/series	2021-02-28 13:55:45.000000000 +0000
+++ python-scrapy-2.4.1/debian/patches/series	2022-05-20 20:11:00.000000000 +0000
@@ -1 +1,3 @@
 0001-Disable-hoverxref-and-notfound-Sphinx-extensions.patch
+CVE-2021-41125.patch
+CVE-2022-0577.patch