From 8d721a877dd5c2bc0693d9c4d3954eb11fbd404b Mon Sep 17 00:00:00 2001 From: Ben Darnell Date: Wed, 5 Jun 2024 16:50:37 -0400 Subject: [PATCH] httputil: Only strip tabs and spaces from header values The RFC specifies that only tabs and spaces should be stripped. Removing additional whitespace characters can lead to framing errors with certain proxies. --- tornado/httputil.py | 7 +++++-- tornado/test/httputil_test.py | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tornado/httputil.py b/tornado/httputil.py index b21d8046c4..9ce992d82b 100644 --- a/tornado/httputil.py +++ b/tornado/httputil.py @@ -62,6 +62,9 @@ from asyncio import Future # noqa: F401 import unittest # noqa: F401 +# To be used with str.strip() and related methods. +HTTP_WHITESPACE = " \t" + @lru_cache(1000) def _normalize_header(name: str) -> str: @@ -171,7 +174,7 @@ def parse_line(self, line: str) -> None: # continuation of a multi-line header if self._last_key is None: raise HTTPInputError("first header line cannot start with whitespace") - new_part = " " + line.lstrip() + new_part = " " + line.lstrip(HTTP_WHITESPACE) self._as_list[self._last_key][-1] += new_part self._dict[self._last_key] += new_part else: @@ -179,7 +182,7 @@ def parse_line(self, line: str) -> None: name, value = line.split(":", 1) except ValueError: raise HTTPInputError("no colon in header line") - self.add(name, value.strip()) + self.add(name, value.strip(HTTP_WHITESPACE)) @classmethod def parse(cls, headers: str) -> "HTTPHeaders": diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py index aa9b6ee253..6d618839e0 100644 --- a/tornado/test/httputil_test.py +++ b/tornado/test/httputil_test.py @@ -334,6 +334,25 @@ def test_unicode_newlines(self): gen_log.warning("failed while trying %r in %s", newline, encoding) raise + def test_unicode_whitespace(self): + # Only tabs and spaces are to be stripped according to the HTTP standard. 
+ # Other unicode whitespace is to be left as-is. In the context of headers, + # this specifically means the whitespace characters falling within the + # latin1 charset. + whitespace = [ + (" ", True), # SPACE + ("\t", True), # TAB + ("\u00a0", False), # NON-BREAKING SPACE + ("\u0085", False), # NEXT LINE + ] + for c, stripped in whitespace: + headers = HTTPHeaders.parse("Transfer-Encoding: %schunked" % c) + if stripped: + expected = [("Transfer-Encoding", "chunked")] + else: + expected = [("Transfer-Encoding", "%schunked" % c)] + self.assertEqual(expected, list(headers.get_all())) + def test_optional_cr(self): # Both CRLF and LF should be accepted as separators. CR should not be # part of the data when followed by LF, but it is a normal char