diff --git a/include/upa/url.h b/include/upa/url.h index cf94156..59b1a92 100644 --- a/include/upa/url.h +++ b/include/upa/url.h @@ -1412,7 +1412,7 @@ inline bool url::username(StrT&& str) { std::string& str_username = urls.start_part(url::USERNAME); // UTF-8 percent encode it using the userinfo encode set - detail::AppendStringOfType(inp.begin(), inp.end(), userinfo_no_encode_set, str_username); + detail::append_utf8_percent_encoded(inp.begin(), inp.end(), userinfo_no_encode_set, str_username); urls.save_part(); return true; } @@ -1428,7 +1428,7 @@ inline bool url::password(StrT&& str) { std::string& str_password = urls.start_part(url::PASSWORD); // UTF-8 percent encode it using the userinfo encode set - detail::AppendStringOfType(inp.begin(), inp.end(), userinfo_no_encode_set, str_password); + detail::append_utf8_percent_encoded(inp.begin(), inp.end(), userinfo_no_encode_set, str_password); urls.save_part(); return true; } @@ -1812,12 +1812,12 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT* if (not_empty_password || std::distance(pointer, it_colon) > 0 /*not empty username*/) { // username std::string& str_username = urls.start_part(url::USERNAME); - detail::AppendStringOfType(pointer, it_colon, userinfo_no_encode_set, str_username); // UTF-8 percent encode, @ -> %40 + detail::append_utf8_percent_encoded(pointer, it_colon, userinfo_no_encode_set, str_username); // UTF-8 percent encode, @ -> %40 urls.save_part(); // password if (not_empty_password) { std::string& str_password = urls.start_part(url::PASSWORD); - detail::AppendStringOfType(it_colon + 1, it_eta, userinfo_no_encode_set, str_password); // UTF-8 percent encode, @ -> %40 + detail::append_utf8_percent_encoded(it_colon + 1, it_eta, userinfo_no_encode_set, str_password); // UTF-8 percent encode, @ -> %40 urls.save_part(); } } @@ -2173,19 +2173,19 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT* // the result to url’s query. // TODO: now supports UTF-8 encoding only, maybe later add other encodings std::string& str_query = urls.start_part(url::QUERY); - // detail::AppendStringOfType(pointer, end_of_query, query_cpset, str_query); + // detail::append_utf8_percent_encoded(pointer, end_of_query, query_cpset, str_query); while (pointer != end_of_query) { // UTF-8 percent encode c using the fragment percent-encode set // and ignore '\0' const auto uch = static_cast(*pointer); if (uch >= 0x80) { // invalid utf-8/16/32 sequences will be replaced with kUnicodeReplacementCharacter - detail::AppendUTF8EscapedChar(pointer, end_of_query, str_query); + detail::append_utf8_percent_encoded_char(pointer, end_of_query, str_query); } else { // Just append the 7-bit character, possibly escaping it. const auto uc = static_cast(uch); if (!detail::is_char_in_set(uc, query_cpset)) - detail::AppendEscapedChar(uch, str_query); + detail::append_percent_encoded_byte(uch, str_query); else str_query.push_back(uc); ++pointer; @@ -2215,7 +2215,7 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT* const auto uch = static_cast(*pointer); if (uch >= 0x80) { // invalid utf-8/16/32 sequences will be replaced with kUnicodeReplacementCharacter - detail::AppendUTF8EscapedChar(pointer, last, str_frag); + detail::append_utf8_percent_encoded_char(pointer, last, str_frag); } else { // Just append the 7-bit character, possibly escaping it. const auto uc = static_cast(uch); @@ -2223,7 +2223,7 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT* str_frag.push_back(uc); } else { // other characters are escaped - detail::AppendEscapedChar(uch, str_frag); + detail::append_percent_encoded_byte(uch, str_frag); } ++pointer; } @@ -2329,12 +2329,12 @@ inline bool url_parser::do_path_segment(const CharT* pointer, const CharT* last, const auto uch = static_cast(*pointer); if (uch >= 0x80) { // invalid utf-8/16/32 sequences will be replaced with 0xfffd - success &= detail::AppendUTF8EscapedChar(pointer, last, output); + success &= detail::append_utf8_percent_encoded_char(pointer, last, output); } else { // Just append the 7-bit character, possibly escaping it. const auto uc = static_cast(uch); if (!detail::is_char_in_set(uc, path_no_encode_set)) - detail::AppendEscapedChar(uc, output); + detail::append_percent_encoded_byte(uc, output); else output.push_back(uc); ++pointer; @@ -2358,11 +2358,11 @@ inline bool url_parser::do_simple_path(const CharT* pointer, const CharT* last, const auto uch = static_cast(*pointer); if (uch >= 0x7f) { // invalid utf-8/16/32 sequences will be replaced with 0xfffd - success &= detail::AppendUTF8EscapedChar(pointer, last, output); + success &= detail::append_utf8_percent_encoded_char(pointer, last, output); } else { // Just append the 7-bit character, escaping C0 control chars: if (uch <= 0x1f) - detail::AppendEscapedChar(uch, output); + detail::append_percent_encoded_byte(uch, output); else output.push_back(static_cast(uch)); ++pointer; @@ -3045,7 +3045,7 @@ inline url url_from_file_path(StrT&& str) { } // make URL - detail::AppendStringOfType(pointer, last, *no_encode_set, str_url); + detail::append_utf8_percent_encoded(pointer, last, *no_encode_set, str_url); return url(str_url); } diff --git a/include/upa/url_host.h b/include/upa/url_host.h index 88c9608..2cc3c65 100644 --- a/include/upa/url_host.h +++ b/include/upa/url_host.h @@ -286,7 +286,7 @@ inline validation_errc host_parser::parse_opaque_host(const CharT* first, const std::string& str_host = dest.hostStart(); //TODO: UTF-8 percent encode it using the C0 control percent-encode set - //detail::AppendStringOfType(first, last, detail::CHAR_C0_CTRL, str_host); + //detail::append_utf8_percent_encoded(first, last, detail::CHAR_C0_CTRL, str_host); using UCharT = typename std::make_unsigned::type; const CharT* pointer = first; @@ -295,11 +295,11 @@ inline validation_errc host_parser::parse_opaque_host(const CharT* first, const const auto uch = static_cast(*pointer); if (uch >= 0x7f) { // invalid utf-8/16/32 sequences will be replaced with 0xfffd - detail::AppendUTF8EscapedChar(pointer, last, str_host); + detail::append_utf8_percent_encoded_char(pointer, last, str_host); } else { // Just append the 7-bit character, escaping C0 control chars: if (uch <= 0x1f) - detail::AppendEscapedChar(uch, str_host); + detail::append_percent_encoded_byte(uch, str_host); else str_host.push_back(static_cast(uch)); ++pointer; diff --git a/include/upa/url_percent_encode.h b/include/upa/url_percent_encode.h index e9d70d0..a9d92f8 100644 --- a/include/upa/url_percent_encode.h +++ b/include/upa/url_percent_encode.h @@ -445,61 +445,52 @@ inline bool decode_hex_to_byte(const CharT*& first, const CharT* last, unsigned // ---------------------------------------------------------------------------- // Percent encode -// Write a single character, escaped, to the output. This always escapes: it -// does no checking that thee character requires escaping. -// Escaping makes sense only 8 bit chars, so code works in all cases of -// input parameters (8/16bit). +// Percent-encodes byte and appends to string +// See: https://url.spec.whatwg.org/#percent-encode + template -inline void AppendEscapedChar(UINCHAR ch, std::basic_string& output) { +inline void append_percent_encoded_byte(UINCHAR ch, std::basic_string& output) { output.push_back('%'); output.push_back(kHexCharLookup[(ch >> 4) & 0xf]); output.push_back(kHexCharLookup[ch & 0xf]); } -// Writes the given character to the output as UTF-8, escaping ALL -// characters (even when they are ASCII). This does NO checking of the -// validity of the Unicode characters; the caller should ensure that the value -// it is appending is valid to append. -inline void AppendUTF8EscapedValue(unsigned char_value, std::string& output) { - url_utf::append_utf8(char_value, output); -} - -// Writes the given character to the output as UTF-8, escaped. Call this -// function only when the input is wide. Returns true on success. Failure -// means there was some problem with the encoding, we'll still try to -// update the |*begin| pointer and add a placeholder character to the -// output so processing can continue. +// Reads one character from string (first, last), converts to UTF-8, then +// percent-encodes, and appends to `output`. Replaces invalid UTF-8, UTF-16 or UTF-32 +// sequences in input with Unicode replacement characters (U+FFFD) if present. template -inline bool AppendUTF8EscapedChar(const CharT*& first, const CharT* last, std::string& output) { +inline bool append_utf8_percent_encoded_char(const CharT*& first, const CharT* last, std::string& output) { // url_util::read_utf_char(..) will handle invalid characters for us and give // us the kUnicodeReplacementCharacter, so we don't have to do special // checking after failure, just pass through the failure to the caller. const auto cp_res = url_utf::read_utf_char(first, last); - AppendUTF8EscapedValue(cp_res.value, output); + // convert cp_res.value code point to UTF-8, then percent encode and append to `output` + url_utf::append_utf8(cp_res.value, output); return cp_res.result; } -// Appends the given string to the output, escaping characters that do not -// match the given |charsType| in CharsType. +// Converts input string (first, last) to UTF-8, then percent encodes bytes not +// in `cpset`, and appends to `output`. Replaces invalid UTF-8, UTF-16 or UTF-32 +// sequences in input with Unicode replacement characters (U+FFFD) if present. template -void AppendStringOfType(const CharT* first, const CharT* last, const code_point_set& cpset, std::string& output) { +void append_utf8_percent_encoded(const CharT* first, const CharT* last, const code_point_set& cpset, std::string& output) { using UCharT = typename std::make_unsigned::type; for (auto it = first; it < last; ) { const auto ch = static_cast(*it); if (ch >= 0x80) { // invalid utf-8/16/32 sequences will be replaced with kUnicodeReplacementCharacter - AppendUTF8EscapedChar(it, last, output); + append_utf8_percent_encoded_char(it, last, output); } else { - // Just append the 7-bit character, possibly escaping it. + // Just append the 7-bit character, possibly percent encoding it. const auto uch = static_cast(ch); if (is_char_in_set(uch, cpset)) { output.push_back(uch); } else { - // other characters are escaped - AppendEscapedChar(uch, output); + // other characters are percent encoded + append_percent_encoded_byte(uch, output); } ++it; } @@ -579,7 +570,7 @@ inline std::string percent_encode(StrT&& str, const code_point_set& no_encode_se const auto inp = make_str_arg(std::forward(str)); std::string out; - detail::AppendStringOfType(inp.begin(), inp.end(), no_encode_set, out); + detail::append_utf8_percent_encoded(inp.begin(), inp.end(), no_encode_set, out); return out; }