Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

experiment with UTF-8/16 C++ iterators #3096

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions icu4c/source/common/common.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -1258,6 +1258,9 @@
<CustomBuild Include="unicode\utf16.h">
<Filter>strings</Filter>
</CustomBuild>
<CustomBuild Include="unicode\utf16cppiter.h">
<Filter>strings</Filter>
</CustomBuild>
<CustomBuild Include="unicode\utf32.h">
<Filter>strings</Filter>
</CustomBuild>
Expand Down
174 changes: 174 additions & 0 deletions icu4c/source/common/unicode/utf16cppiter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
// © 2024 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html

// utf16cppiter.h
// created: 2024aug12 Markus W. Scherer

#ifndef __UTF16CPPITER_H__
#define __UTF16CPPITER_H__

#include <string_view>

#include "unicode/utypes.h"

#if U_SHOW_CPLUSPLUS_API

#include "unicode/utf16.h"
#include "unicode/uversion.h"

/**
* \file
* \brief C++ API: C++ iterators over Unicode 16-bit strings (=UTF-16 if well-formed).
*/

namespace U_HEADER_ONLY_NAMESPACE {

// Some defined behaviors for handling ill-formed 16-bit strings.
// TODO: Maybe share with 8-bit strings, but the SURROGATE option does not have an equivalent there.
//
// TODO: A possible alternative to an enum might be some kind of function template
// which would be fully customizable.
// The operator*() return value might then want to be a template parameter as well.
// For example, for a well-formed sequence, the return value could be
// a tuple of (code point, well-formed), or a string view, or...
// (And then the caller could choose between UChar32 and char32_t.)
// However, all of that would make the API more complex and daunting.
/**
 * Selects how an iterator should report an ill-formed 16-bit sequence
 * (an unpaired surrogate).
 *
 * NOTE(review): the meanings below are inferred from the enumerator names;
 * the decoding code visible in this header does not consult this value yet.
 */
enum U16IllFormedBehavior {
    /** Presumably: report a negative code point value -- TODO confirm. */
    U16_BEHAVIOR_NEGATIVE = 0,
    /** Presumably: report U+FFFD -- TODO confirm. */
    U16_BEHAVIOR_FFFD = 1,
    /** Presumably: report the unpaired surrogate code point itself -- TODO confirm. */
    U16_BEHAVIOR_SURROGATE = 2
};

/**
 * A code unit sequence for one code point returned by U16Iterator.
 *
 * Holds a pointer to the first code unit of the sequence, the number of code units
 * (0 at the end of the string, otherwise 1 or 2), the decoded code point,
 * and whether the sequence is well-formed.
 * An unpaired surrogate is reported as a sequence of length 1 with
 * isWellFormed()==false and codePoint()==U+FFFD.
 *
 * Objects are created and advanced only by U16Iterator (a friend);
 * client code reads them through the const accessors.
 *
 * @tparam Unit16 char16_t or uint16_t or (on Windows) wchar_t
 * @draft ICU 76
 */
template<typename Unit16>
class U16OneSeq {
public:
    U16OneSeq(const U16OneSeq &other) = default;

    /** @return pointer to the first code unit of this sequence (not NUL-terminated) */
    const Unit16 *data() const { return p; }
    /** @return number of code units in this sequence: 0 at the end, otherwise 1 or 2 */
    int32_t length() const { return len; }

    /** @return a view of the length() code units starting at data() */
    std::basic_string_view<Unit16> stringView() const {
        return std::basic_string_view<Unit16>(p, len);
    }

    /** @return true if the sequence is a non-surrogate unit or a lead+trail surrogate pair */
    bool isWellFormed() const { return ok; }

    /** @return the decoded code point; U+FFFD if the sequence is an unpaired surrogate */
    UChar32 codePoint() const { return c; }

    // TODO: std::optional<UChar32> maybeCodePoint() const ? (nullopt if !ok)

private:
    // TODO: Why can't we just use Unit16 here?
    // error: declaration of 'Unit16' shadows template parameter
    template<typename SomeOtherUnit16, U16IllFormedBehavior behavior>
    friend class U16Iterator;

    // Starts with an empty sequence at p; the iterator calls readOneForward() next.
    explicit U16OneSeq(const Unit16 *p) : p(p) {}

    // Moves past the code units of the current sequence.
    void fwd1() { p += len; }

    // Decodes the sequence starting at p, reading no code unit at or beyond limit.
    void readOneForward(const Unit16 *limit) {
        if (p == limit) {
            // End of the string: empty sequence.
            // Also clear the decoded state so that an iterator that was advanced to the end
            // does not keep reporting the previous code point.
            c = 0;
            len = 0;
            ok = false;
            return;
        }
        // see U16_NEXT_OR_FFFD()
        c = *p;
        len = 1;
        ok = true;
        if (U16_IS_SURROGATE(c)) {
            uint16_t c2;
            // The limit check comes before reading p[1] so that we never read past the end.
            if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) {
                c = U16_GET_SUPPLEMENTARY(c, c2);
                len = 2;
            } else {
                // Unpaired surrogate: consume just this one unit.
                // TODO: U16IllFormedBehavior
                c = 0xfffd;
                ok = false;
            }
        }
    }

    const Unit16 *p;
    UChar32 c = 0;
    int8_t len = 0;
    bool ok = false;
};

/**
 * Validating iterator over the code points in a Unicode 16-bit string.
 *
 * The code point at the current position is decoded eagerly: once on construction
 * and again after every increment. operator*() returns a reference to that decoded
 * sequence, which is stored inside the iterator; copy the U16OneSeq if it must
 * outlive the iterator or survive a later increment.
 *
 * @tparam Unit16 char16_t or uint16_t or (on Windows) wchar_t
 * @tparam behavior TODO (not consulted by this class yet)
 * @draft ICU 76
 */
template<typename Unit16, U16IllFormedBehavior behavior>
class U16Iterator {
public:
    // TODO: make private, make friends
    /**
     * @param start start of the string
     * @param p current position; pass limit to create an end iterator
     * @param limit end (exclusive) of the string
     */
    U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) :
            start(start), limit(limit), seq(p) {
        seq.readOneForward(limit);
    }
    // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0.
    // Test pointers for == or != but not < or >.

    U16Iterator(const U16Iterator &other) = default;

    /** Two iterators are equal if they are at the same position; start & limit are not compared. */
    bool operator==(const U16Iterator &other) const { return seq.p == other.seq.p; }
    bool operator!=(const U16Iterator &other) const { return !(*this == other); }

    /** @return the decoded sequence at the current position, owned by this iterator */
    const U16OneSeq<Unit16> &operator*() const { return seq; }

    /** Pre-increment: moves to the next code point and returns this iterator. */
    U16Iterator &operator++() {
        advance();
        return *this;
    }

    /** Post-increment: moves to the next code point and returns a copy of the previous state. */
    U16Iterator operator++(int) {
        U16Iterator previous(*this);
        advance();
        return previous;
    }

private:
    // TODO: think about switching directions etc.
    // Assumes that readOneForward() was called and set seq.len.
    // Skips the current code point, then reads the next one.
    void advance() {
        seq.fwd1();
        seq.readOneForward(limit);
    }

    // In a validating iterator, we need start & limit so that when we read a code point
    // (forward or backward) we can test if there are enough code units.
    const Unit16 *start;
    const Unit16 *limit;
    U16OneSeq<Unit16> seq;
};

// ------------------------------------------------------------------------- ***

// TODO: Non-validating iterator over the code points in a Unicode 16-bit string.
// Assumes well-formed UTF-16. Otherwise the behavior is undefined.
// TODO: all @draft ICU 76
// template<typename Unit16>
// class U16UnsafeIterator
// TODO: only p, no start, no limit
// TODO: can/should we read the code point only in operator*()?
// if we read it in the constructor, then we would still need start/limit...

} // namespace U_HEADER_ONLY_NAMESPACE

#endif // U_SHOW_CPLUSPLUS_API
#endif // __UTF16CPPITER_H__
19 changes: 19 additions & 0 deletions icu4c/source/common/unicode/uversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,14 @@ typedef uint8_t UVersionInfo[U_MAX_VERSION_LENGTH];
* @stable ICU 2.4
*/

/**
* \def U_HEADER_ONLY_NAMESPACE
* Namespace used for header-only APIs.
* Different when used inside ICU to prevent public use of internal instantiations.
*
* @draft ICU 76
*/

# if U_DISABLE_RENAMING
# define U_ICU_NAMESPACE icu
namespace U_ICU_NAMESPACE { }
Expand All @@ -124,6 +132,17 @@ typedef uint8_t UVersionInfo[U_MAX_VERSION_LENGTH];
# if U_USING_ICU_NAMESPACE
U_NAMESPACE_USE
# endif

// Select the nested namespace for header-only APIs:
// "internal" when any of the *_IMPLEMENTATION macros tested below is defined,
// "header" otherwise.
// NOTE(review): those macros presumably mark code compiled as part of ICU's own libraries -- confirm.
#if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \
defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) || \
defined(U_LAYOUTEX_IMPLEMENTATION) || defined(U_TOOLUTIL_IMPLEMENTATION)
# define U_HEADER_ONLY_NAMESPACE U_ICU_NAMESPACE::internal
#else
# define U_HEADER_ONLY_NAMESPACE U_ICU_NAMESPACE::header
#endif

// Declare the (empty) namespace here so that it exists
// even before any header-only API header is included.
namespace U_HEADER_ONLY_NAMESPACE {}

#endif /* __cplusplus */

/*===========================================================================*/
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/test/intltest/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ numbertest_parse.o numbertest_doubleconversion.o numbertest_skeletons.o \
static_unisets_test.o numfmtdatadriventest.o numbertest_range.o erarulestest.o \
formattedvaluetest.o formatted_string_builder_test.o numbertest_permutation.o \
units_data_test.o units_router_test.o units_test.o displayoptions_test.o \
numbertest_simple.o uchar_type_build_test.o
numbertest_simple.o uchar_type_build_test.o utfcppitertest.o

DEPS = $(OBJECTS:.o=.d)

Expand Down
1 change: 1 addition & 0 deletions icu4c/source/test/intltest/intltest.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@
<ClCompile Include="sfwdchit.cpp" />
<ClCompile Include="strcase.cpp" />
<ClCompile Include="ustrtest.cpp" />
<ClCompile Include="utfcppitertest.cpp" />
<ClCompile Include="utxttest.cpp" />
<ClCompile Include="cpdtrtst.cpp" />
<ClCompile Include="ittrans.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions icu4c/source/test/intltest/intltest.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,9 @@
<ClCompile Include="ustrtest.cpp">
<Filter>strings</Filter>
</ClCompile>
<ClCompile Include="utfcppitertest.cpp">
<Filter>strings</Filter>
</ClCompile>
<ClCompile Include="utxttest.cpp">
<Filter>strings</Filter>
</ClCompile>
Expand Down
2 changes: 2 additions & 0 deletions icu4c/source/test/intltest/itutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ extern IntlTest *createPluralMapTest();
extern IntlTest *createStaticUnicodeSetsTest();
#endif
static IntlTest *createUHashTest();
extern IntlTest *createU16IteratorTest();

void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
{
Expand Down Expand Up @@ -82,6 +83,7 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &
TESTCASE_AUTO_CLASS(LocaleBuilderTest);
TESTCASE_AUTO_CREATE_CLASS(LocaleMatcherTest);
TESTCASE_AUTO_CREATE_CLASS(UHashTest);
TESTCASE_AUTO_CREATE_CLASS(U16IteratorTest);
TESTCASE_AUTO_END;
}

Expand Down
63 changes: 63 additions & 0 deletions icu4c/source/test/intltest/utfcppitertest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// © 2024 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html

// utfcppitertest.cpp
// created: 2024aug12 Markus W. Scherer

#include <string_view>

#include "unicode/utypes.h"
#include "unicode/utf16cppiter.h"
#include "intltest.h"

// Makes u"literal"sv std::u16string_view literals possible.
// https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv
using namespace std::string_view_literals;

using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_NEGATIVE;
using U_HEADER_ONLY_NAMESPACE::U16Iterator;
using U_HEADER_ONLY_NAMESPACE::U16OneSeq;

/**
 * intltest test class for the experimental U16Iterator / U16OneSeq header-only API.
 */
class U16IteratorTest : public IntlTest {
public:
    U16IteratorTest() = default;

    /** Test dispatcher; see the definition below. */
    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override;

    /** Iterates over a short well-formed UTF-16 string and checks each code point. */
    void testExperiment();
};

/**
 * Factory for this test class, declared as extern and used by itutil.cpp.
 * @return a new heap-allocated U16IteratorTest
 */
extern IntlTest *createU16IteratorTest() {
    IntlTest *test = new U16IteratorTest();
    return test;
}

// Test dispatcher for this class.
// Logs the suite banner when exec is true, then hands over to the TESTCASE_AUTO_* macros.
// NOTE(review): those macros are defined outside this file; presumably they map index
// to a test method, set name, and run the method when exec is true -- confirm in intltest.h.
// They refer to the parameters by name, so index/exec/name must not be renamed.
void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
if(exec) {
logln("TestSuite U16IteratorTest: ");
}
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(testExperiment);
TESTCASE_AUTO_END;
}

// Walks a U16Iterator over a well-formed string with 1-unit and 2-unit code points,
// exercising both pre- and post-increment, and checks that it ends up equal to
// an end iterator constructed at the limit.
void U16IteratorTest::testExperiment() {
    IcuTestErrorCode errorCode(*this, "testExperiment");
    std::u16string_view good(u"abçカ🚴"sv);
    const char16_t *goodLimit = good.data() + good.length();
    U16Iterator<char16_t, U16_BEHAVIOR_NEGATIVE> goodIter(good.data(), good.data(), goodLimit);
    assertEquals("goodIter[0] * codePoint()", u'a', (*goodIter).codePoint());
    ++goodIter;  // pre-increment
    assertEquals("goodIter[1] * codePoint()", u'b', (*goodIter).codePoint());
    ++goodIter;
    // Using the result of *goodIter++ within the same full-expression is fine:
    // the temporary iterator returned by post-increment is still alive.
    assertEquals("goodIter[2] * codePoint()", u'ç', (*goodIter++).codePoint());  // post-increment
    assertEquals("goodIter[3] * codePoint()", u'カ', (*goodIter).codePoint());
    ++goodIter;
    // Copy the sequence rather than binding a reference:
    // operator*() returns a reference into the temporary iterator returned by post-increment,
    // which is destroyed at the end of this statement. A reference would dangle below.
    const U16OneSeq<char16_t> seq = *goodIter++;
    assertEquals("goodIter[4] * codePoint()", U'🚴', seq.codePoint());
    assertEquals("goodIter[4] * length()", 2, seq.length());
    assertTrue("goodIter[4] * stringView()", seq.stringView() == u"🚴"sv);
    U16Iterator<char16_t, U16_BEHAVIOR_NEGATIVE> goodEndIter(good.data(), goodLimit, goodLimit);
    assertTrue("goodIter == goodEndIter", goodIter == goodEndIter);

    // TODO: test ill-formed, and much more...
}
Loading