Skip to content

Commit

Permalink
CLDR-7428 Freeze collators; new class CollatorHelper (#4207)
Browse files Browse the repository at this point in the history
  • Loading branch information
btangmu authored Nov 20, 2024
1 parent 163a3df commit c0001a2
Show file tree
Hide file tree
Showing 39 changed files with 138 additions and 261 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,15 @@

import com.ibm.icu.dev.test.TestFmwk.TestGroup;
import com.ibm.icu.dev.test.TestLog;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import java.io.File;
import java.io.PrintWriter;
import java.sql.SQLException;
import java.util.logging.Logger;
import org.unicode.cldr.test.CheckCLDR;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRConfig.Environment;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.web.CLDRProgressIndicator;
import org.unicode.cldr.web.DBUtils;
import org.unicode.cldr.web.SurveyLog;
Expand Down Expand Up @@ -128,13 +122,6 @@ public TestAll() {
public static class WebTestInfo {
private static WebTestInfo INSTANCE = null;

private SupplementalDataInfo supplementalDataInfo;
private StandardCodes sc;
private Factory cldrFactory;
private CLDRFile english;
private CLDRFile root;
private RuleBasedCollator col;

public static WebTestInfo getInstance() {
synchronized (WebTestInfo.class) {
if (INSTANCE == null) {
Expand All @@ -145,62 +132,6 @@ public static WebTestInfo getInstance() {
}

/** Private so that outside code cannot construct this class directly; see {@code getInstance()}. */
private WebTestInfo() {}

/**
 * Returns the cached {@link SupplementalDataInfo}, loading it from {@code
 * CLDRPaths.SUPPLEMENTAL_DIRECTORY} on the first call.
 *
 * <p>Initialization is guarded by this instance's monitor.
 *
 * @return the cached supplemental data info
 */
public SupplementalDataInfo getSupplementalDataInfo() {
    synchronized (this) {
        SupplementalDataInfo cached = supplementalDataInfo;
        if (cached == null) {
            cached = SupplementalDataInfo.getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY);
            supplementalDataInfo = cached;
        }
        return cached;
    }
}

/**
 * Returns the cached {@link StandardCodes}, obtaining it from {@code StandardCodes.make()} on
 * the first call.
 *
 * <p>Initialization is guarded by this instance's monitor.
 *
 * @return the cached standard codes
 */
public StandardCodes getStandardCodes() {
    synchronized (this) {
        StandardCodes cached = sc;
        if (cached == null) {
            cached = StandardCodes.make();
            sc = cached;
        }
        return cached;
    }
}

/**
 * Returns the cached {@link Factory}, building it over {@code CLDRPaths.MAIN_DIRECTORY} with
 * the match-everything pattern {@code ".*"} on the first call.
 *
 * <p>Initialization is guarded by this instance's monitor.
 *
 * @return the cached factory
 */
public Factory getCldrFactory() {
    synchronized (this) {
        Factory cached = cldrFactory;
        if (cached == null) {
            cached = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
            cldrFactory = cached;
        }
        return cached;
    }
}

/**
 * Returns the cached "en" {@link CLDRFile}, requesting it from {@link #getCldrFactory()} with
 * the boolean flag set to {@code true} on the first call.
 *
 * <p>Initialization is guarded by this instance's monitor.
 *
 * @return the cached English file
 */
public CLDRFile getEnglish() {
    synchronized (this) {
        CLDRFile cached = english;
        if (cached == null) {
            cached = getCldrFactory().make("en", true);
            english = cached;
        }
        return cached;
    }
}

/**
 * Returns the cached "root" {@link CLDRFile}, requesting it from {@link #getCldrFactory()} with
 * the boolean flag set to {@code true} on the first call.
 *
 * <p>Initialization is guarded by this instance's monitor.
 *
 * @return the cached root file
 */
public CLDRFile getRoot() {
    synchronized (this) {
        CLDRFile cached = root;
        if (cached == null) {
            cached = getCldrFactory().make("root", true);
            root = cached;
        }
        return cached;
    }
}

/**
 * Returns the cached collator, creating it on the first call from the no-argument {@code
 * Collator.getInstance()} and turning numeric collation on.
 *
 * <p>Initialization is guarded by this instance's monitor. This method does not freeze the
 * collator, so every caller receives the same mutable instance.
 *
 * @return the cached collator with numeric collation enabled
 */
public Collator getCollator() {
    synchronized (this) {
        if (col != null) {
            return col;
        }
        // Store the instance in the field first, then configure it.
        col = (RuleBasedCollator) Collator.getInstance();
        col.setNumericCollation(true);
        return col;
    }
}
}

static boolean dbSetup = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CollatorHelper;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.LocaleIDParser;
Expand Down Expand Up @@ -143,7 +144,9 @@ private static void showDefaultContent(String... strings) {

private static void showSortKey() {
String[] tests = "a ä A ぁ あ ァ ァ ア ア ㋐".split(" ");
RuleBasedCollator c = (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
// TODO: freeze the Collator; problematic since changed in innermost for loop below
// Reference: https://unicode-org.atlassian.net/browse/CLDR-7428
RuleBasedCollator c = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
c.setStrength(RuleBasedCollator.QUATERNARY);
c.setCaseLevel(true);
c.setHiraganaQuaternary(true);
Expand Down Expand Up @@ -319,7 +322,7 @@ private static void showExemplarSize() {
String[] locales =
"en ru nl en-GB fr de it pl pt-BR es tr th ja zh-CN zh-TW ko ar bg sr uk ca hr cs da fil fi hu id lv lt no pt-PT ro sk sl es-419 sv vi el iw fa hi am af et is ms sw zu bn mr ta eu fr-CA gl zh-HK ur gu kn ml te"
.split(" ");
Set<String> nameAndInfo = new TreeSet<>(info.getCollator());
Set<String> nameAndInfo = new TreeSet<>(CollatorHelper.EMOJI_COLLATOR);
for (String localeCode : locales) {
String baseLanguage = ltp.set(localeCode).getLanguage();
R2<List<String>, String> temp = lang2replacement.get(baseLanguage);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;
import java.io.BufferedReader;
import java.io.File;
Expand All @@ -35,6 +33,7 @@
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.CollatorHelper;
import org.unicode.cldr.util.PathUtilities;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.SimpleFactory;
Expand Down Expand Up @@ -411,8 +410,7 @@ static void testProps() {
{UProperty.DOUBLE_START, UProperty.DOUBLE_START},
{UProperty.STRING_START, UProperty.STRING_LIMIT},
};
Collator col = Collator.getInstance(ULocale.ROOT);
((RuleBasedCollator) col).setNumericCollation(true);
Collator col = CollatorHelper.ROOT_NUMERIC;
Map<String, Set<String>> alpha = new TreeMap<>(col);

for (int range = 0; range < ranges.length; ++range) {
Expand Down Expand Up @@ -465,12 +463,6 @@ static void testProps() {
}
out.println("</table></td></tr>");
}
Collator c = Collator.getInstance(ULocale.ENGLISH);
((RuleBasedCollator) c).setNumericCollation(true);

// int enumValue = UCharacter.getIntPropertyValue(codePoint, propEnum);
// return UCharacter.getPropertyValueName(propEnum,enumValue, (int)nameChoice);

}

private static String getName(int index, String valueName, String shortValueName) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.DecimalFormat;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import java.io.File;
import java.io.FileOutputStream;
Expand All @@ -28,6 +27,7 @@
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import org.unicode.cldr.util.CollatorHelper;
import org.unicode.cldr.util.LDMLUtilities;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
Expand Down Expand Up @@ -114,12 +114,7 @@ public static void main(String[] args) {
}

static Collator getDefaultCollation() {
// if (DEFAULT_COLLATION != null) return DEFAULT_COLLATION;
RuleBasedCollator temp = (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
temp.setStrength(Collator.IDENTICAL);
temp.setNumericCollation(true);
// DEFAULT_COLLATION = temp;
return temp;
return CollatorHelper.ROOT_NUMERIC_IDENTICAL;
}

Hashtable<String, String> optionTable = new Hashtable<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CLDRTool;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.CollatorHelper;
import org.unicode.cldr.util.DateTimeCanonicalizer;
import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType;
import org.unicode.cldr.util.DowngradePaths;
Expand Down Expand Up @@ -3309,8 +3310,7 @@ private static int stepsFromRoot(String origLoc) {

/** Internal */
public static void testJavaSemantics() {
Collator caseInsensitive = Collator.getInstance(ULocale.ROOT);
caseInsensitive.setStrength(Collator.SECONDARY);
Collator caseInsensitive = CollatorHelper.ROOT_SECONDARY;
Set<String> setWithCaseInsensitive = new TreeSet<>(caseInsensitive);
setWithCaseInsensitive.addAll(Arrays.asList(new String[] {"a", "b", "c"}));
Set<String> plainSet = new TreeSet<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -258,8 +258,6 @@ private void addCollator(Map<String, Data> data, String type, RuleBasedCollator
dataItem.collator = col;
}

// RuleBasedCollator ROOT = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);

private class Subchart extends Chart {
private static final String HIGH_COLLATION_PRIMARY = "\uFFFF";
String title;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSet.Builder;
import com.google.common.collect.Multimap;
import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;
import java.io.IOException;
import java.util.Collection;
import java.util.Comparator;
Expand Down Expand Up @@ -76,8 +74,6 @@ public String getExplanation() {
+ "The data doesn't completely match wikipedia’s; there are some patches for CLDR languages.</p>\n";
}

Collator ENGLISH_ORDER = Collator.getInstance(ULocale.ENGLISH);

@Override
public void writeContents(FormattedFileWriter pw) throws IOException {

Expand Down Expand Up @@ -112,7 +108,9 @@ private void show(Multimap<String, String> lg, String parent, TablePrinter table
new Comparator<Pair<String, String>>() {
@Override
public int compare(Pair<String, String> o1, Pair<String, String> o2) {
int diff = ENGLISH_ORDER.compare(o1.getFirst(), o2.getFirst());
int diff =
CollatorHelper.ROOT_COLLATOR.compare(
o1.getFirst(), o2.getFirst());
if (diff != 0) {
return diff;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@
import java.util.TreeSet;
import org.unicode.cldr.util.Annotations;
import org.unicode.cldr.util.Annotations.AnnotationSet;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CollatorHelper;

public class CheckAnnotations {
public static void main(String[] args) {
AnnotationSet data = Annotations.getDataSet("en");
CLDRConfig config = CLDRConfig.getInstance();
// UnicodeMap<Annotations> data2 = Annotations.getData("de");
Set<String> sorted = new TreeSet<>(config.getCollator());
Set<String> sorted = new TreeSet<>(CollatorHelper.EMOJI_COLLATOR);

int i = 0;
boolean needMore = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
package org.unicode.cldr.tool;

import com.ibm.icu.dev.util.UOption;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
Expand All @@ -30,12 +27,6 @@ public class CompareData {

String[] directoryList = {"main", "collation", "segmentations"};

static RuleBasedCollator uca = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);

{
uca.setNumericCollation(true);
}

static PrettyPath prettyPathMaker = new PrettyPath();
static CLDRFile english;
static Set<String> locales;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Collator;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
Expand All @@ -16,6 +15,7 @@
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CollatorHelper;
import org.unicode.cldr.util.Emoji;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.SimpleFactory;
Expand All @@ -29,11 +29,12 @@ public class CompareEmoji {
static final Factory FACTORY_DERIVED = SimpleFactory.make(paths, ".*");

private static final Joiner BAR_JOINER = Joiner.on(" | ");
private static final Collator collator = CLDRConfig.getInstance().getCollator();

private static final String base =
"/Users/markdavis/github/private/DATA/cldr-private/emoji_diff/";
private static final Set<String> sorted =
ImmutableSet.copyOf(Emoji.getAllRgi().addAllTo(new TreeSet<>(collator)));
ImmutableSet.copyOf(
Emoji.getAllRgi().addAllTo(new TreeSet<>(CollatorHelper.EMOJI_COLLATOR)));

enum Status {
regular,
Expand Down Expand Up @@ -155,7 +156,7 @@ public static Map<String, Set<String>> loadItems(
continue;
}
String key = split[0];
Set<String> values = new TreeSet<>(collator);
Set<String> values = new TreeSet<>(CollatorHelper.EMOJI_COLLATOR);
for (int i = 1; i < split.length; ++i) {
values.add(split[i]);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.ULocale;
import java.io.BufferedReader;
Expand Down Expand Up @@ -42,6 +40,7 @@
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.CollatorHelper;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.Iso639Data;
import org.unicode.cldr.util.Iso639Data.Scope;
Expand Down Expand Up @@ -1924,11 +1923,6 @@ public String toString() {

public static class GeneralCollator implements Comparator<String> {
static UTF16.StringComparator cpCompare = new UTF16.StringComparator(true, false, 0);
static RuleBasedCollator UCA = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);

static {
UCA.setNumericCollation(true);
}

@Override
public int compare(String s1, String s2) {
Expand All @@ -1937,7 +1931,7 @@ public int compare(String s1, String s2) {
} else if (s2 == null) {
return 1;
}
int result = UCA.compare(s1, s2);
int result = CollatorHelper.ROOT_NUMERIC.compare(s1, s2);
if (result != 0) return result;
return cpCompare.compare(s1, s2);
}
Expand Down
Loading

0 comments on commit c0001a2

Please sign in to comment.