3.6.6: noise handling in various language modes

OpenSextant · Dec 28, 2023 · b7f2c94 · b7f2c94
1 parent 783240e
commit b7f2c94
Show file tree

Hide file tree

Showing 11 changed files with 268 additions and 11 deletions.
diff --git a/Core/src/main/java/org/opensextant/util/TextUtils.java b/Core/src/main/java/org/opensextant/util/TextUtils.java
@@ -1714,7 +1714,7 @@ public static int countNonText(final String t) {
 
         int nonText = 0;
         for (char c : t.toCharArray()) {
-            if (!Character.isLetter(c) && Character.isDigit(c) && Character.isWhitespace(c)) {
+            if (!Character.isLetter(c) && !Character.isDigit(c)) {
                 ++nonText;
             }
         }

diff --git a/Core/src/test/java/TestTextUtils.java b/Core/src/test/java/TestTextUtils.java
@@ -122,7 +122,7 @@ public void testLanguageCodes() {
 
     @Test
     public void testMidEastLanguages() {
-        assertTrue(TextUtils.hasMiddleEasternText("تشییع پیکر سردار شهید سید رض\u200Cالسلام آغازABC 111  " ));
+        assertTrue(TextUtils.hasMiddleEasternText("تشییع پیکر سردار شهید سید رض\u200Cالسلام آغازABC 111  "));
         assertTrue(TextUtils.hasMiddleEasternText("עִבְרִית"));
         assertFalse(TextUtils.hasMiddleEasternText("1 2 3 4 Z Y X "));
     }
@@ -185,6 +185,11 @@ public void testCase() {
         assertTrue(TextUtils.isLower("øh baby") && TextUtils.isUpper("ØH BABY"));
     }
 
+    @Test
+    public void testCharCounting() {
+        assertEquals(1, TextUtils.countNonText("bob bob")); // the single space is the only char that is neither letter nor digit
+    }
+
     @Test
     public void testNumerics() {
         // Valid number patterns.

diff --git a/src/main/java/org/opensextant/extractors/geo/PlaceCandidate.java b/src/main/java/org/opensextant/extractors/geo/PlaceCandidate.java
@@ -62,6 +62,8 @@ public class PlaceCandidate extends TextMatch {
     private boolean anchor = false;
     private String nonDiacriticTextnorm = null;
     private boolean reviewed = false;
+    private boolean hasCJKtext = false;
+    private boolean hasMEtext = false;
 
     public final static String VAL_SAME_COUNTRY = "same-country";
     /**
@@ -90,6 +92,16 @@ public String getNDTextnorm() {
     public void setText(String name) {
         super.setText(name);
         this.nonDiacriticTextnorm = TextUtils.phoneticReduction(getTextnorm(), isASCII());
+        this.hasMEtext = TextUtils.hasMiddleEasternText(name); // script flag recomputed on every setText(); read via hasMiddleEasternText()
+        this.hasCJKtext = TextUtils.hasCJKText(name); // script flag recomputed on every setText(); read via hasCJKText()
+    }
+
+    public boolean hasCJKText() {
+        return this.hasCJKtext; // result of TextUtils.hasCJKText() stored by setText(); false until setText() is called
+    }
+
+    public boolean hasMiddleEasternText() {
+        return this.hasMEtext; // result of TextUtils.hasMiddleEasternText() stored by setText(); false until setText() is called
     }
 
     public boolean isAbbrevLength() {
@@ -768,6 +780,22 @@ public void setPostmatchTokens(String[] toks) {
         this.postTokens = toks;
     }
 
+    public String getSurroundingText() {
+        StringJoiner joiner = new StringJoiner(" ");
+        // Join the pre-match tokens, then the post-match tokens, separated by single spaces; a null token array is skipped.
+        if (getPrematchTokens() != null) {
+            for (String tok : getPrematchTokens()) {
+                joiner.add(tok);
+            }
+        }
+        if (getPostmatchTokens() != null) {
+            for (String tok : getPostmatchTokens()) {
+                joiner.add(tok);
+            }
+        }
+        return joiner.toString(); // empty string when no tokens were added
+    }
+
     /**
      * Given a path, 'a.b' ( province b in country a),
      * see if this name is present there.

diff --git a/src/main/java/org/opensextant/extractors/geo/PlaceGeocoder.java b/src/main/java/org/opensextant/extractors/geo/PlaceGeocoder.java
@@ -36,6 +36,7 @@
 import java.util.*;
 
 import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.common.util.Hash;
 import org.opensextant.ConfigException;
 import org.opensextant.data.*;
 import org.opensextant.extraction.ExtractionException;
@@ -49,6 +50,7 @@
 import org.opensextant.extractors.xtax.TaxonMatcher;
 import org.opensextant.processing.Parameters;
 import org.opensextant.util.GeonamesUtility;
+import org.opensextant.util.TextUtils;
 import org.slf4j.LoggerFactory;
 
 /**
@@ -92,7 +94,20 @@ public class PlaceGeocoder extends GazetteerMatcher
     private NameCodeRule nameWithAdminRule = null;
     private LocationChooserRule chooser = null;
     private ProvinceNameSetter provinceNameSetter = null;
+    private NonLatinNameRule langFilter = null;
 
+    private static final HashSet<String> LANG_SPECIFIC_FILTERS = new HashSet<>(); // language IDs for which extract() runs langFilter
+    static {
+        // Language IDs for Middle Eastern scripts ("ur" is presumably Urdu; it is added as a literal rather than a TextUtils constant):
+        LANG_SPECIFIC_FILTERS.add(TextUtils.arabicLang);
+        LANG_SPECIFIC_FILTERS.add(TextUtils.farsiLang);
+        LANG_SPECIFIC_FILTERS.add("ur");
+        // Language IDs for Chinese (both variants), Korean and Japanese:
+        LANG_SPECIFIC_FILTERS.add(TextUtils.chineseLang);
+        LANG_SPECIFIC_FILTERS.add(TextUtils.chineseTradLang);
+        LANG_SPECIFIC_FILTERS.add(TextUtils.koreanLang);
+        LANG_SPECIFIC_FILTERS.add(TextUtils.japaneseLang);
+    }
     /**
      * A default Geocoding app that demonstrates how to invoke the geocoding pipline
      * start to finish. It makes use of XCoord to parse/geocode coordinates,
@@ -198,6 +213,8 @@ public void configure() throws ConfigException {
         // Otherwise such rules are configured, set during the request, and evaluated
         // adhoc as you need.
         //
+        langFilter = new NonLatinNameRule();
+
         /* assess country names and codes */
         countryRule = new CountryRule();
         countryRule.setCountryObserver(this);
@@ -503,6 +520,10 @@ public List<TextMatch> extract(TextInput input, Parameters jobParams) throws Ext
             // May contain found taxons from known places step above.
             return matches;
         }
+        if (LANG_SPECIFIC_FILTERS.contains(input.langid)) {
+            langFilter.reset();
+            langFilter.evaluate(candidates);
+        }
 
         // Evaluate independent rules, and any that user has added.
         //

diff --git a/src/main/java/org/opensextant/extractors/geo/rules/CountryRule.java b/src/main/java/org/opensextant/extractors/geo/rules/CountryRule.java
@@ -21,6 +21,7 @@
 import org.opensextant.data.Place;
 import org.opensextant.extractors.geo.PlaceCandidate;
 import org.opensextant.extractors.geo.ScoredPlace;
+import org.opensextant.util.TextUtils;
 
 public class CountryRule extends GeocodeRule {
 
@@ -37,11 +38,13 @@ public void evaluate(List<PlaceCandidate> names) {
 
         for (PlaceCandidate name : names) {
             // We do not want mixed case acronym/code/abbreviation matches.
-            if (name.isCountry && !name.isUpper() && name.getLength() < 4) {
-                // Just looking at country codes -- we'll only consider upper case codes if they are short.
-                name.setFilteredOut(true); /* TODO: possibly leave as filtered-in */
-                name.isCountry = false; /* definitely unmark as country */
-                continue;
+            if (name.isCountry){
+                if (!name.isUpper() && name.getLength() < 4 && !(name.hasCJKText() || name.hasMiddleEasternText())) {
+                    // Just looking at country codes -- we'll only consider upper case codes if they are short.
+                    name.setFilteredOut(true); /* TODO: possibly leave as filtered-in */
+                    name.isCountry = false; /* definitely unmark as country */
+                    continue;
+                }
             }
             for (ScoredPlace geo : name.getPlaces()) {
                 if (filterOutByFrequency(name, geo.getPlace())) {

diff --git a/src/main/java/org/opensextant/extractors/geo/rules/NameCodeRule.java b/src/main/java/org/opensextant/extractors/geo/rules/NameCodeRule.java
@@ -126,8 +126,7 @@ private PairValidation validMatch(PlaceCandidate nm) {
 
         remarkAbbreviation(nm);
 
-        validation.nameIsIgnorable = ignoreShortLowercase(nm)
-                || (nm.isAbbreviation && ignoreNonAdminCode(nm));
+        validation.nameIsIgnorable = canIgnore(nm);
         // Short names, lower case will not be assessed at all.
         if (validation.nameIsIgnorable) {
             nm.setFilteredOut(true);
@@ -138,6 +137,17 @@ private PairValidation validMatch(PlaceCandidate nm) {
 
         return validation;
     }
+    private boolean canIgnore(PlaceCandidate mention){
+        if (ignoreShortLowercase(mention)){
+            return true;
+        }
+        // Mention is flagged as an abbreviation (validMatch() calls remarkAbbreviation() before this): defer to ignoreNonAdminCode().
+        if (mention.isAbbreviation) {
+            return ignoreNonAdminCode(mention);
+        }
+
+        return false;
+    }
 
     private boolean validMatchPair(PlaceCandidate nm, PlaceCandidate code, PairValidation v) {
         remarkAbbreviation(code);
@@ -321,7 +331,12 @@ public void evaluate(final List<PlaceCandidate> names) {
             if (name.isFilteredOut()) {
                 continue;
             }
+            if (name.hasCJKText()){
+                continue;
+            }
 
+            // First determine if a given mention makes sense --
+            // filter out mismatched UPPERCASE mentions when context and geotags don't suggest it.
             PairValidation validation = validMatch(name);
             if (!validation.valid) {
                 continue;
@@ -446,7 +461,7 @@ private void remarkAbbreviation(PlaceCandidate pc) {
      * </ul>
      *
      * @param pc match
-     * @return true if non-administrative feature is encountred
+     * @return true if non-administrative feature is encountered
      */
     private static boolean ignoreNonAdminCode(final PlaceCandidate pc) {
         // If found alone, unqualified what happens?

diff --git a/src/main/java/org/opensextant/extractors/geo/rules/NameRule.java b/src/main/java/org/opensextant/extractors/geo/rules/NameRule.java
@@ -3,6 +3,7 @@
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import java.util.StringJoiner;
 
 import org.opensextant.data.Place;
 import org.opensextant.extractors.geo.PlaceCandidate;
@@ -15,6 +16,7 @@ public class NameRule extends GeocodeRule {
     public static final String ADM1 = "QualifiedName.Prov";
     public static final String ADM2 = "QualifiedName.Dist";
     public static final String DIACRITIC = "DiacriticName";
+    public static final String UPPERCASE_NOISE = "Noise.Uppercase";
 
     public static final Set<String> P_prefixes = new HashSet<>();
     public static final Set<String> A1_suffixes = new HashSet<>();
@@ -46,7 +48,12 @@ public void evaluate(List<PlaceCandidate> names) {
                 continue;
             }
 
-            if (!name.isASCII()) {
+            // Check short ASCII names vs. non-ASCII names.
+            if (name.isASCII()) {
+                if (isUppercaseNoise(name)) {
+                    continue;
+                }
+            } else {
                 name.addRule(DIACRITIC);
             }
 
@@ -83,6 +90,19 @@ public void evaluate(List<PlaceCandidate> names) {
         }
     }
 
+    private boolean isUppercaseNoise(PlaceCandidate name) {
+        // A short UPPERCASE name whose surrounding text is not all uppercase, on a mention that is either not valid
+        // or has no related mentions (getRelated() is null), is treated as a random acronym: tag it and filter it out.
+        if (!name.isValid() || name.getRelated()==null) {
+            if (isShort(name.getLength()) && name.isUpper() && !TextUtils.isUpper(name.getSurroundingText())) {
+                name.addRule(UPPERCASE_NOISE);
+                name.setFilteredOut(true);
+                return true;
+            }
+        }
+        return false;
+    }
+
     /**
      * This filter counts "admin code" Places for a given match.  If the match is indeed
      * a short code-like name , e.g., "BS", "MA"... and it is unassociated with a place name, then it is

diff --git a/src/main/java/org/opensextant/extractors/geo/rules/NonLatinNameRule.java b/src/main/java/org/opensextant/extractors/geo/rules/NonLatinNameRule.java
@@ -0,0 +1,42 @@
+package org.opensextant.extractors.geo.rules;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.opensextant.data.Place;
+import org.opensextant.extractors.geo.PlaceCandidate;
+import org.opensextant.extractors.geo.ScoredPlace;
+import org.opensextant.util.TextUtils;
+
+/**
+ * GeocodeRule that filters very short or low-density names; intended to be run only when the document is in a non-Latin script such as C/J/K or Middle Eastern.
+ */
+public class NonLatinNameRule extends GeocodeRule {
+
+    @Override
+    public boolean filterByNameOnly(PlaceCandidate name) {
+
+        // Names shorter than 3 chars that are not marked as a country are filtered out and tagged "Lang.LengthHeuristic":
+        if (name.getLength() < 3 && !name.isCountry) {
+            name.setFilteredOut(true);
+            name.addRule("Lang.LengthHeuristic");
+            return true;
+        } else
+        // Otherwise require a minimum text to non-text char ratio: 3 for CJK names, 5 for Middle Eastern names; any other name is not checked (-1)
+        {
+            int charRatio = name.hasCJKText() ? 3 : name.hasMiddleEasternText() ? 5 : -1;
+            if ( charRatio > 0 && !NonsenseFilter.assessPhraseDensity(name.getText(), charRatio)) {
+                name.addRule("Lang.DensityHeuristic");
+                name.setFilteredOut(true);
+                return true;
+            }
+        }
+        return false;
+    }
+
+    @Override
+    public void evaluate(PlaceCandidate name, Place geo) {
+        /* no-op: this rule does nothing per location; its filtering is in filterByNameOnly() */
+    }
+}
diff --git a/src/main/java/org/opensextant/extractors/geo/rules/NonsenseFilter.java b/src/main/java/org/opensextant/extractors/geo/rules/NonsenseFilter.java
@@ -7,6 +7,7 @@
 import java.util.regex.Pattern;
 
 import org.opensextant.data.Place;
+import org.opensextant.extraction.TextMatch;
 import org.opensextant.extractors.geo.PlaceCandidate;
 import org.opensextant.extractors.geo.ScoredPlace;
 import org.opensextant.util.TextUtils;
@@ -84,10 +85,19 @@ public void evaluate(List<PlaceCandidate> names) {
                 continue;
             }
 
+            if (p.hasMiddleEasternText() || p.hasCJKText()){
+                continue;
+            }
+
             /* Look at valid and invalid punctuation patterns */
             if (assessPunctuation(p)) {
                 continue;
             }
+            if (!assessPhraseDensity(p)) {
+                p.addRule("Noise.LowDensityText");
+                p.setFilteredOut(true);
+                continue;
+            }
 
             if (irregularCase(p.getText())) {
                 p.setFilteredOut(true);
@@ -151,6 +161,37 @@ public void evaluate(List<PlaceCandidate> names) {
     }
 
 
+    /** Names of places should have about N=5 letter/digit chars for every other char (space, punctuation, etc.).
+     * This is the default threshold used by assessPhraseDensity(TextMatch).
+     *   "A BC"  3:1      filtered out.
+     *   "AB CD"  4:1     filtered out.
+     *   "AB BCD"  5:1    possibly acceptable.
+     */
+    public static final int PHRASE_DENSITY_CHAR_RATIO = 5;
+
+    /**
+     * Assesses the text of a match against the default PHRASE_DENSITY_CHAR_RATIO threshold.
+     * @param p match whose getText() value is assessed
+     * @return True if alphanum to non-alphanum content is at or above default threshold
+     */
+    public static boolean assessPhraseDensity(TextMatch p) {
+        return assessPhraseDensity(p.getText(), PHRASE_DENSITY_CHAR_RATIO);
+    }
+
+    /**
+     * Compares the number of text chars in a name to the number of non-text chars reported by TextUtils.countNonText().
+     * @param name text to assess
+     * @param charRatio minimum number of text chars required per non-text char
+     * @return True if alphanum to non-alphanum content is at or above charRatio threshold; always true when there are no non-text chars
+     */
+    public static boolean assessPhraseDensity(String name, int charRatio) {
+        int nonAlpha = TextUtils.countNonText(name);
+        if (nonAlpha==0){
+            return true;
+        }
+        return ((name.length() - nonAlpha) / nonAlpha) >= charRatio; // integer division; against an int threshold the truncation does not change the result
+    }
+
     /**
      * optimize punctuation detection and filtration.
      * This routine marks the candidate as filtered or not, as well as returning a status indicating