Skip to content

Commit

Permalink
3.6.6: noise handling in various language modes
Browse files Browse the repository at this point in the history
  • Loading branch information
mubaldino committed Dec 28, 2023
1 parent 783240e commit b7f2c94
Show file tree
Hide file tree
Showing 11 changed files with 268 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Core/src/main/java/org/opensextant/util/TextUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -1714,7 +1714,7 @@ public static int countNonText(final String t) {

int nonText = 0;
for (char c : t.toCharArray()) {
if (!Character.isLetter(c) && Character.isDigit(c) && Character.isWhitespace(c)) {
if (!Character.isLetter(c) && !Character.isDigit(c)) {
++nonText;
}
}
Expand Down
7 changes: 6 additions & 1 deletion Core/src/test/java/TestTextUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ public void testLanguageCodes() {

@Test
public void testMidEastLanguages() {
assertTrue(TextUtils.hasMiddleEasternText("تشییع پیکر سردار شهید سید رض\u200Cالسلام آغازABC 111 " ));
assertTrue(TextUtils.hasMiddleEasternText("تشییع پیکر سردار شهید سید رض\u200Cالسلام آغازABC 111 "));
assertTrue(TextUtils.hasMiddleEasternText("עִבְרִית"));
assertFalse(TextUtils.hasMiddleEasternText("1 2 3 4 Z Y X "));
}
Expand Down Expand Up @@ -185,6 +185,11 @@ public void testCase() {
assertTrue(TextUtils.isLower("øh baby") && TextUtils.isUpper("ØH BABY"));
}

@Test
public void testCharCounting() {
    // The lone space is the only character that is neither a letter nor a digit.
    final int nonTextCount = TextUtils.countNonText("bob bob");
    assertEquals(1, nonTextCount);
}

@Test
public void testNumerics() {
// Valid number patterns.
Expand Down
28 changes: 28 additions & 0 deletions src/main/java/org/opensextant/extractors/geo/PlaceCandidate.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ public class PlaceCandidate extends TextMatch {
private boolean anchor = false;
private String nonDiacriticTextnorm = null;
private boolean reviewed = false;
private boolean hasCJKtext = false;
private boolean hasMEtext = false;

public final static String VAL_SAME_COUNTRY = "same-country";
/**
Expand Down Expand Up @@ -90,6 +92,16 @@ public String getNDTextnorm() {
public void setText(String name) {
    super.setText(name);

    // Derive the phonetic, diacritic-free form from the normalized text.
    final String normalized = getTextnorm();
    final boolean ascii = isASCII();
    nonDiacriticTextnorm = TextUtils.phoneticReduction(normalized, ascii);

    // Cache script flags for the raw text so the accessors are plain field reads.
    hasMEtext = TextUtils.hasMiddleEasternText(name);
    hasCJKtext = TextUtils.hasCJKText(name);
}

/**
 * Reports the cached result of {@code TextUtils.hasCJKText} for the text
 * most recently passed to {@link #setText(String)}.
 *
 * @return the cached flag; false if no text has been set yet
 */
public boolean hasCJKText() {
    return hasCJKtext;
}

/**
 * Reports the cached result of {@code TextUtils.hasMiddleEasternText} for the
 * text most recently passed to {@link #setText(String)}.
 *
 * @return the cached flag; false if no text has been set yet
 */
public boolean hasMiddleEasternText() {
    return hasMEtext;
}

public boolean isAbbrevLength() {
Expand Down Expand Up @@ -768,6 +780,22 @@ public void setPostmatchTokens(String[] toks) {
this.postTokens = toks;
}

/**
 * Joins the pre-match tokens followed by the post-match tokens into a single
 * space-delimited string. A token array that is null is skipped.
 *
 * @return the surrounding tokens separated by single spaces; an empty string
 *         when neither token array is set
 */
public String getSurroundingText() {
    final StringJoiner context = new StringJoiner(" ");
    final String[][] windows = { getPrematchTokens(), getPostmatchTokens() };
    for (String[] window : windows) {
        if (window == null) {
            continue;
        }
        for (String token : window) {
            context.add(token);
        }
    }
    return context.toString();
}

/**
* Given a path, 'a.b' ( province b in country a),
* see if this name is present there.
Expand Down
21 changes: 21 additions & 0 deletions src/main/java/org/opensextant/extractors/geo/PlaceGeocoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import java.util.*;

import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.util.Hash;
import org.opensextant.ConfigException;
import org.opensextant.data.*;
import org.opensextant.extraction.ExtractionException;
Expand All @@ -49,6 +50,7 @@
import org.opensextant.extractors.xtax.TaxonMatcher;
import org.opensextant.processing.Parameters;
import org.opensextant.util.GeonamesUtility;
import org.opensextant.util.TextUtils;
import org.slf4j.LoggerFactory;

/**
Expand Down Expand Up @@ -92,7 +94,20 @@ public class PlaceGeocoder extends GazetteerMatcher
private NameCodeRule nameWithAdminRule = null;
private LocationChooserRule chooser = null;
private ProvinceNameSetter provinceNameSetter = null;
private NonLatinNameRule langFilter = null;

private static final HashSet<String> LANG_SPECIFIC_FILTERS = new HashSet<>();
static {
// Most MidEast scripts:
LANG_SPECIFIC_FILTERS.add(TextUtils.arabicLang);
LANG_SPECIFIC_FILTERS.add(TextUtils.farsiLang);
LANG_SPECIFIC_FILTERS.add("ur");
// Most Asian scripts:
LANG_SPECIFIC_FILTERS.add(TextUtils.chineseLang);
LANG_SPECIFIC_FILTERS.add(TextUtils.chineseTradLang);
LANG_SPECIFIC_FILTERS.add(TextUtils.koreanLang);
LANG_SPECIFIC_FILTERS.add(TextUtils.japaneseLang);
}
/**
* A default Geocoding app that demonstrates how to invoke the geocoding pipline
* start to finish. It makes use of XCoord to parse/geocode coordinates,
Expand Down Expand Up @@ -198,6 +213,8 @@ public void configure() throws ConfigException {
// Otherwise such rules are configured, set during the request, and evaluated
// adhoc as you need.
//
langFilter = new NonLatinNameRule();

/* assess country names and codes */
countryRule = new CountryRule();
countryRule.setCountryObserver(this);
Expand Down Expand Up @@ -503,6 +520,10 @@ public List<TextMatch> extract(TextInput input, Parameters jobParams) throws Ext
// May contain found taxons from known places step above.
return matches;
}
if (LANG_SPECIFIC_FILTERS.contains(input.langid)) {
langFilter.reset();
langFilter.evaluate(candidates);
}

// Evaluate independent rules, and any that user has added.
//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.opensextant.data.Place;
import org.opensextant.extractors.geo.PlaceCandidate;
import org.opensextant.extractors.geo.ScoredPlace;
import org.opensextant.util.TextUtils;

public class CountryRule extends GeocodeRule {

Expand All @@ -37,11 +38,13 @@ public void evaluate(List<PlaceCandidate> names) {

for (PlaceCandidate name : names) {
// We do not want mixed case acronym/code/abbreviation matches.
if (name.isCountry && !name.isUpper() && name.getLength() < 4) {
// Just looking at country codes -- we'll only consider upper case codes if they are short.
name.setFilteredOut(true); /* TODO: possibly leave as filtered-in */
name.isCountry = false; /* definitely unmark as country */
continue;
if (name.isCountry){
if (!name.isUpper() && name.getLength() < 4 && !(name.hasCJKText() || name.hasMiddleEasternText())) {
// Just looking at country codes -- we'll only consider upper case codes if they are short.
name.setFilteredOut(true); /* TODO: possibly leave as filtered-in */
name.isCountry = false; /* definitely unmark as country */
continue;
}
}
for (ScoredPlace geo : name.getPlaces()) {
if (filterOutByFrequency(name, geo.getPlace())) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,7 @@ private PairValidation validMatch(PlaceCandidate nm) {

remarkAbbreviation(nm);

validation.nameIsIgnorable = ignoreShortLowercase(nm)
|| (nm.isAbbreviation && ignoreNonAdminCode(nm));
validation.nameIsIgnorable = canIgnore(nm);
// Short names, lower case will not be assessed at all.
if (validation.nameIsIgnorable) {
nm.setFilteredOut(true);
Expand All @@ -138,6 +137,17 @@ private PairValidation validMatch(PlaceCandidate nm) {

return validation;
}
/**
 * Decides whether a mention can be ignored outright.
 *
 * @param mention candidate, already passed through remarkAbbreviation by the caller
 * @return true if the short-lowercase check says to ignore it, or if it is
 *         marked as an abbreviation and the non-admin-code check says to ignore it
 */
private boolean canIgnore(PlaceCandidate mention) {
    // The abbreviation check is only consulted when the short-lowercase check fails.
    return ignoreShortLowercase(mention)
            || (mention.isAbbreviation && ignoreNonAdminCode(mention));
}

private boolean validMatchPair(PlaceCandidate nm, PlaceCandidate code, PairValidation v) {
remarkAbbreviation(code);
Expand Down Expand Up @@ -321,7 +331,12 @@ public void evaluate(final List<PlaceCandidate> names) {
if (name.isFilteredOut()) {
continue;
}
if (name.hasCJKText()){
continue;
}

// First determine if a given mention makes sense --
// filter out mis-matched UPPERCASE mentions when context and geotags don't suggest it.
PairValidation validation = validMatch(name);
if (!validation.valid) {
continue;
Expand Down Expand Up @@ -446,7 +461,7 @@ private void remarkAbbreviation(PlaceCandidate pc) {
* </ul>
*
* @param pc match
* @return true if non-administrative feature is encountred
* @return true if non-administrative feature is encountered
*/
private static boolean ignoreNonAdminCode(final PlaceCandidate pc) {
// If found alone, unqualified what happens?
Expand Down
22 changes: 21 additions & 1 deletion src/main/java/org/opensextant/extractors/geo/rules/NameRule.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringJoiner;

import org.opensextant.data.Place;
import org.opensextant.extractors.geo.PlaceCandidate;
Expand All @@ -15,6 +16,7 @@ public class NameRule extends GeocodeRule {
public static final String ADM1 = "QualifiedName.Prov";
public static final String ADM2 = "QualifiedName.Dist";
public static final String DIACRITIC = "DiacriticName";
public static final String UPPERCASE_NOISE = "Noise.Uppercase";

public static final Set<String> P_prefixes = new HashSet<>();
public static final Set<String> A1_suffixes = new HashSet<>();
Expand Down Expand Up @@ -46,7 +48,12 @@ public void evaluate(List<PlaceCandidate> names) {
continue;
}

if (!name.isASCII()) {
// Check short ASCII names vs. non-ASCII names.
if (name.isASCII()) {
if (isUppercaseNoise(name)) {
continue;
}
} else {
name.addRule(DIACRITIC);
}

Expand Down Expand Up @@ -83,6 +90,19 @@ public void evaluate(List<PlaceCandidate> names) {
}
}

/**
 * Flags a short, all-uppercase mention as noise when its surrounding text is
 * not uppercase and the mention is either invalid or has no related mentions.
 * Such a mention is treated as a stray acronym: it is tagged with
 * {@link #UPPERCASE_NOISE} and filtered out.
 *
 * @param name candidate to inspect; marked as filtered out when judged noise
 * @return true if the candidate was filtered out here
 */
private boolean isUppercaseNoise(PlaceCandidate name) {
    final boolean unlinked = !name.isValid() || name.getRelated() == null;
    if (!unlinked) {
        return false;
    }

    final boolean shortUppercase = isShort(name.getLength()) && name.isUpper();
    if (!shortUppercase || TextUtils.isUpper(name.getSurroundingText())) {
        return false;
    }

    name.addRule(UPPERCASE_NOISE);
    name.setFilteredOut(true);
    return true;
}

/**
* This filter counts "admin code" Places for a given match. If the match is indeed
* a short code-like name , e.g., "BS", "MA"... and it is unassociated with a place name, then it is
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.opensextant.extractors.geo.rules;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.opensextant.data.Place;
import org.opensextant.extractors.geo.PlaceCandidate;
import org.opensextant.extractors.geo.ScoredPlace;
import org.opensextant.util.TextUtils;

/**
 * GeocodeRule called only if document is non-Latin such as C/J/K or Middle Eastern scripts.
 * It filters by name alone; the per-location evaluation is a no-op.
 */
public class NonLatinNameRule extends GeocodeRule {

    /**
     * Filters out names that are very short or that fail the phrase density check.
     *
     * @param name candidate to assess; marked filtered out and tagged with a rule when rejected
     * @return true if the candidate was filtered out
     */
    @Override
    public boolean filterByNameOnly(PlaceCandidate name) {
        // Names under three characters are dropped unless they are countries.
        if (name.getLength() < 3 && !name.isCountry) {
            name.setFilteredOut(true);
            name.addRule("Lang.LengthHeuristic");
            return true;
        }

        // Assess general alpha to non-alpha content in the name.
        final int minRatio = densityRatioFor(name);
        if (minRatio > 0 && !NonsenseFilter.assessPhraseDensity(name.getText(), minRatio)) {
            name.addRule("Lang.DensityHeuristic");
            name.setFilteredOut(true);
            return true;
        }
        return false;
    }

    /**
     * Picks the ratio handed to the phrase density check for this name's script.
     *
     * @return 3 for names flagged as CJK, 5 for names flagged as Middle Eastern,
     *         -1 when neither flag is set (density is then not assessed)
     */
    private static int densityRatioFor(PlaceCandidate name) {
        if (name.hasCJKText()) {
            return 3;
        }
        if (name.hasMiddleEasternText()) {
            return 5;
        }
        return -1;
    }

    @Override
    public void evaluate(PlaceCandidate name, Place geo) {
        /* no-op */
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.util.regex.Pattern;

import org.opensextant.data.Place;
import org.opensextant.extraction.TextMatch;
import org.opensextant.extractors.geo.PlaceCandidate;
import org.opensextant.extractors.geo.ScoredPlace;
import org.opensextant.util.TextUtils;
Expand Down Expand Up @@ -84,10 +85,19 @@ public void evaluate(List<PlaceCandidate> names) {
continue;
}

if (p.hasMiddleEasternText() || p.hasCJKText()){
continue;
}

/* Look at valid and invalid punctuation patterns */
if (assessPunctuation(p)) {
continue;
}
if (!assessPhraseDensity(p)) {
p.addRule("Noise.LowDensityText");
p.setFilteredOut(true);
continue;
}

if (irregularCase(p.getText())) {
p.setFilteredOut(true);
Expand Down Expand Up @@ -151,6 +161,37 @@ public void evaluate(List<PlaceCandidate> names) {
}


/** Names of places should have a ratio of about N=5 alphanumeric chars to each non-alphanumeric char.
*
* "A BC" 3:1 filtered out.
* "AB CD" 4:1 filtered out.
* "AB BCD" 5:1 possibly acceptable.
*/
public static final int PHRASE_DENSITY_CHAR_RATIO = 5;

/**
 * Assesses the phrase density of a match's text against the default ratio,
 * {@link #PHRASE_DENSITY_CHAR_RATIO}.
 *
 * @param p match whose text is assessed
 * @return True if alphanum to non-alphanum content is at or above default threshold
 */
public static boolean assessPhraseDensity(TextMatch p) {
    final String text = p.getText();
    return assessPhraseDensity(text, PHRASE_DENSITY_CHAR_RATIO);
}

/**
 * Compares the ratio of text to non-text characters in a name against a threshold.
 * Non-text characters are those counted by {@code TextUtils.countNonText}; the
 * remaining characters of the name count as text. The ratio uses integer division,
 * so any fractional part is discarded before the comparison.
 *
 * @param name      text to assess
 * @param charRatio minimum acceptable ratio of text to non-text characters
 * @return True if alphanum to non-alphanum content is at or above charRatio threshold;
 *         always true when the name has no non-text characters
 */
public static boolean assessPhraseDensity(String name, int charRatio) {
    final int nonTextCount = TextUtils.countNonText(name);
    if (nonTextCount != 0) {
        final int textCount = name.length() - nonTextCount;
        return (textCount / nonTextCount) >= charRatio;
    }
    // Nothing but text characters: density is as high as it can be.
    return true;
}

/**
* optimize punctuation detection and filtration.
* This routine marks the candidate as filtered or not, as well as returning a status indicating
Expand Down
Loading

0 comments on commit b7f2c94

Please sign in to comment.