Skip to content

Commit

Permalink
Xponents REST API - noise filter in place, use a minimum length strat…
Browse files Browse the repository at this point in the history
…egy to filter out unqualified names/codes
  • Loading branch information
mubaldino committed May 21, 2024
1 parent ea1a886 commit f528581
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 18 deletions.
9 changes: 2 additions & 7 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.opensextant</groupId>
<artifactId>opensextant-xponents</artifactId>
<version>3.7.2</version>
<version>3.7.3</version>
<packaging>jar</packaging>
<name>OpenSextant Xponents</name>
<description>An information extraction toolkit focused on geography and temporal entities</description>
Expand Down Expand Up @@ -53,7 +53,7 @@
<slf4j.version>2.0.12</slf4j.version>
<log4j.version>2.23.0</log4j.version>
<restlet.version>2.4.3</restlet.version>
<xponents.version>3.7.2</xponents.version>
<xponents.version>3.7.3</xponents.version>
</properties>
<!-- OSS Sonatype instructions: list repositories -->
<distributionManagement>
Expand Down Expand Up @@ -189,11 +189,6 @@
<artifactId>spatial4j</artifactId>
<version>0.8</version>
</dependency>
<dependency>
<groupId>com.google.code.findbugs</groupId>
<artifactId>findbugs-annotations</artifactId>
<version>3.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>fluent-hc</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ protected Parameters fromRequest(Form inputs) {
if (fmt != null) {
job.addOutputFormat(fmt);
}
String len = inputs.getFirstValue("minlen");
if (len != null){
job.minimum_tag_len = Integer.parseInt(len);
}

return job;
}
Expand Down Expand Up @@ -216,6 +220,9 @@ protected Parameters fromRequest(JSONObject inputs) throws JSONException {
if (job.clean_input || job.tag_lowercase) {
job.isdefault = false;
}
if (inputs.has("minlen")){
job.minimum_tag_len = inputs.getInt("minlen");
}

return job;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,8 @@ public Representation process(TextInput input, Parameters jobParams) {
if (isDebug()) {
debug(String.format("CURRENT MEM USAGE(K)=%d", RuntimeTools.reportMemory()));
}
filter(matches, jobParams);

filter(matches, jobParams, input);
/*
* transform matches as JSON output.
*/
Expand All @@ -192,17 +193,32 @@ public Representation process(TextInput input, Parameters jobParams) {
}
}

private void filter(List<TextMatch> matches, Parameters jobParams) {
/**
* Complex filters -- needed primarily because of the addition of POSTAL feature tagging, which introduces
* significant noise that requires a lot of special post-processing logic.
*
* @param matches
* @param jobParams
*/
private void filter(List<TextMatch> matches, Parameters jobParams, TextInput signal) {

// Language filter for text spans.
boolean isGeneralLang = true;
if (signal.getCharacterization() != null) {
isGeneralLang = !(signal.getCharacterization().hasCJK || signal.getCharacterization().hasMiddleEastern);
}

for (TextMatch m : matches) {
// Big loop for conditionals... Only one special condition currently:
// Big loop for conditionals...
//
// 1. IF Caller is not asking for "codes" output.... the omit any postal codes or state/ADM1 codes
// that are not fully resolved.
if (!jobParams.tag_codes) {
if (m instanceof PlaceCandidate) {
PlaceCandidate place = (PlaceCandidate) m;
Place resolvedPlace = place.getChosenPlace();
if (m instanceof PlaceCandidate) {
PlaceCandidate place = (PlaceCandidate) m;
Place resolvedPlace = place.getChosenPlace();
boolean validCountry = place.isCountry && place.getLength() > 2;
boolean no_qualifying_geolocation = !place.hasResolvedRelated();
if (!jobParams.tag_codes) {
// If the caller is not asking for "codes" output, then omit any postal codes or state/ADM1 codes
// that are not fully resolved.
if (resolvedPlace != null && resolvedPlace.isCode()) {
// This condition differentiates matches -- looking to evaluate only inferred places that are codes.
// Cases: CODE -- Bare CODE. Although resolved, it's likely noise.
Expand All @@ -211,11 +227,21 @@ private void filter(List<TextMatch> matches, Parameters jobParams) {
// Filter out non-Postal codes if user is not requesting "codes" to be listed.
if (!qualified && !GeonamesUtility.isPostal(resolvedPlace)) {
place.setFilteredOut(true);
} else if (place.isShortName() && !place.hasResolvedRelated()) {
} else if (place.isShortName() && no_qualifying_geolocation) {
place.setFilteredOut(true);
}
}
}
if (no_qualifying_geolocation) {
// When a place has no related geography and is a form of code, abbreviation, or other noise, omit it. Cases:
// 1. the place represents a POSTAL area
// 2. the match has a trivial length (but is not a country). This applies only to non-CJK, non-Middle Eastern text
if (place.hasPostal()) {
place.setFilteredOut(true);
} else if (m.getLength() < jobParams.minimum_tag_len && !validCountry && isGeneralLang) {
place.setFilteredOut(true);
}
}
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/banner.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@
//
// Xponents Geotagger version 3.7 (2024-Q1)
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
VERSION: 3.7.1
VERSION: 3.7.3

0 comments on commit f528581

Please sign in to comment.