Skip to content

Commit

Permalink
Xponents REST: behavior with phrase tagging needs improvement; here a…
Browse files Browse the repository at this point in the history
…ll taxons are reported
  • Loading branch information
mubaldino committed Jun 10, 2024
1 parent 89e4f8b commit 947a78a
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 16 deletions.
3 changes: 3 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ Visit https://github.com/OpenSextant/Xponents/releases for latest library releas
were replaced.
* Migrated to a separate repo: Find this now maintained separately at [`Xponents-Core` repo](https://github.com/OpenSextant/Xponents-Core)
This is the first release in which the ./Core/ folder does not appear in this source tree.
* **Xponents REST**:
* Noise filtering: added a `minlen` parameter to the Xlayer (Xponents REST) service. By default it filters out trivial matches, giving about a 3-5% reduction in
noise tags from codes, short names, etc. -- things that are typically neither useful geocodings nor names of places.

**Xponents 3.6.7**: Springy
* **Core API**
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.opensextant</groupId>
<artifactId>opensextant-xponents</artifactId>
<version>3.7.3</version>
<version>3.7.4</version>
<packaging>jar</packaging>
<name>OpenSextant Xponents</name>
<description>An information extraction toolkit focused on geography and temporal entities</description>
Expand Down
16 changes: 15 additions & 1 deletion src/main/java/org/opensextant/output/Transforms.java
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,17 @@ private static JsonObject populateMatch(final TextMatch m) {
return o;
}

/**
 * Convert a list of taxons into a JSON array, one JSON object per taxon.
 * Each object carries two keys: {@code "cat"} (the taxon's catalog) and
 * {@code "taxon"} (the taxon's name). Entries appear in the iteration
 * order of the given list.
 *
 * @param taxons taxons to render
 * @return array of {@code {"cat": ..., "taxon": ...}} objects
 */
private static JsonArray populateTaxon(List<Taxon> taxons) {
    final JsonArray rendered = new JsonArray();
    taxons.forEach(taxon -> {
        final JsonObject entry = new JsonObject();
        entry.put("cat", taxon.catalog);
        entry.put("taxon", taxon.name);
        rendered.add(entry);
    });
    return rendered;
}

/**
* Return seconds of epoch.
*
Expand Down Expand Up @@ -349,8 +360,11 @@ public static JsonObject toJSON(final List<TextMatch> matches, final Parameters
// Only get one taxon from this match. That is sufficient, but not perfect.
Taxon n = match.getTaxons().get(0);
JsonObject node = populateMatch(name);
node.put("taxon", n.name); // Name of taxon
node.put("taxon", n.name); // Name of taxon, just the first and most common.
node.put("catalog", n.catalog); // Name of catalog or source
if (match.getTaxons().size() > 1) {
node.put("all-taxons", populateTaxon(match.getTaxons())); // Every taxon on this match, each as a cat/taxon pair.
}
node.put("method", "TaxonMatcher");
resultArray.add(node);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ protected void parseParameters(Parameters p, Set<String> kv) {
}

// Request tagging on demand.
p.tag_all_taxons = kv.contains("all-taxons");
p.tag_all_taxons = kv.contains("all_taxons");
p.tag_taxons = (kv.contains("taxons") || kv.contains("orgs") || kv.contains("persons"));
p.tag_patterns = kv.contains("patterns") || kv.contains("dates");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.opensextant.extraction.MatchFilter;
import org.opensextant.extractors.geo.PlaceGeocoder;
import org.opensextant.extractors.geo.PostalGeocoder;
import org.opensextant.extractors.xtax.TaxonMatcher;
import org.opensextant.extractors.xtemporal.XTemporal;
import org.opensextant.processing.Parameters;
import org.opensextant.xlayer.server.XlayerApp;
Expand Down Expand Up @@ -39,9 +40,10 @@ public synchronized Restlet createInboundRoot() {
banner();
configure();
Context ctx = getContext();
ctx.getAttributes().put("xgeo", tagger);
ctx.getAttributes().put("xtemp", dateTagger);
ctx.getAttributes().put("xpostal", postalGeocoder);
ctx.getAttributes().put(XponentsGeotagger.GEO_TAGGER, tagger);
ctx.getAttributes().put(XponentsGeotagger.DATE_TAGGER, dateTagger);
ctx.getAttributes().put(XponentsGeotagger.POSTAL_TAGGER, postalGeocoder);
ctx.getAttributes().put(XponentsGeotagger.TAXON_TAGGER, phraseTagger);

ctx.getAttributes().put("version", this.version);
info("%%%% Xponents Geo Phase Configured");
Expand All @@ -59,6 +61,7 @@ public synchronized Restlet createInboundRoot() {
private PlaceGeocoder tagger = null;
private XTemporal dateTagger = null;
private PostalGeocoder postalGeocoder = null;
private TaxonMatcher phraseTagger = null;

/**
* @throws ConfigException
Expand All @@ -73,6 +76,8 @@ public void configure() throws ConfigException {
tagger.setParameters(taggerParams);
tagger.configure();

phraseTagger = new TaxonMatcher();

// TODO: refine this filter list. Use "/filters/non-placenames,user.csv" going forward.
//
String userFilterPath = "/filters/non-placenames,user.csv";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.opensextant.extractors.geo.PlaceCandidate;
import org.opensextant.extractors.geo.PlaceGeocoder;
import org.opensextant.extractors.geo.PostalGeocoder;
import org.opensextant.extractors.xtax.TaxonMatcher;
import org.opensextant.extractors.xtemporal.XTemporal;
import org.opensextant.output.Transforms;
import org.opensextant.processing.Parameters;
Expand Down Expand Up @@ -157,25 +158,28 @@ public Representation process(TextInput input, Parameters jobParams) {
List<TextMatch> matches = new ArrayList<>();

// BOTH geo and taxons could be requested: features = "geo", "all_taxons"
// `geo` tagging
if (tag_geo(jobParams) || tag_taxons(jobParams)) {
if (!tag_geo(jobParams) && tag_taxons(jobParams)) {
// Taxonomic tags only
TaxonMatcher phraseTagger = (TaxonMatcher) getExtractor(TAXON_TAGGER);
matches.addAll(phraseTagger.extract(input, jobParams));
} else if (tag_geo(jobParams) || tag_taxons(jobParams)) {
// Geotagging
PlaceGeocoder xgeo = (PlaceGeocoder) getExtractor(GEO_TAGGER);
matches.addAll(xgeo.extract(input, jobParams));
}

if (jobParams.tag_patterns) {
XTemporal xt = (XTemporal) getExtractor(DATE_TAGGER);
matches.addAll(xt.extract(input));
}
if (jobParams.tag_postal) {
PostalGeocoder pg = (PostalGeocoder) getExtractor(POSTAL_TAGGER);
if (pg != null) {
if (tag_geo(jobParams)) {
// OPTIMIZATION: reuse matches accumulated so far to prevent
// PostalGeocoder from repeating extract()
pg.setGeneralMatches(matches);
}
matches.addAll(pg.extract(input));
if (tag_geo(jobParams)) {
// OPTIMIZATION: reuse matches accumulated so far to prevent
// PostalGeocoder from repeating extract()
pg.setGeneralMatches(matches);
}
matches.addAll(pg.extract(input));
}
if (isDebug()) {
debug(String.format("CURRENT MEM USAGE(K)=%d", RuntimeTools.reportMemory()));
Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/banner.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@
//
// Xponents Geotagger version 3.7 (2024-Q1)
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
VERSION: 3.7.3
VERSION: 3.7.4

0 comments on commit 947a78a

Please sign in to comment.