diff --git a/RELEASE.md b/RELEASE.md index 89a99856..66e12490 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -13,6 +13,9 @@ Visit https://github.com/OpenSextant/Xponents/releases for latest library releas were replaced. * Migrated to a separate repo: Find this now maintained separately at [`Xponents-Core` repo](https://github.com/OpenSextant/Xponents-Core) This is the first release ./Core/ folder will not appear in this source tree. +* **Xponents REST**: + * Noise filtering: added `minlen` parameter to Xlayer (Xponents REST) service. By default it filters out trivial stuff. About 3-5% reduction in + noise tags from codes, short names, etc. -- things that are typically neither useful geocodings nor names of places. **Xponents 3.6.7**: Springy * **Core API** diff --git a/pom.xml b/pom.xml index 3c94e50d..c50bd36a 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 org.opensextant opensextant-xponents - 3.7.3 + 3.7.4 jar OpenSextant Xponents An information extraction toolkit focused on geography and temporal entities diff --git a/src/main/java/org/opensextant/output/Transforms.java b/src/main/java/org/opensextant/output/Transforms.java index 5660f508..9861c370 100644 --- a/src/main/java/org/opensextant/output/Transforms.java +++ b/src/main/java/org/opensextant/output/Transforms.java @@ -296,6 +296,17 @@ private static JsonObject populateMatch(final TextMatch m) { return o; } + private static JsonArray populateTaxon(List taxons) { + JsonArray arr = new JsonArray(); + for (Taxon t : taxons) { + JsonObject tj = new JsonObject(); + tj.put("cat", t.catalog); + tj.put("taxon", t.name); + arr.add(tj); + } + return arr; + } + /** * Return seconds of epoch. * @@ -349,8 +360,11 @@ public static JsonObject toJSON(final List matches, final Parameters // Only get one taxon from this match. That is sufficient, but not perfect. 
Taxon n = match.getTaxons().get(0); JsonObject node = populateMatch(name); - node.put("taxon", n.name); // Name of taxon + node.put("taxon", n.name); // Name of taxon, just the first and most common. node.put("catalog", n.catalog); // Name of catalog or source + if (match.getTaxons().size() > 1) { + node.put("all-taxons", populateTaxon(match.getTaxons())); // Every taxon on this match, as catalog/name pairs. + } node.put("method", "TaxonMatcher"); resultArray.add(node); } diff --git a/src/main/java/org/opensextant/xlayer/server/TaggerResource.java b/src/main/java/org/opensextant/xlayer/server/TaggerResource.java index a3a8fedf..33fec9c3 100644 --- a/src/main/java/org/opensextant/xlayer/server/TaggerResource.java +++ b/src/main/java/org/opensextant/xlayer/server/TaggerResource.java @@ -128,7 +128,7 @@ protected void parseParameters(Parameters p, Set kv) { } // Request tagging on demand. - p.tag_all_taxons = kv.contains("all-taxons"); + p.tag_all_taxons = kv.contains("all_taxons"); p.tag_taxons = (kv.contains("taxons") || kv.contains("orgs") || kv.contains("persons")); p.tag_patterns = kv.contains("patterns") || kv.contains("dates"); diff --git a/src/main/java/org/opensextant/xlayer/server/xgeo/XlayerRestlet.java b/src/main/java/org/opensextant/xlayer/server/xgeo/XlayerRestlet.java index 557a649b..36366bec 100644 --- a/src/main/java/org/opensextant/xlayer/server/xgeo/XlayerRestlet.java +++ b/src/main/java/org/opensextant/xlayer/server/xgeo/XlayerRestlet.java @@ -7,6 +7,7 @@ import org.opensextant.extraction.MatchFilter; import org.opensextant.extractors.geo.PlaceGeocoder; import org.opensextant.extractors.geo.PostalGeocoder; +import org.opensextant.extractors.xtax.TaxonMatcher; import org.opensextant.extractors.xtemporal.XTemporal; import org.opensextant.processing.Parameters; import org.opensextant.xlayer.server.XlayerApp; @@ -39,9 +40,10 @@ public synchronized Restlet createInboundRoot() { banner(); configure(); Context ctx = getContext(); - 
ctx.getAttributes().put("xgeo", tagger); - ctx.getAttributes().put("xtemp", dateTagger); - ctx.getAttributes().put("xpostal", postalGeocoder); + ctx.getAttributes().put(XponentsGeotagger.GEO_TAGGER, tagger); + ctx.getAttributes().put(XponentsGeotagger.DATE_TAGGER, dateTagger); + ctx.getAttributes().put(XponentsGeotagger.POSTAL_TAGGER, postalGeocoder); + ctx.getAttributes().put(XponentsGeotagger.TAXON_TAGGER, phraseTagger); ctx.getAttributes().put("version", this.version); info("%%%% Xponents Geo Phase Configured"); @@ -59,6 +61,7 @@ public synchronized Restlet createInboundRoot() { private PlaceGeocoder tagger = null; private XTemporal dateTagger = null; private PostalGeocoder postalGeocoder = null; + private TaxonMatcher phraseTagger = null; /** * @throws ConfigException @@ -73,6 +76,8 @@ public void configure() throws ConfigException { tagger.setParameters(taggerParams); tagger.configure(); + phraseTagger = new TaxonMatcher(); + // TODO: refine this filter list. Use "/filters/non-placenames,user.csv" going forward. 
// String userFilterPath = "/filters/non-placenames,user.csv"; diff --git a/src/main/java/org/opensextant/xlayer/server/xgeo/XponentsGeotagger.java b/src/main/java/org/opensextant/xlayer/server/xgeo/XponentsGeotagger.java index f7e658cf..23321141 100644 --- a/src/main/java/org/opensextant/xlayer/server/xgeo/XponentsGeotagger.java +++ b/src/main/java/org/opensextant/xlayer/server/xgeo/XponentsGeotagger.java @@ -12,6 +12,7 @@ import org.opensextant.extractors.geo.PlaceCandidate; import org.opensextant.extractors.geo.PlaceGeocoder; import org.opensextant.extractors.geo.PostalGeocoder; +import org.opensextant.extractors.xtax.TaxonMatcher; import org.opensextant.extractors.xtemporal.XTemporal; import org.opensextant.output.Transforms; import org.opensextant.processing.Parameters; @@ -157,25 +158,28 @@ public Representation process(TextInput input, Parameters jobParams) { List matches = new ArrayList<>(); // BOTH geo and taxons could be requested: features = "geo", "all-taxons" - // `geo` tagging - if (tag_geo(jobParams) || tag_taxons(jobParams)) { + if (!tag_geo(jobParams) && tag_taxons(jobParams)) { + // Taxonomic tags only + TaxonMatcher phraseTagger = (TaxonMatcher) getExtractor(TAXON_TAGGER); + matches.addAll(phraseTagger.extract(input, jobParams)); + } else if (tag_geo(jobParams) || tag_taxons(jobParams)) { + // Geotagging PlaceGeocoder xgeo = (PlaceGeocoder) getExtractor(GEO_TAGGER); matches.addAll(xgeo.extract(input, jobParams)); } + if (jobParams.tag_patterns) { XTemporal xt = (XTemporal) getExtractor(DATE_TAGGER); matches.addAll(xt.extract(input)); } if (jobParams.tag_postal) { PostalGeocoder pg = (PostalGeocoder) getExtractor(POSTAL_TAGGER); - if (pg != null) { - if (tag_geo(jobParams)) { - // OPTIMIZATION: reuse matches accumulated so far to prevent - // PostalGeocoder from repeating extract() - pg.setGeneralMatches(matches); - } - matches.addAll(pg.extract(input)); + if (tag_geo(jobParams)) { + // OPTIMIZATION: reuse matches accumulated so far to prevent + 
// PostalGeocoder from repeating extract() + pg.setGeneralMatches(matches); } + matches.addAll(pg.extract(input)); } if (isDebug()) { debug(String.format("CURRENT MEM USAGE(K)=%d", RuntimeTools.reportMemory())); diff --git a/src/main/resources/banner.txt b/src/main/resources/banner.txt index 35c8171b..c774d196 100644 --- a/src/main/resources/banner.txt +++ b/src/main/resources/banner.txt @@ -12,5 +12,5 @@ // // Xponents Geotagger version 3.7 (2024-Q1) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| -VERSION: 3.7.3 +VERSION: 3.7.4