-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
3.6.6: noise handling in various language modes
- Loading branch information
Showing
11 changed files
with
268 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
42 changes: 42 additions & 0 deletions
42
src/main/java/org/opensextant/extractors/geo/rules/NonLatinNameRule.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package org.opensextant.extractors.geo.rules; | ||
|
||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
import org.opensextant.data.Place; | ||
import org.opensextant.extractors.geo.PlaceCandidate; | ||
import org.opensextant.extractors.geo.ScoredPlace; | ||
import org.opensextant.util.TextUtils; | ||
|
||
/** | ||
* GeocodeRule called only if document is non-Latin such as C/J/K or MiddleEastern scripts. | ||
*/ | ||
public class NonLatinNameRule extends GeocodeRule { | ||
|
||
@Override | ||
public boolean filterByNameOnly(PlaceCandidate name) { | ||
|
||
// Assess lesser known places if only two chars long or so: | ||
if (name.getLength() < 3 && !name.isCountry) { | ||
name.setFilteredOut(true); | ||
name.addRule("Lang.LengthHeuristic"); | ||
return true; | ||
} else | ||
// Assess general alpha to non-alpha content in the name | ||
{ | ||
int charRatio = name.hasCJKText() ? 3 : name.hasMiddleEasternText() ? 5 : -1; | ||
if ( charRatio > 0 && !NonsenseFilter.assessPhraseDensity(name.getText(), charRatio)) { | ||
name.addRule("Lang.DensityHeuristic"); | ||
name.setFilteredOut(true); | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
@Override | ||
public void evaluate(PlaceCandidate name, Place geo) { | ||
/* no-op */ | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.