-
Notifications
You must be signed in to change notification settings - Fork 3
/
RuleBasedTranscriber.kt
137 lines (123 loc) · 6.76 KB
/
RuleBasedTranscriber.kt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
package com.github.medavox.ipa_transcribers
import com.github.medavox.ipa_transcribers.err
import com.github.medavox.ipa_transcribers.unicodeName
/**
 * Base class for transcribers that turn an input string into an output string by
 * repeatedly applying [IRule]s to the front of the not-yet-processed input.
 *
 * A rule applies when its `unconsumedMatcher` matches at the very start of the remaining input
 * and, if the rule declares a `consumedMatcher`, that matcher's last match in the
 * already-consumed input ends exactly at the end of it.
 * The rule's output is produced by its `outputString` function; it is not interpreted as a regex.
 *
 * The rule engines ([processWithRules] and [processGreedily]) keep their working state in local
 * variables. The only state stored on an instance is the set of characters which
 * [reportOnceAndCopy] has already reported.
 */
abstract class RuleBasedTranscriber : Transcriber, BaseRules {
    abstract val completionStatus: CompletionStatus

    /**
     * The result of handling input that no rule matched.
     *
     * @property newWorkingInput the input that still remains to be processed afterwards
     * @property output maps the output produced so far to the new output
     */
    data class UnmatchedOutput(val newWorkingInput: String, val output: (soFar: String) -> String) {
        /** Convenience form: appends the fixed string [output] to the output produced so far. */
        constructor(newWorkingInput: String, output: String) :
            this(newWorkingInput, { soFar -> soFar + output })
    }

    /**
     * A rule that is applicable at the current position, together with its match data.
     *
     * `specificity` is the number of characters matched at the start of the remaining input
     * plus the number of characters matched at the end of the consumed input.
     */
    private class Candidate(val rule: IRule, val unconsumed: MatchResult, contextLength: Int) {
        val specificity: Int = unconsumed.value.length + contextLength
    }

    /** Characters for which [reportOnceAndCopy] has already printed a message. */
    private val reportedChars = mutableSetOf<Char>()

    /**
     * Copies the first character of [it] to the output, printing a message to `err`
     * the first time each distinct character is seen by this instance.
     *
     * [it] must not be empty.
     */
    fun reportOnceAndCopy(it: String): UnmatchedOutput {
        val unknown = it[0]
        if (unknown !in reportedChars) {
            err.println("copying unknown char '$unknown'/'${unknown.toInt().unicodeName}' to output...")
            reportedChars += unknown
        }
        return UnmatchedOutput(it.substring(1), unknown.toString())
    }

    /** Prints a message to `err` and drops the first character of the (non-empty) unmatched input. */
    val reportAndSkip: (String) -> UnmatchedOutput get() = { unmatched ->
        err.println("unknown char '${unmatched[0]}'; skipping...")
        UnmatchedOutput(unmatched.substring(1), "")
    }

    /** Prints a message to `err` and copies the first character of the (non-empty) unmatched input to the output. */
    val reportAndCopy: (String) -> UnmatchedOutput get() = { unmatched ->
        err.println("copying unknown char '${unmatched[0]}' to output...")
        UnmatchedOutput(unmatched.substring(1), unmatched[0].toString())
    }

    /** Silently copies the first character of the (non-empty) unmatched input to the output. */
    val copy: (String) -> UnmatchedOutput get() = { unmatched ->
        UnmatchedOutput(unmatched.substring(1), unmatched[0].toString())
    }

    /**
     * Applies, at each position, the most *specific* applicable rule -- which is not necessarily
     * the rule that consumes the most characters.
     *
     * Specificity is the length of the rule's match at the start of the remaining input plus the
     * length of its match at the end of the already-consumed input (0 if the rule has no
     * `consumedMatcher`). A rule that matches three characters but is declared to consume only one
     * therefore beats a rule that matches and consumes two.
     * When several rules are equally specific, the one that appears first in [rules] wins.
     *
     * A rule which declares a `consumedMatcher` that does not match the consumed input is never
     * applied. If the chosen rule consumes no characters, its output is emitted and the next most
     * specific rule is tried at the same position; if no applicable rule consumes anything,
     * [onNoRuleMatch] is called, exactly as in [processWithRules].
     *
     * @param rules the rules to choose from
     * @param onNoRuleMatch called with the remaining input when no rule advanced the position;
     *   it should return a shorter `newWorkingInput`, otherwise processing cannot terminate.
     *   Input it handles is not added to the consumed context seen by `consumedMatcher`s.
     * @return the accumulated output
     */
    fun String.processGreedily(rules: List<IRule>, onNoRuleMatch: (unmatched: String) -> UnmatchedOutput): String =
        applyRules(this, onNoRuleMatch) { remaining, consumed ->
            rules.mapNotNull { candidateFor(it, remaining, consumed) }
                // the sort is stable, so equally specific rules keep their order in `rules`
                .sortedByDescending { it.specificity }
                .asSequence()
        }

    /**
     * Applies, at each position, the first rule in [rules] that is applicable -- so rule order matters.
     *
     * If an applicable rule consumes no characters, its output is emitted and the search continues
     * with the rules after it; if no applicable rule consumes anything, [onNoRuleMatch] is called.
     *
     * @param rules the rules, in priority order
     * @param onNoRuleMatch called with the remaining input when no rule advanced the position;
     *   it should return a shorter `newWorkingInput`, otherwise processing cannot terminate.
     *   Input it handles is not added to the consumed context seen by `consumedMatcher`s.
     * @return the accumulated output
     */
    fun String.processWithRules(rules: List<IRule>, onNoRuleMatch: (unmatched: String) -> UnmatchedOutput): String =
        applyRules(this, onNoRuleMatch) { remaining, consumed ->
            // lazy, so rules after the first consuming one are never evaluated
            rules.asSequence().mapNotNull { candidateFor(it, remaining, consumed) }
        }

    /**
     * Returns the match data for [rule] if it is applicable at the current position, else null.
     *
     * Applicable means: the unconsumed matcher matches at index 0 of [remaining], and the consumed
     * matcher is either absent or its last match in [consumed] ends at the end of [consumed].
     */
    private fun candidateFor(rule: IRule, remaining: String, consumed: String): Candidate? {
        val unconsumed = rule.unconsumedMatcher.find(remaining) ?: return null
        if (unconsumed.range.first != 0) return null

        // a rule without a consumed matcher places no constraint on what came before
        val contextMatcher = rule.consumedMatcher ?: return Candidate(rule, unconsumed, 0)

        // a declared consumed matcher that finds nothing means the rule does NOT apply
        val context = contextMatcher.findAll(consumed).lastOrNull() ?: return null
        if (context.range.last != consumed.length - 1) return null
        return Candidate(rule, unconsumed, context.value.length)
    }

    /**
     * Shared driver for both rule engines.
     *
     * At each position the candidates supplied by [candidatesFor] are applied in order until one
     * of them consumes at least one character; if none does, [onNoRuleMatch] decides how to go on.
     */
    private fun applyRules(
        input: String,
        onNoRuleMatch: (unmatched: String) -> UnmatchedOutput,
        candidatesFor: (remaining: String, consumed: String) -> Sequence<Candidate>
    ): String {
        var out = ""
        var remaining = input
        var consumed = ""
        while (remaining.isNotEmpty()) {
            var advanced = false
            for (candidate in candidatesFor(remaining, consumed)) {
                out = candidate.rule.outputString(out, candidate.unconsumed.groups)
                // number of letters consumed is the match length, unless explicitly specified
                val lettersConsumed = candidate.rule.lettersConsumed ?: candidate.unconsumed.value.length
                if (lettersConsumed > 0) {
                    consumed += remaining.substring(0, lettersConsumed)
                    remaining = remaining.substring(lettersConsumed)
                    advanced = true
                    break
                }
                // else: output was emitted but the position is unchanged; try the next candidate
            }
            if (!advanced) {
                // no rule moved the position forward; call the lambda!
                val unmatched = onNoRuleMatch(remaining)
                remaining = unmatched.newWorkingInput
                out = unmatched.output(out)
            }
        }
        return out
    }
}