error_analysis.py
from pathlib import Path
from lstm_word_segmentation.word_segmenter import pick_lstm_model
from lstm_word_segmentation.text_helpers import get_lines_of_text
from lstm_word_segmentation.line import Line
from lstm_word_segmentation.preprocess import evaluate_existing_algorithms
import deepcut
import timeit
# Analyzing how different word segmenters perform for a set of unsegmented lines
# '''
# Picking models for error analysis
word_segmenter1 = pick_lstm_model(model_name="Thai_graphclust_exclusive_model4_heavy", embedding="grapheme_clusters_tf",
                                  train_data="BEST", eval_data="BEST")
word_segmenter2 = pick_lstm_model(model_name="Thai_graphclust_model5_heavy", embedding="grapheme_clusters_tf",
                                  train_data="BEST", eval_data="BEST")
word_segmenter3 = pick_lstm_model(model_name="Thai_graphclust_model7_heavy", embedding="grapheme_clusters_tf",
                                  train_data="BEST", eval_data="BEST")
word_segmenter4 = pick_lstm_model(model_name="Thai_graphclust_exclusive_model4_heavy", embedding="grapheme_clusters_tf",
                                  train_data="exclusive BEST", eval_data="exclusive BEST")
word_segmenter5 = pick_lstm_model(model_name="Thai_codepoints_exclusive_model4_heavy", embedding="codepoints",
                                  train_data="exclusive BEST", eval_data="exclusive BEST")
word_segmenter6 = pick_lstm_model(model_name="Thai_codepoints_exclusive_model7_heavy", embedding="codepoints",
                                  train_data="exclusive BEST", eval_data="exclusive BEST")
word_segmenters = [word_segmenter1, word_segmenter2, word_segmenter3, word_segmenter4, word_segmenter5, word_segmenter6]
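# Each pick_lstm_model call above loads an already-trained LSTM segmenter by name, along
# with its embedding type (grapheme clusters or code points) and the data sets it was
# trained and evaluated on.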
# Show output of models
verbose = True
# Write the output of the models into the output file defined below
write = False
if write:
    output_file = Path.joinpath(Path(__file__).parent.absolute(), 'Data/wiki_thai_sample_exclusive_results.txt')
    output = open(str(output_file), 'w')
# Read the input lines from a file. The alternative is to define them inline below.
read = False
if read:
    file = Path.joinpath(Path(__file__).parent.absolute(), 'Data/wiki_thai_sample_exclusive.txt')
    lines = get_lines_of_text(file, "unsegmented")
else:
    inp_lines = ["|เพราะ|เขา|เห็น|โอกาส|ใน|การ|ซื้อ|", "|การ|เดินทาง|ใน|", "|นั่ง|นายก|ฯ|ต่อ|สมัย|หน้า|",
                 "|พร้อม|จัดตั้ง|",
                 "|เพราะ|ดนตรี|ที่|ชอบ|นั้น|"]
    # inp_lines = ["|ဖော်ပြ|ထားသည့်|", "|အသားအရောင်|အားဖြင့်|", "|သဘာဝ|အားဖြင့်|", "|ထို့ပြင်|", "|နိုင်ငံရေး|ဆိုင်ရာ|"]
    lines = []
    for inp_line in inp_lines:
        lines.append(Line(inp_line, "man_segmented"))
# Running different models
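# deepcut (a third-party neural Thai segmenter) returns a plain list of words; it is
# re-joined below into the same "|word|word|" format used by the other segmenters.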
for line in lines:
    deepcut_words = deepcut.tokenize(line.unsegmented)
    deepcut_segmented = "|"
    for word in deepcut_words:
        deepcut_segmented += word + "|"
    if verbose:
        print("***************************************************************************************************")
        print("{:<40} : {}".format("Unsegmented", line.unsegmented))
        print("{:<40} : {}".format("Deepcut", deepcut_segmented))
        print("{:<40} : {}".format("ICU", line.icu_segmented))
        for word_seg in word_segmenters:
            print("{:<40} : {}".format(word_seg.name, word_seg.segment_arbitrary_line(line.unsegmented)))
        print("***************************************************************************************************")
    if write:
        output.write("********************************************************************************\n")
        output.write("{:<40} : {}\n".format("Unsegmented", line.unsegmented))
        output.write("{:<40} : {}\n".format("Deepcut", deepcut_segmented))
        output.write("{:<40} : {}\n".format("ICU", line.icu_segmented))
        for word_seg in word_segmenters:
            output.write("{:<40} : {}\n".format(word_seg.name, word_seg.segment_arbitrary_line(line.unsegmented)))
        output.write("********************************************************************************\n")
if write:
    # Close the output file once all lines have been written.
    output.close()
# '''
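# A minimal sketch of one way to score the "|"-delimited outputs printed above against a
# manually segmented reference, by comparing word-boundary positions. These helpers are
# illustrations only, not part of the repo's API; they assume both strings use the same
# "|word|word|" convention as inp_lines above.
def segmentation_boundaries(segmented):
    # Character offsets (in the unsegmented text) at which a word ends.
    cuts = set()
    pos = 0
    for word in segmented.strip("|").split("|"):
        pos += len(word)
        cuts.add(pos)
    return cuts
def boundary_f1(candidate, reference):
    # F1 over boundary positions: a boundary counts as correct if both strings place it.
    cand = segmentation_boundaries(candidate)
    ref = segmentation_boundaries(reference)
    true_pos = len(cand & ref)
    if true_pos == 0:
        return 0.0
    precision = true_pos / len(cand)
    recall = true_pos / len(ref)
    return 2 * precision * recall / (precision + recall)
# Hypothetical usage (line.man_segmented is assumed to exist for manually segmented input):
# boundary_f1(deepcut_segmented, line.man_segmented)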
# Measuring the time that different models need to evaluate texts
# '''
word_segmenter1 = pick_lstm_model(model_name="Thai_codepoints_exclusive_model4_heavy", embedding="codepoints",
                                  train_data="exclusive BEST", eval_data="exclusive BEST")
word_segmenter2 = pick_lstm_model(model_name="Thai_graphclust_model5_heavy", embedding="grapheme_clusters_tf",
                                  train_data="BEST", eval_data="BEST")
word_segmenter3 = pick_lstm_model(model_name="Thai_graphclust_model7_heavy", embedding="grapheme_clusters_tf",
                                  train_data="BEST", eval_data="BEST")
word_segmenter4 = pick_lstm_model(model_name="Thai_codepoints_exclusive_model4_heavy", embedding="codepoints",
                                  train_data="exclusive BEST", eval_data="exclusive BEST")
word_segmenter5 = pick_lstm_model(model_name="Thai_codepoints_exclusive_model5_heavy", embedding="codepoints",
                                  train_data="exclusive BEST", eval_data="exclusive BEST")
word_segmenter6 = pick_lstm_model(model_name="Thai_codepoints_exclusive_model7_heavy", embedding="codepoints",
                                  train_data="exclusive BEST", eval_data="exclusive BEST")
word_segmenters = [word_segmenter1, word_segmenter2, word_segmenter3, word_segmenter4, word_segmenter5, word_segmenter6]
# Testing all models using the same BEST data: texts 40-60
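# (fast=True below is assumed to restrict evaluation to that smaller subset.)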
for word_seg in word_segmenters:
    start = timeit.default_timer()
    word_seg.test_model_line_by_line(verbose=False, fast=True)
    stop = timeit.default_timer()
    print("{} Time using test_model_line_by_line: {}".format(word_seg.name, stop - start))
# ICU
start = timeit.default_timer()
evaluate_existing_algorithms(algorithm="ICU", data="BEST", fast=True)
stop = timeit.default_timer()
print("ICU Time: {}".format(stop-start))
# Deepcut
start = timeit.default_timer()
evaluate_existing_algorithms(algorithm="Deepcut", data="BEST", fast=True)
stop = timeit.default_timer()
print("Deepcut Time: {}".format(stop-start))
# '''
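# A minimal sketch (not part of the original script) of how these one-shot timings could
# be made less noisy with timeit.repeat, re-using evaluate_existing_algorithms imported
# above; kept commented out because each run is expensive.
# times = timeit.repeat(
#     lambda: evaluate_existing_algorithms(algorithm="ICU", data="BEST", fast=True),
#     repeat=3, number=1)
# print("ICU best of 3 runs: {}".format(min(times)))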