-
Notifications
You must be signed in to change notification settings - Fork 2
/
odtxmlparser.py
executable file
·88 lines (79 loc) · 3.05 KB
/
odtxmlparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/python3
# -*-coding:Utf-8 -*
#Deus, in adjutorium meum intende
"""This module parses
an XML file exported through the LibreOffice GUI
from an .odt file."""
from dataswitcher import finput
import polyglot.detect as polydet
import xml.etree.ElementTree as eltree
import lxml.etree
import unittest.mock as mock
# Shared lxml parser used by parse_data. recover=True asks lxml to try hard
# to keep parsing through malformed XML rather than stop at the first error.
no_error_parser = lxml.etree.XMLParser(recover = True)
def _detect_language(string):
    """Return the language code of ``string``, asking the user when unsure.

    polyglot's ``Detector`` is tried first. When it reports itself reliable
    with a confidence of at least 90, the code of its top guess is returned
    unchanged, so the result may be a code other than 'la' or 'fr'.

    In every other case (low confidence, unreliable detection, or
    ``UnknownLanguage`` raised by polyglot) the result is 'la' or 'fr':

    * a passage containing 'En lune de...' is French;
    * a passage containing 'Die' and exactly two '†' is Latin;
    * otherwise the passage is printed and the user is prompted through
      ``finput`` until the answer is 'la' or 'fr'.

    The second argument given to ``finput`` is a suggestion: the detector's
    top guess when it is 'la' or 'fr', else 'fr' when the passage contains
    a typically French word (saint, vierge, ...), else 'la'.
    NOTE(review): presumably ``finput`` offers it as a default answer;
    confirm in ``dataswitcher``.
    """
    french_hints = {'saint', 'sainte', 'saintes', 'saints',
                    'confesseur', 'vierge', 'vierges'}

    try:
        detector = polydet.Detector(string)
    except polydet.base.UnknownLanguage:
        # No detection result at all: represent that explicitly instead of
        # faking a detector object, and fall through to the manual path.
        detector = None

    if (detector is not None
            and detector.reliable
            and detector.language.confidence >= 90):
        return detector.language.code

    # Markers found in the source document settle the question
    # without bothering the user.
    if 'En lune de...' in string:
        return 'fr'
    if 'Die' in string and string.count('†') == 2:
        return 'la'

    # The suggestion does not depend on the user's answers, so it is
    # computed once, before the prompt loop.
    # NOTE(review): only the detector's top guess is consulted; the other
    # candidates in detector.languages are ignored.
    top_code = detector.language.code if detector is not None else None
    if top_code in ('la', 'fr'):
        suggestion = top_code
    elif french_hints.intersection(string.lower().split()):
        suggestion = 'fr'
    else:
        suggestion = 'la'

    answer = ''
    while answer not in ('la', 'fr'):
        print(string)
        answer = finput("Quelle est la langue de ce passage ? fr/la\n", suggestion)
    return answer
def _get_children(root, martyrology):
    """Walk the XML tree below ``root`` and collect its text passages.

    Recursive, depth first, in document order:

    * a ``text`` element has its content handed to ``_process_text``
      (an element without content is skipped);
    * ``infos``, ``Text``, ``LineBreak`` and ``Special`` elements are
      ignored together with everything below them;
    * any other element is only a container: each of its children is
      visited in turn.

    ``martyrology`` is the dict being filled; it is returned after the
    subtree has been processed.
    """
    if root.tag == "text":
        # An empty element has text None, which cannot be language-detected.
        if root.text is None:
            return martyrology
        return _process_text(root.text, martyrology)
    if root.tag in ('infos', 'Text', 'LineBreak', 'Special'):
        return martyrology
    for child in root:
        # Recurse into the child, not into root again: re-entering root
        # would never terminate and would never reach the children.
        martyrology = _get_children(child, martyrology)
    return martyrology
def _process_text(text, martyrology):
    """File one passage of text under its language in ``martyrology``.

    The language of ``text`` is obtained from ``_detect_language``.
    For Latin ('la') a passage containing '† Die ' opens a new day; for
    French ('fr') a passage containing 'En lune de...' does. Opening a day
    appends an empty string to the list of that language. The passage is
    then concatenated to the last entry of that list.

    A passage in any other language is left out. A Latin or French passage
    met before any day has been opened raises IndexError, since there is no
    last entry to extend.

    The (mutated) ``martyrology`` dict is returned.
    """
    # Text marking the first passage of a day, per language code.
    day_openers = {'la': '† Die ', 'fr': 'En lune de...'}

    language = _detect_language(text)
    opener = day_openers.get(language)
    if opener is None:
        return martyrology

    days = martyrology[language]
    if opener in text:
        days.append('')
    days[-1] += text
    return martyrology
def parse_data(path):
    """Parse the XML file at ``path`` into a martyrology dict.

    The file is read with the module-level ``no_error_parser`` and its tree
    is walked by ``_get_children``.

    Returns a dict with two keys, 'la' (Latin) and 'fr' (French), each
    holding a list of strings, one per day. According to the original
    author's note, index 0 of each list is the 28th of November.
    """
    document_root = eltree.parse(path, parser=no_error_parser).getroot()
    return _get_children(document_root, {'la': [], 'fr': []})