-
Notifications
You must be signed in to change notification settings - Fork 17
/
extract.py
executable file
·58 lines (48 loc) · 1.46 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Autoscrape Extractor - A wrapper around Hext for
walking a directory and extracting all structured
data using a provided Hext template.
Hext templates can be created using the JavaScript
UI found in ./hext_builder_ui.
Usage:
extract.py <input-directory> <hext-template> [options]
Options:
--output-file FILENAME
By default, all output will be printed to stdout.
This option directs all output to a specified file.
"""
from docopt import docopt
import html5lib
import hext
def parse_html_file(filepath):
    """Read the file at ``filepath`` and parse its contents with html5lib.

    The whole file is read as text and handed to ``html5lib.parse`` using
    the ``lxml`` tree builder with HTML element namespacing turned off.
    The value returned by ``html5lib.parse`` is returned unchanged.

    NOTE(review): the file is opened in text mode without an explicit
    encoding, so decoding follows the interpreter's default for ``open``.
    """
    parse_options = {
        "treebuilder": "lxml",
        "namespaceHTMLElements": False,
    }
    with open(filepath, "r") as handle:
        markup = handle.read()
    return html5lib.parse(markup, **parse_options)
def normalize_arg_name(key):
    """Turn a docopt key into a plain, identifier-style argument name.

    Leading dashes of options (``--output-file``) and the angle brackets
    of positional arguments (``<input-directory>``) are removed, and any
    remaining dashes become underscores.

    The previous inline code sliced off the first two characters of every
    key, which mangled positional arguments (``<input-directory>`` became
    ``nput_directory``).
    """
    return (
        key.lstrip("-")
        .replace("<", "")
        .replace(">", "")
        .replace("-", "_")
    )


def main():
    """Run the extractor from the command line.

    Parses the command line with docopt (using the module docstring),
    loads the Hext template, walks the input directory, and runs every
    file found through the template. Each extracted record is written as
    one JSON object per line, to stdout by default or to the file named
    by ``--output-file``.
    """
    # Imported here so the script body stays self-contained.
    import json
    import os
    import sys

    docopt_args = docopt(__doc__)

    # Detect an optional sub-command. The usage string currently declares
    # none, so ``command`` stays None in practice and extraction is the
    # default action.
    command = None
    if "build-template" in docopt_args:
        command = "build-template"
        docopt_args.pop("build-template")
    elif "extract" in docopt_args:
        command = "extract"
        docopt_args.pop("extract")

    # Strip the --, convert - to _, remove <>. A separate loop variable is
    # used so the detected command is not overwritten by the last key.
    args = {
        normalize_arg_name(key): value
        for key, value in docopt_args.items()
    }

    if command not in (None, "extract"):
        # No other command is implemented.
        return

    with open(args["hext_template"], "r") as template_file:
        rule = hext.Rule(template_file.read())

    output_path = args.get("output_file")
    out = open(output_path, "w") if output_path else sys.stdout
    try:
        for dirpath, dirnames, filenames in os.walk(args["input_directory"]):
            # Sort in place so the traversal (and output) order is stable.
            dirnames.sort()
            for filename in sorted(filenames):
                filepath = os.path.join(dirpath, filename)
                # Undecodable bytes are replaced rather than aborting the
                # whole run on a single odd file.
                with open(
                    filepath, "r", encoding="utf-8", errors="replace"
                ) as html_file:
                    document = hext.Html(html_file.read())
                for record in rule.extract(document):
                    out.write(json.dumps(record) + "\n")
    finally:
        if out is not sys.stdout:
            out.close()


if __name__ == "__main__":
    main()