forked from guitarmind/tesseract-web-service
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tesseractserver.py
185 lines (151 loc) · 5.81 KB
/
tesseractserver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env python
import tornado.httpserver
import tornado.ioloop
import tornado.web
import optparse
import pprint
import Image
import StringIO
import os
import uuid
# for downloading PNG image from url
import urllib, cStringIO
import json
# C API wrapper
from tesseractcapi import TesseactWrapper
# Module-level state shared by the request handlers.  main() replaces lang
# (when given), libpath, tessdata and wrapper from the command-line options.
lang = "eng"                     # default language handed to the wrapper
libpath = tessdata = ""          # tesseract library path / tessdata folder
workingFolderPath = os.getcwd()  # current working directory at import time
wrapper = None                   # shared wrapper instance, created in main()
"""
Handles the GET/POST of image files to OCR result string.
"""
class FileUploadHandler(tornado.web.RequestHandler):
    """Serve an upload form on GET and OCR an uploaded image on POST.

    POST replies with the JSON object ``{'result': <text>}``.  A request
    that carries no file, or a file that cannot be opened as an image, is
    answered with HTTP 400 and ``{'error': <message>}``.
    """

    def get(self):
        """Write the HTML form that posts a single file to ``/upload``."""
        self.write('</pre>'+
                   '<form action="/upload" method="post" enctype="multipart/form-data">'+
                   '<input type="file" name="the_file" />'+
                   '<input type="submit" value="Submit" />'+'</form>'+
                   '<pre class="prettyprint">')

    def post(self):
        """Run OCR on the first uploaded file and write the result as JSON."""
        # No Content-Type is set by hand: writing a dict below makes Tornado
        # serialise it as JSON and set the header itself.

        # Take the first file of the first uploaded form field, whatever the
        # field is named; a request without any file is a client error.
        uploads = next(iter(self.request.files.values()), None)
        if not uploads:
            self._reject('no file uploaded')
            return
        try:
            tmpImg = Image.open(StringIO.StringIO(uploads[0]['body']))
        except IOError:
            self._reject('uploaded file is not a readable image')
            return

        # force resize to width=150px if the incoming image is too small for
        # better precision (height is scaled by the same ratio)
        targetWidth = 150
        width, height = tmpImg.size
        if width < targetWidth:
            ratio = float(targetWidth) / width
            newHeight = int(height * ratio)
            tmpImg = tmpImg.resize((targetWidth, newHeight), Image.ANTIALIAS)
            print("resize image to (" + str(targetWidth) + "," + str(newHeight) + ")")

        # The wrapper works on a file path, so the image is written to a
        # uniquely named PNG under <workingFolderPath>/static first.
        tmpFilePath = os.path.join(workingFolderPath, "static",
                                   str(uuid.uuid4()) + ".png")
        print("workingFolderPath:  " + workingFolderPath)
        print("tmpFilePath:  " + tmpFilePath)
        try:
            tmpImg.save(tmpFilePath)
            result = wrapper.imageFileToString(tmpFilePath)
        finally:
            # Remove the temp file even when saving or OCR raised.
            self.cleanup(tmpFilePath)

        # Text with spaces but no dot gets its spaces turned into dots;
        # otherwise the spaces are simply dropped.
        if "." not in result and " " in result:
            result = result.replace(" ", ".")
        else:
            result = result.replace(" ", "")

        # send response json
        response = {'result': result}
        self.write(response)
        print(response)

    def cleanup(self, filePath):
        """Delete ``filePath``; a file that cannot be removed is ignored."""
        try:
            os.remove(filePath)
        except OSError:
            # Best effort: the file may never have been created.
            pass

    def _reject(self, message):
        """Answer the request with HTTP 400 and a JSON error message."""
        self.set_status(400)
        self.write({'error': message})
"""
Handles the GET/POST of image url to OCR result string.
"""
class ImageUrlHandler(tornado.web.RequestHandler):
    """Serve a URL entry form on GET and OCR the image at a URL on POST.

    POST accepts either a multipart form with an ``imageUrl`` field or a
    JSON body of the form ``{"url": ...}`` and replies with the JSON object
    ``{'result': <text>, 'url': <url>}``.  A request without a usable URL
    is answered with HTTP 400 and ``{'error': <message>}``.
    """

    def initialize(self, *args, **kwargs):
        """Remember the request's Content-Type header (None when absent)."""
        self.contentType = self.request.headers.get('Content-Type')

    def get(self):
        """Write the HTML page holding the image-URL form."""
        # The literal's lines sit at column 0 so the emitted markup carries
        # no extra leading whitespace.
        html = """
<html>
<title>Tesseract Web Service</title>
<script src="//ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js"></script>
<body>
<h2>Tesseract Web Service</h2>
<form name="mainForm" id="mainForm" action="" method="POST" enctype="multipart/form-data">
Target image url: <input type="text" id="imageUrl" name="imageUrl" size="80" />
<input id="submitBtn" type="submit" value="Submit" />
</form>
<div id="result"></div>
</body>
</html>
"""
        self.write(html)

    def post(self):
        """OCR the image named by the request and write the result as JSON."""
        self.set_header("Content-Type", "application/json; charset=UTF-8")

        # The header may be missing entirely; treat that like a JSON request
        # instead of failing on a membership test against None.
        contentType = self.contentType or ""
        if "multipart/form-data" in contentType:
            url = self.get_argument("imageUrl", default=None, strip=False)
        else:
            # parse received json; anything that is not an object with a
            # "url" key counts as "no URL supplied"
            try:
                url = json.loads(self.request.body)["url"]
            except (ValueError, KeyError, TypeError):
                url = None

        if not url:
            self.set_status(400)
            self.write({'error': 'no image url given'})
            return

        # force resize to width=150px if the incoming image is too small for
        # better precision
        minWidth = 150

        # do OCR, get result string
        # NOTE(review): url is caller-supplied and presumably fetched
        # server-side by the wrapper -- consider restricting schemes/hosts.
        result = wrapper.imageUrlToString(url, minWidth)

        # Text with spaces but no dot gets its spaces turned into dots;
        # otherwise the spaces are simply dropped.
        if "." not in result and " " in result:
            result = result.replace(" ", ".")
        else:
            result = result.replace(" ", "")

        # send response json
        response = {'result': result, 'url': url}
        self.write(response)
        print(response)
# Application settings: static_path points Tornado at the "static" directory
# located alongside this file.
settings = dict(
    static_path=os.path.join(os.path.dirname(__file__), "static"),
)

# Routing table: URL pattern -> request handler class.
application = tornado.web.Application(
    [
        (r"/upload", FileUploadHandler),
        (r"/fetchurl", ImageUrlHandler),
    ],
    **settings
)
def main():
    """Parse the command line, build the shared wrapper and start serving.

    Options:
      -p/--port             listening port (integer, default 1688)
      -l/--lang             language passed to the wrapper (default "eng")
      -b/--lib-path         path of the tesseract library (required)
      -d/--tessdata-folder  path of the tessdata folder (required)

    A missing required option or a non-integer port makes optparse print a
    usage error and exit.  Otherwise this call blocks in the Tornado IO loop.
    """
    parser = optparse.OptionParser()
    # type='int' lets optparse reject a non-numeric port up front instead of
    # handing an arbitrary string to the HTTP server.
    parser.add_option('-p', '--port', dest='port', type='int', default=1688,
                      help='the listening port of RESTful tesseract web service. (default: 1688)')
    parser.add_option('-l', '--lang', dest='lang',
                      help='the targe language. (default: eng)')
    parser.add_option('-b', '--lib-path', dest='libPath',
                      help='the absolute path of tesseract library.')
    parser.add_option('-d', '--tessdata-folder', dest='tessdata',
                      help='the absolute path of tessdata folder containing language packs.')
    (options, args) = parser.parse_args()

    global lang
    global libpath
    global tessdata
    global wrapper

    if options.lang:  # keep the module default when no language is given
        lang = options.lang

    # parser.error() prints the message and exits, so no else branch is needed.
    if not options.libPath:
        parser.error('lib-path not given')
    libpath = options.libPath

    if not options.tessdata:
        parser.error('tessdata not given')
    tessdata = options.tessdata

    # create global wrapper instance for reuse
    wrapper = TesseactWrapper(lang, libpath, tessdata)

    port = options.port
    http_server = tornado.httpserver.HTTPServer(application)
    http_server.listen(port)
    print("Tesseract Web Service starts at port " + str(port) + ".")
    tornado.ioloop.IOLoop.instance().start()


if __name__ == "__main__":
    main()