diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index 0eadf3d..4ce4910 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -81,8 +81,13 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, # image must not have been cropped already, - # abort if no such image can be produced: - feature_filter='cropped') + # abort if no such image can be produced; + # moreover, for some reason, external binarization + # degrades Tesseract segmentation quality + # (probably because C_OUTLINE::ComputeEdgeOffsets, + # which needs the greyscale image, is more + # accurate than C_OUTLINE::ComputeBinaryOffsets): + feature_filter='cropped,binarized') if self.parameter['dpi'] > 0: dpi = self.parameter['dpi'] LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 1e41527..4ee1938 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -66,7 +66,13 @@ def process(self): page = pcgts.get_Page() page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id) + page, page_id, + # for some reason, external binarization + # degrades Tesseract segmentation quality + # (probably because C_OUTLINE::ComputeEdgeOffsets, + # which needs the greyscale image, is more + # accurate than C_OUTLINE::ComputeBinaryOffsets): + feature_filter='binarized') if self.parameter['dpi'] > 0: dpi = self.parameter['dpi'] LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) @@ -92,7 +98,13 @@ def process(self): LOG.warning('keeping existing TextLines in region "%s"', region.id) LOG.debug("Detecting lines in region '%s'", region.id) region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords) + region, page_image, page_coords, + # for some reason, external binarization + # degrades Tesseract segmentation quality + # (probably because C_OUTLINE::ComputeEdgeOffsets, + # which needs the greyscale image, is more + # accurate than C_OUTLINE::ComputeBinaryOffsets): + feature_filter='binarized') tessapi.SetImage(region_image) for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)): line_id = '%s_line%04d' % (region.id, line_no) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index f6a96be..300e3da 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -136,7 +136,13 @@ def process(self): LOG.warning('keeping existing ReadingOrder') page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id) + page, page_id, + # for some reason, external binarization + # degrades Tesseract segmentation quality + # (probably because C_OUTLINE::ComputeEdgeOffsets, + # which needs the greyscale image, is more + # accurate than C_OUTLINE::ComputeBinaryOffsets): + feature_filter='binarized') if self.parameter['dpi'] > 0: dpi = self.parameter['dpi'] LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index 98928c3..75ce4c8 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -78,7 +78,13 @@ def process(self): page = pcgts.get_Page() page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id) + page, page_id, + # for some reason, external binarization + # degrades Tesseract segmentation quality + # (probably because C_OUTLINE::ComputeEdgeOffsets, + # which needs the greyscale image, is more + # accurate than C_OUTLINE::ComputeBinaryOffsets): + feature_filter='binarized') if self.parameter['dpi'] > 0: dpi = self.parameter['dpi'] LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) @@ -124,12 +130,25 @@ def process(self): LOG.warning('keeping existing TextRegions in block "%s" of page "%s"', region.id, page_id) # get region image region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords) + region, page_image, page_coords, + # for some reason, external binarization + # degrades Tesseract segmentation quality + # (probably because C_OUTLINE::ComputeEdgeOffsets, + # which needs the greyscale image, is more + # accurate than C_OUTLINE::ComputeBinaryOffsets): + feature_filter='binarized') tessapi.SetImage(region_image) LOG.info("Detecting table cells in region '%s'", region.id) # # detect the region segments: - tessapi.SetPageSegMode(PSM.SPARSE_TEXT) # retrieve "cells" + tessapi.SetPageSegMode(PSM.SPARSE_TEXT_OSD) # retrieve "cells" + # FIXME: _OSD is necessary to get VERTICAL_TEXT (90°) blocks, but + # this also causes looking for vertical gaps/alignments everywhere + # (not just blocks that end up as vertical), so often cells + # will span more than 1 line and some text will even be missed! + # We should check whether some strokewidth params can influence this. + # Otherwise, Tesseract should become more consistent in deciding for + # vertically aligned blobs (either the whole block, or keep horizontal). # TODO: we should XY-cut the sparse cells in regroup them into consistent cells layout = tessapi.AnalyseLayout() roelem = reading_order.get(region.id)