OCR-D · bertsky · Aug 24, 2020 · Aug 24, 2020 · Oct 1, 2020
diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py
@@ -81,8 +81,13 @@ def process(self):
                 page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                     page, page_id,
                     # image must not have been cropped already,
-                    # abort if no such image can be produced:
-                    feature_filter='cropped')
+                    # abort if no such image can be produced;
+                    # moreover, for some reason, external binarization
+                    # degrades Tesseract segmentation quality
+                    # (probably because C_OUTLINE::ComputeEdgeOffsets,
+                    #  which needs the greyscale image, is more
+                    #  accurate than C_OUTLINE::ComputeBinaryOffsets):
+                    feature_filter='cropped,binarized')
                 if self.parameter['dpi'] > 0:
                     dpi = self.parameter['dpi']
                     LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)

diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py
@@ -66,7 +66,13 @@ def process(self):
                 page = pcgts.get_Page()
 
                 page_image, page_coords, page_image_info = self.workspace.image_from_page(
-                    page, page_id)
+                    page, page_id,
+                    # for some reason, external binarization
+                    # degrades Tesseract segmentation quality
+                    # (probably because C_OUTLINE::ComputeEdgeOffsets,
+                    #  which needs the greyscale image, is more
+                    #  accurate than C_OUTLINE::ComputeBinaryOffsets):
+                    feature_filter='binarized')
                 if self.parameter['dpi'] > 0:
                     dpi = self.parameter['dpi']
                     LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
@@ -92,7 +98,13 @@ def process(self):
                             LOG.warning('keeping existing TextLines in region "%s"', region.id)
                     LOG.debug("Detecting lines in region '%s'", region.id)
                     region_image, region_coords = self.workspace.image_from_segment(
-                        region, page_image, page_coords)
+                        region, page_image, page_coords,
+                        # for some reason, external binarization
+                        # degrades Tesseract segmentation quality
+                        # (probably because C_OUTLINE::ComputeEdgeOffsets,
+                        #  which needs the greyscale image, is more
+                        #  accurate than C_OUTLINE::ComputeBinaryOffsets):
+                        feature_filter='binarized')
                     tessapi.SetImage(region_image)
                     for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)):
                         line_id = '%s_line%04d' % (region.id, line_no)

diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py
@@ -136,7 +136,13 @@ def process(self):
                         LOG.warning('keeping existing ReadingOrder')
 
                 page_image, page_coords, page_image_info = self.workspace.image_from_page(
-                    page, page_id)
+                    page, page_id,
+                    # for some reason, external binarization
+                    # degrades Tesseract segmentation quality
+                    # (probably because C_OUTLINE::ComputeEdgeOffsets,
+                    #  which needs the greyscale image, is more
+                    #  accurate than C_OUTLINE::ComputeBinaryOffsets):
+                    feature_filter='binarized')
                 if self.parameter['dpi'] > 0:
                     dpi = self.parameter['dpi']
                     LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)

diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py
@@ -78,7 +78,13 @@ def process(self):
                 page = pcgts.get_Page()
 
                 page_image, page_coords, page_image_info = self.workspace.image_from_page(
-                    page, page_id)
+                    page, page_id,
+                    # for some reason, external binarization
+                    # degrades Tesseract segmentation quality
+                    # (probably because C_OUTLINE::ComputeEdgeOffsets,
+                    #  which needs the greyscale image, is more
+                    #  accurate than C_OUTLINE::ComputeBinaryOffsets):
+                    feature_filter='binarized')
                 if self.parameter['dpi'] > 0:
                     dpi = self.parameter['dpi']
                     LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
@@ -124,12 +130,25 @@ def process(self):
                             LOG.warning('keeping existing TextRegions in block "%s" of page "%s"', region.id, page_id)
                     # get region image
                     region_image, region_coords = self.workspace.image_from_segment(
-                        region, page_image, page_coords)
+                        region, page_image, page_coords,
+                        # for some reason, external binarization
+                        # degrades Tesseract segmentation quality
+                        # (probably because C_OUTLINE::ComputeEdgeOffsets,
+                        #  which needs the greyscale image, is more
+                        #  accurate than C_OUTLINE::ComputeBinaryOffsets):
+                        feature_filter='binarized')
                     tessapi.SetImage(region_image)
                     LOG.info("Detecting table cells in region '%s'", region.id)
                     #
                     # detect the region segments:
-                    tessapi.SetPageSegMode(PSM.SPARSE_TEXT) # retrieve "cells"
+                    tessapi.SetPageSegMode(PSM.SPARSE_TEXT_OSD) # retrieve "cells"
+                    # FIXME: _OSD is necessary to get VERTICAL_TEXT (90°) blocks, but
+                    #        it also makes Tesseract look for vertical gaps/alignments everywhere
+                    #        (not just in blocks that end up as vertical), so cells often
+                    #        span more than 1 line and some text is even missed!
+                    #        We should check whether some strokewidth params can influence this.
+                    #        Otherwise, Tesseract should become more consistent in deciding on
+                    #        vertically aligned blobs (either the whole block, or keep horizontal).
                     # TODO: we should XY-cut the sparse cells in regroup them into consistent cells
                     layout = tessapi.AnalyseLayout()
                     roelem = reading_order.get(region.id)