Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/MisterAP/calibre
Browse files (browse the repository at this point in the history)
Fixes #2089436 [Error in reflowing of lines from PDFs](https://bugs.launchpad.net/calibre/+bug/2089436)
  • Loading branch information
kovidgoyal committed Nov 24, 2024
2 parents 728129e + 2996ec2 commit b2ec29e
Showing 1 changed file with 23 additions and 9 deletions.
32 changes: 23 additions & 9 deletions src/calibre/ebooks/pdf/reflow.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -830,16 +830,22 @@ def is_empty(self):
and len(self.imgs) == 0

def find_match(self, frag):
# We have not yet worked out the stats, specifically line_spacing
# Approximate the line spacing for checking overlapped lines
line_height = frag.bottom - frag.top
for t in self.texts:
if t is not frag :
# Do the parts of a line overlap?
# Some files can have separate lines overlapping slightly
# BOTTOM_FACTOR allows for this
if (frag.top == t.top or frag.bottom == t.bottom) \
or (frag.top < t.top and frag.bottom > t.top+BOTTOM_FACTOR) \
or (frag.top < t.top and frag.bottom+BOTTOM_FACTOR > t.bottom) \
or (t.top < frag.top and t.bottom > frag.top+BOTTOM_FACTOR) \
or (t.top < frag.top and t.bottom+BOTTOM_FACTOR > frag.bottom):
top = min(frag.top, t.top)
bot = max(frag.bottom, t.bottom)
if bot - top < line_height * 1.5 \
and ((frag.top == t.top or frag.bottom == t.bottom) \
or (frag.top < t.top and frag.bottom > t.top+BOTTOM_FACTOR) \
or (frag.top < t.top and frag.bottom+BOTTOM_FACTOR > t.bottom) \
or (t.top < frag.top and t.bottom > frag.top+BOTTOM_FACTOR) \
or (t.top < frag.top and t.bottom+BOTTOM_FACTOR > frag.bottom)):
return t # Force match if same line
# Sorting can put parts of a line in the wrong order if there are small chars
if t.left < frag.left:
Expand Down Expand Up @@ -1045,8 +1051,8 @@ def can_merge(self, first_text, second_text, stats):
# Should check for values approx the same, as with indents
frag.margin_left = int(round(((frag.left - left) / self.stats_margin_px)+0.5))
if last_frag is not None \
and frag.bottom - last_frag.bottom \
> stats.para_space*SECTION_FACTOR:
and stats.para_space > 0 \
and frag.bottom - last_frag.bottom > stats.para_space*SECTION_FACTOR:
#and frag.top - last_frag.bottom > frag.height + stats.line_space + (stats.line_space*LINE_FACTOR):
frag.blank_line_before = 1
last_frag = frag
Expand Down Expand Up @@ -1112,6 +1118,7 @@ def find_margins(self, tops, indents, line_spaces, bottoms, rights):
# The most used font will be treated as size 1em
max_bot = 0
max_right = 0
max_space = 0
last_top = 0
#last_bottom = 0
first = True
Expand All @@ -1132,6 +1139,8 @@ def find_margins(self, tops, indents, line_spaces, bottoms, rights):
# Beware of multiple text on same line. These look like small spacing
if text.height <= space:
line_spaces[space] = line_spaces.get(space, 0) + 1
elif not max_space:
max_space = space # Remember first in case of short docs

last_top = top
max_bot = max(max_bot, text.bottom)
Expand All @@ -1144,6 +1153,9 @@ def find_margins(self, tops, indents, line_spaces, bottoms, rights):
bottoms[max_bot] = bottoms.get(max_bot, 0) + 1
if max_right > 0:
rights[max_right] = rights.get(max_right, 0) + 1
if max_space > 0 and not line_spaces:
# Nothing has been set up, so create one to avoid empty array
line_spaces[max_space] = line_spaces.get(max_space, 0) + 1

return
#########################
Expand Down Expand Up @@ -1717,7 +1729,9 @@ def set_indents(indents, odd_even):
break
count -= 1

# For safety, check in the right order
# For safety, check present and in the right order
if not para_k or para_k == line_k:
para_k = round(line_k * PARA_FACTOR)
if line_k > para_k:
x = para_k
para_k = line_k
Expand All @@ -1726,7 +1740,7 @@ def set_indents(indents, odd_even):
self.stats.line_space = line_k
# Some docs have no great distinction for paragraphs
# Limit the size of the gap, or section breaks not found
if para_k > line_k * PARA_FACTOR:
if para_k > round(line_k * PARA_FACTOR):
self.stats.para_space = round(line_k * PARA_FACTOR)
else:
self.stats.para_space = para_k
Expand Down

0 comments on commit b2ec29e

Please sign in to comment.