Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 2089436. Error in reflowing lines from PDFs #2540

Merged
Merged 1 commit on Nov 24, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions src/calibre/ebooks/pdf/reflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -830,16 +830,22 @@ def is_empty(self):
and len(self.imgs) == 0

def find_match(self, frag):
# We have not yet worked out the stats, specifically line_spacing
# Approximate the line spacing for checking overlapped lines
line_height = frag.bottom - frag.top
for t in self.texts:
if t is not frag :
# Do the parts of a line overlap?
# Some files can have separate lines overlapping slightly
# BOTTOM_FACTOR allows for this
if (frag.top == t.top or frag.bottom == t.bottom) \
or (frag.top < t.top and frag.bottom > t.top+BOTTOM_FACTOR) \
or (frag.top < t.top and frag.bottom+BOTTOM_FACTOR > t.bottom) \
or (t.top < frag.top and t.bottom > frag.top+BOTTOM_FACTOR) \
or (t.top < frag.top and t.bottom+BOTTOM_FACTOR > frag.bottom):
top = min(frag.top, t.top)
bot = max(frag.bottom, t.bottom)
if bot - top < line_height * 1.5 \
and ((frag.top == t.top or frag.bottom == t.bottom) \
or (frag.top < t.top and frag.bottom > t.top+BOTTOM_FACTOR) \
or (frag.top < t.top and frag.bottom+BOTTOM_FACTOR > t.bottom) \
or (t.top < frag.top and t.bottom > frag.top+BOTTOM_FACTOR) \
or (t.top < frag.top and t.bottom+BOTTOM_FACTOR > frag.bottom)):
return t # Force match if same line
# Sorting can put parts of a line in the wrong order if there are small chars
if t.left < frag.left:
Expand Down Expand Up @@ -1045,8 +1051,8 @@ def can_merge(self, first_text, second_text, stats):
# Should check for values approx the same, as with indents
frag.margin_left = int(round(((frag.left - left) / self.stats_margin_px)+0.5))
if last_frag is not None \
and frag.bottom - last_frag.bottom \
> stats.para_space*SECTION_FACTOR:
and stats.para_space > 0 \
and frag.bottom - last_frag.bottom > stats.para_space*SECTION_FACTOR:
#and frag.top - last_frag.bottom > frag.height + stats.line_space + (stats.line_space*LINE_FACTOR):
frag.blank_line_before = 1
last_frag = frag
Expand Down Expand Up @@ -1112,6 +1118,7 @@ def find_margins(self, tops, indents, line_spaces, bottoms, rights):
# The most used font will be treated as size 1em
max_bot = 0
max_right = 0
max_space = 0
last_top = 0
#last_bottom = 0
first = True
Expand All @@ -1132,6 +1139,8 @@ def find_margins(self, tops, indents, line_spaces, bottoms, rights):
# Beware of multiple text on same line. These look like small spacing
if text.height <= space:
line_spaces[space] = line_spaces.get(space, 0) + 1
elif not max_space:
max_space = space # Remember first in case of short docs

last_top = top
max_bot = max(max_bot, text.bottom)
Expand All @@ -1144,6 +1153,9 @@ def find_margins(self, tops, indents, line_spaces, bottoms, rights):
bottoms[max_bot] = bottoms.get(max_bot, 0) + 1
if max_right > 0:
rights[max_right] = rights.get(max_right, 0) + 1
if max_space > 0 and not line_spaces:
# Nothing has been set up, so create one to avoid empty array
line_spaces[max_space] = line_spaces.get(max_space, 0) + 1

return
#########################
Expand Down Expand Up @@ -1717,7 +1729,9 @@ def set_indents(indents, odd_even):
break
count -= 1

# For safety, check in the right order
# For safety, check present and in the right order
if not para_k or para_k == line_k:
para_k = round(line_k * PARA_FACTOR)
if line_k > para_k:
x = para_k
para_k = line_k
Expand All @@ -1726,7 +1740,7 @@ def set_indents(indents, odd_even):
self.stats.line_space = line_k
# Some docs have no great distinction for paragraphs
# Limit the size of the gap, or section breaks not found
if para_k > line_k * PARA_FACTOR:
if para_k > round(line_k * PARA_FACTOR):
self.stats.para_space = round(line_k * PARA_FACTOR)
else:
self.stats.para_space = para_k
Expand Down
Loading