Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additional header/footer checks #2528

Merged
merged 1 commit into from
Nov 19, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 65 additions & 25 deletions src/calibre/ebooks/pdf/reflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1103,7 +1103,7 @@ def create_page_format(self, stats, opts):
# Do this before automatic actions
self.remove_head_foot_regex(opts)

def find_margins(self, tops, indents_odd, indents_even, line_spaces, bottoms, rights):
def find_margins(self, tops, indents, line_spaces, bottoms, rights):

#from collections import Counter

Expand Down Expand Up @@ -1138,10 +1138,7 @@ def find_margins(self, tops, indents_odd, indents_even, line_spaces, bottoms, ri

max_right = max(max_right, text.right)

if self.odd_even:
indents_odd[left] = indents_odd.get(left, 0) + 1
else:
indents_even[left] = indents_even.get(left, 0) + 1
indents[left] = indents.get(left, 0) + 1

if max_bot > 0:
bottoms[max_bot] = bottoms.get(max_bot, 0) + 1
Expand Down Expand Up @@ -1449,8 +1446,8 @@ def __init__(self, xml, opts, log):

# Work out document dimensions from page format
for page in self.pages:
page.find_margins(self.tops, self.indents_odd, self.indents_even, \
self.line_spaces, self.bottoms, self.rights)
page.find_margins(self.tops, self.indents_odd if page.odd_even else self.indents_even, \
self.line_spaces, self.bottoms, self.rights)

self.setup_stats()

Expand Down Expand Up @@ -1757,14 +1754,21 @@ def find_header_footer(self):
head_text = [''] * LINE_SCAN_COUNT
head_match = [0] * LINE_SCAN_COUNT
head_match1 = [0] * LINE_SCAN_COUNT
head_match2 = [0] * LINE_SCAN_COUNT
head_page = 0
head_skip = 0
foot_text = [''] * LINE_SCAN_COUNT
foot_match = [0] * LINE_SCAN_COUNT
foot_match1 = [0] * LINE_SCAN_COUNT
foot_match2 = [0] * LINE_SCAN_COUNT
foot_page = 0
foot_skip = 0
# xxx nn xxx nn or nn xxx or just roman numerals
pagenum_text = r'(.*\d+\s+\w+\s+\d+.*)|(\s*\d+\s+.*)|(^\s*[ivxlcIVXLC]+\s*$)'
# For a line ending in nn, is the preceding text constant?
fixed_text = r'(^.+[^0-9])\d+\s*$'
fixed_head = ''
fixed_foot = ''

pages_to_scan = scan_count
# Note that a line may be in more than 1 part
Expand All @@ -1790,11 +1794,21 @@ def find_header_footer(self):
head_match[head_ind] += 1
if head_page == 0:
head_page = page.number
else: # Look for page count of format 'n xxx n'
if re.match(pagenum_text, t) is not None:
head_match1[head_ind] += 1
if head_page == 0:
head_page = page.number
elif re.match(pagenum_text, t) is not None:
# Look for page count of format 'n xxx n'
head_match1[head_ind] += 1
if head_page == 0:
head_page = page.number
else:
# Look for text of format 'constant nn'
f = re.match(fixed_text, t)
if f and f.group(1):
if not fixed_head:
fixed_head = f.group(1)
elif fixed_head == f.group(1):
head_match2[head_ind] += 1
if head_page == 0:
head_page = page.number

if self.opts.pdf_footer_skip < 0 \
and len(page.texts) > 0:
Expand All @@ -1813,11 +1827,21 @@ def find_header_footer(self):
foot_match[foot_ind] += 1
if foot_page == 0:
foot_page = page.number
else: # Look for page count of format 'n xxx n'
if re.match(pagenum_text, t) is not None:
foot_match1[foot_ind] += 1
if foot_page == 0:
foot_page = page.number
elif re.match(pagenum_text, t) is not None:
# Look for page count of format 'n xxx n'
foot_match1[foot_ind] += 1
if foot_page == 0:
foot_page = page.number
else:
# Look for text of format 'constant nn'
f = re.match(fixed_text, t)
if f and f.group(1):
if not fixed_foot:
fixed_foot = f.group(1)
elif fixed_foot == f.group(1):
foot_match2[foot_ind] += 1
if foot_page == 0:
foot_page = page.number

pages_to_scan -= 1
if pages_to_scan < 1:
Expand All @@ -1833,19 +1857,27 @@ def find_header_footer(self):

head_ind = 0
for i in range(LINE_SCAN_COUNT):
if head_match[i] > pages_to_scan or head_match1[i] > pages_to_scan:
if head_match[i] > pages_to_scan \
or head_match1[i] > pages_to_scan \
or head_match2[i] > pages_to_scan:
head_ind = i # Remember the last matching line
if self.pages[head_page].texts \
and (head_match[head_ind] > pages_to_scan or head_match1[head_ind] > pages_to_scan):
and (head_match[head_ind] > pages_to_scan \
or head_match1[head_ind] > pages_to_scan \
or head_match2[head_ind] > pages_to_scan):
t = self.pages[head_page].texts[head_ind]
head_skip = t.top + t.height + 1

foot_ind = 0
for i in range(LINE_SCAN_COUNT):
if foot_match[i] > pages_to_scan or foot_match1[i] > pages_to_scan:
if foot_match[i] > pages_to_scan \
or foot_match1[i] > pages_to_scan \
or foot_match2[i] > pages_to_scan:
foot_ind = i # Remember the last matching line
if self.pages[foot_page].texts \
and (foot_match[foot_ind] > pages_to_scan or foot_match1[foot_ind] > pages_to_scan):
and (foot_match[foot_ind] > pages_to_scan \
or foot_match1[foot_ind] > pages_to_scan \
or foot_match2[foot_ind] > pages_to_scan):
t = self.pages[foot_page].texts[-foot_ind-1]
foot_skip = t.top - 1

Expand Down Expand Up @@ -1884,6 +1916,8 @@ def merge_pages(self, idc):
save_bottom = 0
# After merge, skip to this page
pind = 0
# If a page is merged into the candidate and then removed, we may need to remember the candidate
save_candidate = None

# Now merge where bottom of one is within ORPHAN_LINES lines of max_bottom
# and top of next is within a line of min_top
Expand All @@ -1892,7 +1926,8 @@ def merge_pages(self, idc):
while merge_done:
merge_done = False # A merge was done
merged_page = None # Page merged into previous
candidate = None # Lines close enough to the bottom that it might merge
candidate = save_candidate # Lines close enough to the bottom that it might merge
save_candidate = None
while pind < len(self.pages):
page = self.pages[pind]
stats_left = page.stats_left
Expand Down Expand Up @@ -1976,14 +2011,19 @@ def merge_pages(self, idc):
candidate.texts[-1].coalesce(merged_text, candidate.number, left_margin, right_margin)
merged_page.texts.remove(merged_text)
# Put back top/bottom after coalesce if final line
if save_bottom != 0.0 :
if save_bottom:
# Ignore top as that can confuse things where the 1st para of a page
# was merged with a previous. Keep the original top
candidate.texts[-1].bottom = save_bottom
#candidate.coalesce_paras()

# Have we removed everything from this page (well, all texts and images)
if merged_page.is_empty:
candidate.texts[-1].blank_line_before = 1
# An empty page does not necessarily mean a blank line
#candidate.texts[-1].blank_line_before = 1
# If pages are merged, and the merged page gets removed (as here),
# and the next page is short (forced page break),
# then the merge would fail when this loop restarts.
save_candidate = candidate
self.pages.remove(merged_page)

def linearize(self):
Expand Down
Loading