Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/MisterAP/calibre
Browse files Browse the repository at this point in the history
  • Loading branch information
kovidgoyal committed Nov 19, 2024
2 parents c946ffa + 0071e85 commit 4f5b833
Showing 1 changed file with 65 additions and 25 deletions.
90 changes: 65 additions & 25 deletions src/calibre/ebooks/pdf/reflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1103,7 +1103,7 @@ def create_page_format(self, stats, opts):
# Do this before automatic actions
self.remove_head_foot_regex(opts)

def find_margins(self, tops, indents_odd, indents_even, line_spaces, bottoms, rights):
def find_margins(self, tops, indents, line_spaces, bottoms, rights):

#from collections import Counter

Expand Down Expand Up @@ -1138,10 +1138,7 @@ def find_margins(self, tops, indents_odd, indents_even, line_spaces, bottoms, ri

max_right = max(max_right, text.right)

if self.odd_even:
indents_odd[left] = indents_odd.get(left, 0) + 1
else:
indents_even[left] = indents_even.get(left, 0) + 1
indents[left] = indents.get(left, 0) + 1

if max_bot > 0:
bottoms[max_bot] = bottoms.get(max_bot, 0) + 1
Expand Down Expand Up @@ -1449,8 +1446,8 @@ def __init__(self, xml, opts, log):

# Work out document dimensions from page format
for page in self.pages:
page.find_margins(self.tops, self.indents_odd, self.indents_even, \
self.line_spaces, self.bottoms, self.rights)
page.find_margins(self.tops, self.indents_odd if page.odd_even else self.indents_even, \
self.line_spaces, self.bottoms, self.rights)

self.setup_stats()

Expand Down Expand Up @@ -1757,14 +1754,21 @@ def find_header_footer(self):
head_text = [''] * LINE_SCAN_COUNT
head_match = [0] * LINE_SCAN_COUNT
head_match1 = [0] * LINE_SCAN_COUNT
head_match2 = [0] * LINE_SCAN_COUNT
head_page = 0
head_skip = 0
foot_text = [''] * LINE_SCAN_COUNT
foot_match = [0] * LINE_SCAN_COUNT
foot_match1 = [0] * LINE_SCAN_COUNT
foot_match2 = [0] * LINE_SCAN_COUNT
foot_page = 0
foot_skip = 0
# xxx nn xxx nn or nn xxx or just roman numerals
pagenum_text = r'(.*\d+\s+\w+\s+\d+.*)|(\s*\d+\s+.*)|(^\s*[ivxlcIVXLC]+\s*$)'
# For line ending nn, is the preceding text constant
fixed_text = r'(^.+[^0-9])\d+\s*$'
fixed_head = ''
fixed_foot = ''

pages_to_scan = scan_count
# Note that a line may be in more than 1 part
Expand All @@ -1790,11 +1794,21 @@ def find_header_footer(self):
head_match[head_ind] += 1
if head_page == 0:
head_page = page.number
else: # Look for page count of format 'n xxx n'
if re.match(pagenum_text, t) is not None:
head_match1[head_ind] += 1
if head_page == 0:
head_page = page.number
elif re.match(pagenum_text, t) is not None:
# Look for page count of format 'n xxx n'
head_match1[head_ind] += 1
if head_page == 0:
head_page = page.number
else:
# Look for text of format 'constant nn'
f = re.match(fixed_text, t)
if f and f.group(1):
if not fixed_head:
fixed_head = f.group(1)
elif fixed_head == f.group(1):
head_match2[head_ind] += 1
if head_page == 0:
head_page = page.number

if self.opts.pdf_footer_skip < 0 \
and len(page.texts) > 0:
Expand All @@ -1813,11 +1827,21 @@ def find_header_footer(self):
foot_match[foot_ind] += 1
if foot_page == 0:
foot_page = page.number
else: # Look for page count of format 'n xxx n'
if re.match(pagenum_text, t) is not None:
foot_match1[foot_ind] += 1
if foot_page == 0:
foot_page = page.number
elif re.match(pagenum_text, t) is not None:
# Look for page count of format 'n xxx n'
foot_match1[foot_ind] += 1
if foot_page == 0:
foot_page = page.number
else:
# Look for text of format 'constant nn'
f = re.match(fixed_text, t)
if f and f.group(1):
if not fixed_foot:
fixed_foot = f.group(1)
elif fixed_foot == f.group(1):
foot_match2[foot_ind] += 1
if foot_page == 0:
foot_page = page.number

pages_to_scan -= 1
if pages_to_scan < 1:
Expand All @@ -1833,19 +1857,27 @@ def find_header_footer(self):

head_ind = 0
for i in range(LINE_SCAN_COUNT):
if head_match[i] > pages_to_scan or head_match1[i] > pages_to_scan:
if head_match[i] > pages_to_scan \
or head_match1[i] > pages_to_scan \
or head_match2[i] > pages_to_scan:
head_ind = i # Remember the last matching line
if self.pages[head_page].texts \
and (head_match[head_ind] > pages_to_scan or head_match1[head_ind] > pages_to_scan):
and (head_match[head_ind] > pages_to_scan \
or head_match1[head_ind] > pages_to_scan \
or head_match2[head_ind] > pages_to_scan):
t = self.pages[head_page].texts[head_ind]
head_skip = t.top + t.height + 1

foot_ind = 0
for i in range(LINE_SCAN_COUNT):
if foot_match[i] > pages_to_scan or foot_match1[i] > pages_to_scan:
if foot_match[i] > pages_to_scan \
or foot_match1[i] > pages_to_scan \
or foot_match2[i] > pages_to_scan:
foot_ind = i # Remember the last matching line
if self.pages[foot_page].texts \
and (foot_match[foot_ind] > pages_to_scan or foot_match1[foot_ind] > pages_to_scan):
and (foot_match[foot_ind] > pages_to_scan \
or foot_match1[foot_ind] > pages_to_scan \
or foot_match2[foot_ind] > pages_to_scan):
t = self.pages[foot_page].texts[-foot_ind-1]
foot_skip = t.top - 1

Expand Down Expand Up @@ -1884,6 +1916,8 @@ def merge_pages(self, idc):
save_bottom = 0
# After merge, skip to this page
pind = 0
# If a page is merged, and removed, may need to remember it
save_candidate = None

# Now merge where bottom of one is within ORPHAN_LINES lines of max_bottom
# and top of next is within a line of min_top
Expand All @@ -1892,7 +1926,8 @@ def merge_pages(self, idc):
while merge_done:
merge_done = False # A merge was done
merged_page = None # Page merged into previous
candidate = None # Lines close enough to the bottom that it might merge
candidate = save_candidate # Lines close enough to the bottom that it might merge
save_candidate = None
while pind < len(self.pages):
page = self.pages[pind]
stats_left = page.stats_left
Expand Down Expand Up @@ -1976,14 +2011,19 @@ def merge_pages(self, idc):
candidate.texts[-1].coalesce(merged_text, candidate.number, left_margin, right_margin)
merged_page.texts.remove(merged_text)
# Put back top/bottom after coalesce if final line
if save_bottom != 0.0 :
if save_bottom:
# Ignore top as that can confuse things where the 1st para of a page
# was merged with a previous. Keep the original top
candidate.texts[-1].bottom = save_bottom
#candidate.coalesce_paras()

# Have we removed everything from this page (well, all texts and images)
if merged_page.is_empty:
candidate.texts[-1].blank_line_before = 1
# Empty page does/may not actually mean blank line
#candidate.texts[-1].blank_line_before = 1
# If pages are merged, and the merged page gets removed (as here),
# and the next page is short (forced page break),
# then the merge would fail when this loop restarts.
save_candidate = candidate
self.pages.remove(merged_page)

def linearize(self):
Expand Down

0 comments on commit 4f5b833

Please sign in to comment.