DOCX Input: When some text has multiple footnotes, insert a space between the consecutive footnote numbers so that they are distinct. Fixes #2089433 [Separate several footnote/endnote references](https://bugs.launchpad.net/calibre/+bug/2089433)
kovidgoyal · Nov 24, 2024 · 2dacaf7 · 2dacaf7
1 parent 90b33c9
commit 2dacaf7
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 2 deletions.
diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py
@@ -113,11 +113,19 @@ def wrap_contents(tag_name, elem):
     elem.append(wrapper)
 
 
-def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
+def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath, uuid):
     # Apply vertical-align
     for span in root.xpath('//span[@data-docx-vert]'):
         wrap_contents(span.attrib.pop('data-docx-vert'), span)
 
+    for span in root.xpath(f'//*[@data-noteref-container="{uuid}"]'):
+        span.attrib.pop('data-noteref-container')
+        parent = span.getparent()
+        idx = parent.index(span)
+        if idx + 1 < len(parent) and (ns := parent[idx+1]) and hasattr(ns, 'get') and ns.get('data-noteref-container'):
+            if len(span) and not span[-1].tail:
+                span[-1].tail = '\xa0'
+
     # Move <hr>s outside paragraphs, if possible.
     pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
     for hr in root.xpath('//span/hr'):

diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
@@ -75,6 +75,7 @@ def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, n
         self.dest_dir = dest_dir or os.getcwd()
         self.mi = self.docx.metadata
         self.body = BODY()
+        self.uuid = uuid.uuid4().hex
         self.theme = Theme(self.namespace)
         self.settings = Settings(self.namespace)
         self.tables = Tables(self.namespace)
@@ -241,7 +242,7 @@ def __call__(self):
         self.fields.polish_markup(self.object_map)
 
         self.log.debug('Cleaning up redundant markup generated by Word')
-        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)
+        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath, self.uuid)
 
         return self.write(doc)
 
@@ -713,6 +714,7 @@ def convert_run(self, run):
                     l.set('role', 'doc-noteref')
                     text.add_elem(l)
                     ans.append(text.elem)
+                    ans.set('data-noteref-container', self.uuid)
             elif self.namespace.is_tag(child, 'w:tab'):
                 spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
                 text.add_elem(SPAN(NBSP * spaces))