feat: add parse_fragment and create_tag (#134)

* feat: add parse_fragment and create_tag * refactor: markdown styling * refactor: elaborate docs * refactor: remove lexbor_ prefix
rushter · Oct 22, 2024 · de807b5 · de807b5
1 parent 279c526
commit de807b5
Show file tree

Hide file tree

Showing 9 changed files with 381 additions and 1 deletion.
diff --git a/examples/walkthrough.ipynb b/examples/walkthrough.ipynb
@@ -6,7 +6,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from selectolax.parser import HTMLParser"
+    "from selectolax.parser import HTMLParser, parse_fragment, create_tag"
    ]
   },
   {
@@ -29,9 +29,49 @@
     "        <p id='stext'>Lorem ipsum dolor sit amet, ea quo modus meliore platonem.</p>\n",
     "    </div>\n",
     "</body>\n",
+    "\"\"\"\n",
+    "\n",
+    "fragment = \"\"\"\n",
+    "<div>\n",
+    "    <p class=\"p3\">\n",
+    "        Hello there!\n",
+    "    </p>\n",
+    "</div>\n",
+    "<script>\n",
+    "    document.querySelector(\".p3\").addEventListener(\"click\", () => { ... });\n",
+    "</script>\n",
     "\"\"\""
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Parsing HTML\n",
+    "\n",
+    "There are 3 ways to create or parse objects in Selectolax:\n",
+    "1. Parse HTML as a full document using `HTMLParser()`\n",
+    "2. Parse HTML as a fragment using `parse_fragment()`\n",
+    "3. Create a single node using `create_tag()`\n",
+    "\n",
+    "- `HTMLParser()` - This returns the HTML tree as parsed by Modest / Lexbor, unmodified. The HTML is assumed to be a full document. `<html>`, `<head>`, and `<body>` tags are added if missing.\n",
+    "\n",
+    "- `parse_fragment()` - Intended for HTML fragments / partials. Returns a list of Nodes. The given HTML doesn't need to contain `<html>`, `<head>`, or `<body>`, and it can have multiple root elements.\n",
+    "\n",
+    "- `create_tag()` - Create a single empty node for the given tag."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "html_tree = HTMLParser(html)\n",
+    "frag_tree = parse_fragment(fragment)\n",
+    "node = create_tag(\"div\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/selectolax/lexbor.pyi b/selectolax/lexbor.pyi
@@ -121,3 +121,20 @@ class LexborHTMLParser:
     def css_matches(self, selector: str) -> bool: ...
     def clone(self) -> "LexborHTMLParser": ...
     def unwrap_tags(self, tags: list[str]) -> None: ...
+
+def create_tag(tag: str) -> "LexborNode":
+    """
+    Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,
+    e.g. `"<div></div>"`.
+
+    Raises `ValueError` if `tag` is empty.
+    """
+    ...
+
+def parse_fragment(html: str) -> list["LexborNode"]:
+    """
+    Given HTML, parse it into a list of Nodes, such that the nodes
+    correspond to the given HTML.
+
+    For contrast, LexborHTMLParser adds `<html>`, `<head>`, and `<body>` tags
+    if they are missing. This function does not add these tags.
+
+    Leading and trailing whitespace in `html` is stripped before parsing.
+    If `html` itself contains `<html>`, `<head>`, or `<body>` tags, the
+    matching nodes are what the returned list holds.
+    """
+    ...
diff --git a/selectolax/lexbor.pyx b/selectolax/lexbor.pyx
@@ -7,6 +7,7 @@ include "utils.pxi"
 include "lexbor/attrs.pxi"
 include "lexbor/node.pxi"
 include "lexbor/selection.pxi"
+include "lexbor/util.pxi"
 
 # We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
 

diff --git a/selectolax/lexbor/util.pxi b/selectolax/lexbor/util.pxi
@@ -0,0 +1,19 @@
+include "../utils.pxi"
+
+def create_tag(tag: str):
+    """
+    Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,
+    e.g. `"<div></div>"`.
+
+    Raises `ValueError` if `tag` is empty.
+    """
+    # Thin wrapper: binds the shared implementation to the Lexbor parser class.
+    return do_create_tag(tag, LexborHTMLParser)
+
+
+def parse_fragment(html: str):
+    """
+    Given HTML, parse it into a list of Nodes, such that the nodes
+    correspond to the given HTML.
+
+    For contrast, LexborHTMLParser adds `<html>`, `<head>`, and `<body>` tags
+    if they are missing. This function does not add these tags.
+
+    Leading and trailing whitespace in `html` is stripped before parsing.
+    If `html` itself contains `<html>`, `<head>`, or `<body>` tags, the
+    matching nodes are what the returned list holds.
+    """
+    # Thin wrapper: binds the shared implementation to the Lexbor parser class.
+    return do_parse_fragment(html, LexborHTMLParser)
diff --git a/selectolax/modest/util.pxi b/selectolax/modest/util.pxi
@@ -0,0 +1,19 @@
+include "../utils.pxi"
+
+def create_tag(tag: str):
+    """
+    Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,
+    e.g. `"<div></div>"`.
+
+    Raises `ValueError` if `tag` is empty.
+    """
+    # Thin wrapper: binds the shared implementation to the Modest parser class.
+    return do_create_tag(tag, HTMLParser)
+
+
+def parse_fragment(html: str):
+    """
+    Given HTML, parse it into a list of Nodes, such that the nodes
+    correspond to the given HTML.
+
+    For contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
+    if they are missing. This function does not add these tags.
+
+    Leading and trailing whitespace in `html` is stripped before parsing.
+    If `html` itself contains `<html>`, `<head>`, or `<body>` tags, the
+    matching nodes are what the returned list holds.
+    """
+    # Thin wrapper: binds the shared implementation to the Modest parser class.
+    return do_parse_fragment(html, HTMLParser)
diff --git a/selectolax/parser.pyi b/selectolax/parser.pyi
@@ -279,3 +279,20 @@ class HTMLParser:
 
         This is useful for text extraction."""
         ...
+
+def create_tag(tag: str) -> "Node":
+    """
+    Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag,
+    e.g. `"<div></div>"`.
+
+    Raises `ValueError` if `tag` is empty.
+    """
+    ...
+
+def parse_fragment(html: str) -> list["Node"]:
+    """
+    Given HTML, parse it into a list of Nodes, such that the nodes
+    correspond to the given HTML.
+
+    For contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
+    if they are missing. This function does not add these tags.
+
+    Leading and trailing whitespace in `html` is stripped before parsing.
+    If `html` itself contains `<html>`, `<head>`, or `<body>` tags, the
+    matching nodes are what the returned list holds.
+    """
+    ...
diff --git a/selectolax/parser.pyx b/selectolax/parser.pyx
@@ -3,6 +3,7 @@ from cpython cimport bool
 
 include "modest/selection.pxi"
 include "modest/node.pxi"
+include "modest/util.pxi"
 include "utils.pxi"
 
 cdef class HTMLParser:

diff --git a/selectolax/utils.pxi b/selectolax/utils.pxi
@@ -1,5 +1,11 @@
+from typing import Literal, Optional, Union, Type
+
 MAX_HTML_INPUT_SIZE = 250e+7
 
+ParserCls = Union[Type["HTMLParser"], Type["LexborHTMLParser"]]
+Parser = Union["HTMLParser", "LexborHTMLParser"]
+
+
 def preprocess_input(html, decode_errors='ignore'):
     if isinstance(html, (str, unicode)):
         bytes_html = html.encode('UTF-8', errors=decode_errors)
@@ -11,3 +17,91 @@ def preprocess_input(html, decode_errors='ignore'):
     if html_len > MAX_HTML_INPUT_SIZE:
         raise ValueError("The specified HTML input is too large to be processed (%d bytes)" % html_len)
     return bytes_html, html_len
+
+
+def do_create_tag(tag: str, parser_cls: ParserCls):
+    if not tag:
+        raise ValueError("Tag name cannot be empty")
+    return do_parse_fragment(f"<{tag}></{tag}>", parser_cls)[0]
+
+
+def get_fragment_type(
+    html: str,
+    parser_cls: ParserCls,
+    tree: Optional[Parser] = None,
+) -> Literal["document", "fragment", "head", "body", "head_and_body", "document_no_head", "document_no_body", "document_no_head_no_body"]:
+    if not tree:
+        tree = parser_cls(html)
+
+    import re
+    html_re = re.compile(r"<html|<body|<head", re.IGNORECASE)
+
+    has_html = False
+    has_head = False
+    has_body = False
+    for match in html_re.finditer(html):
+        if match[0] == "<html":
+            has_html = True
+        elif match[0] == "<head":
+            has_head = True
+        elif match[0] == "<body":
+            has_body = True
+
+        if has_html and has_head and has_body:
+            break
+
+    if has_html and has_head and has_body:
+        return "document"
+    elif has_html and not has_head and has_body:
+        return "document_no_head"
+    elif has_html and has_head and not has_body:
+        return "document_no_body"
+    elif has_html and not has_head and not has_body:
+        return "document_no_head_no_body"
+    elif has_head and not has_body:
+        return "head"
+    elif not has_head and has_body:
+        return "body"
+    elif has_head and has_body:
+        return "head_and_body"
+    else:
+        return "fragment"
+
+
+def do_parse_fragment(html: str, parser_cls: ParserCls):
+    """
+    Given HTML, parse it into a list of Nodes, such that the nodes
+    correspond to the given HTML.
+
+    For contrast, HTMLParser adds `<html>`, `<head>`, and `<body>` tags
+    if they are missing. This function does not add these tags.
+    """
+    html = html.strip()
+    tree = parser_cls(html)
+    frag_type = get_fragment_type(html, parser_cls, tree)
+
+    if frag_type == "document":
+        return [tree.root]
+    if frag_type == "document_no_head":
+        tree.head.decompose(recursive=True)
+        return [tree.root]
+    if frag_type == "document_no_body":
+        tree.body.decompose(recursive=True)
+        return [tree.root]
+    if frag_type == "document_no_head_no_body":
+        tree.head.decompose(recursive=True)
+        tree.body.decompose(recursive=True)
+        return [tree.root]
+    elif frag_type == "head":
+        tree.body.decompose(recursive=True)
+        return [tree.head]
+    elif frag_type == "body":
+        tree.head.decompose(recursive=True)
+        return [tree.body]
+    elif frag_type == "head_and_body":
+        return [tree.head, tree.body]
+    else:
+        return [
+            *tree.head.iter(include_text=True),
+            *tree.body.iter(include_text=True),
+        ]