Skip to content

Commit

Permalink
feat: add parse_fragment and create_tag (#134)
Browse files Browse the repository at this point in the history
* feat: add parse_fragment and create_tag

* refactor: markdown styling

* refactor: elaborate docs

* refactor: remove lexbor_ prefix
  • Loading branch information
JuroOravec authored Oct 22, 2024
1 parent 279c526 commit de807b5
Show file tree
Hide file tree
Showing 9 changed files with 381 additions and 1 deletion.
42 changes: 41 additions & 1 deletion examples/walkthrough.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"metadata": {},
"outputs": [],
"source": [
"from selectolax.parser import HTMLParser"
"from selectolax.parser import HTMLParser, parse_fragment, create_tag"
]
},
{
Expand All @@ -29,9 +29,49 @@
" <p id='stext'>Lorem ipsum dolor sit amet, ea quo modus meliore platonem.</p>\n",
" </div>\n",
"</body>\n",
"\"\"\"\n",
"\n",
"fragment = \"\"\"\n",
"<div>\n",
" <p class=\"p3\">\n",
" Hello there!\n",
" </p>\n",
"</div>\n",
"<script>\n",
" document.querySelector(\".p3\").addEventListener(\"click\", () => { ... });\n",
"</script>\n",
"\"\"\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Parsing HTML\n",
"\n",
"There are 3 ways to create or parse objects in Selectolax:\n",
"1. Parse HTML as a full document using `HTMLParser()`\n",
"2. Parse HTML as a fragment using `parse_fragment()`\n",
"3. Create a single node using `create_tag()`\n",
"\n",
"- `HTMLParser()` - This returns the HTML tree as parsed by Modest / Lexbor, unmodified. The HTML is assumed to be a full document. `<html>`, `<head>`, and `<body>` tags are added if missing.\n",
"\n",
"- `parse_fragment()` - Intended for HTML fragments / partials. Returns a list of Nodes. The given HTML doesn't need to contain `<html>`, `<head>`, or `<body>`, and it can have multiple root elements.\n",
"\n",
"- `create_tag()` - Create a single empty node for the given tag."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"html_tree = HTMLParser(html)\n",
"frag_tree = parse_fragment(fragment)\n",
"node = create_tag(\"div\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
17 changes: 17 additions & 0 deletions selectolax/lexbor.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,20 @@ class LexborHTMLParser:
def css_matches(self, selector: str) -> bool: ...
def clone(self) -> "LexborHTMLParser": ...
def unwrap_tags(self, tags: list[str]) -> None: ...

def create_tag(tag: str) -> "LexborNode":
    """
    Create a single empty node for the given HTML tag name.

    For example, `"div"` produces a node equivalent to `"<div></div>"`.
    """
    ...

def parse_fragment(html: str) -> list["LexborNode"]:
    """
    Parse an HTML fragment into a list of nodes that correspond to the given HTML.

    In contrast to the full-document parser, which adds `<html>`, `<head>`,
    and `<body>` tags when they are missing, this function does not add
    these tags.
    """
    ...
1 change: 1 addition & 0 deletions selectolax/lexbor.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ include "utils.pxi"
include "lexbor/attrs.pxi"
include "lexbor/node.pxi"
include "lexbor/selection.pxi"
include "lexbor/util.pxi"

# We don't inherit from HTMLParser here, because it also includes all the C code from Modest.

Expand Down
19 changes: 19 additions & 0 deletions selectolax/lexbor/util.pxi
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
include "../utils.pxi"

def create_tag(tag: str):
    """
    Create a single empty node for the HTML tag name `tag`, using the Lexbor parser.

    For example, `"div"` produces a node equivalent to `"<div></div>"`.
    """
    node = do_create_tag(tag, LexborHTMLParser)
    return node


def parse_fragment(html: str):
    """
    Parse `html` as a fragment with the Lexbor parser and return its nodes as a list.

    Unlike full-document parsing, which adds `<html>`, `<head>`, and `<body>`
    tags when they are missing, the returned nodes correspond to the given
    HTML and these tags are not added.
    """
    nodes = do_parse_fragment(html, LexborHTMLParser)
    return nodes
19 changes: 19 additions & 0 deletions selectolax/modest/util.pxi
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
include "../utils.pxi"

def create_tag(tag: str):
    """
    Create a single empty node for the HTML tag name `tag`, using `HTMLParser`.

    For example, `"div"` produces a node equivalent to `"<div></div>"`.
    """
    node = do_create_tag(tag, HTMLParser)
    return node


def parse_fragment(html: str):
    """
    Parse `html` as a fragment with `HTMLParser` and return its nodes as a list.

    Unlike full-document parsing, which adds `<html>`, `<head>`, and `<body>`
    tags when they are missing, the returned nodes correspond to the given
    HTML and these tags are not added.
    """
    nodes = do_parse_fragment(html, HTMLParser)
    return nodes
17 changes: 17 additions & 0 deletions selectolax/parser.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,20 @@ class HTMLParser:
This is useful for text extraction."""
...

def create_tag(tag: str) -> "Node":
    """
    Create a single empty node for the given HTML tag name.

    For example, `"div"` produces a node equivalent to `"<div></div>"`.
    """
    ...

def parse_fragment(html: str) -> list["Node"]:
    """
    Parse an HTML fragment into a list of nodes that correspond to the given HTML.

    In contrast to `HTMLParser`, which adds `<html>`, `<head>`, and `<body>`
    tags when they are missing, this function does not add these tags.
    """
    ...
1 change: 1 addition & 0 deletions selectolax/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ from cpython cimport bool

include "modest/selection.pxi"
include "modest/node.pxi"
include "modest/util.pxi"
include "utils.pxi"

cdef class HTMLParser:
Expand Down
94 changes: 94 additions & 0 deletions selectolax/utils.pxi
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
from typing import Literal, Optional, Union, Type

# Upper bound on the size of the encoded input, in bytes (250e+7 == 2.5e9).
# preprocess_input raises ValueError for anything larger. Note the literal is a float.
MAX_HTML_INPUT_SIZE = 250e+7

# Either parser *class* (not an instance); forward references because the
# classes are not defined in this include file.
ParserCls = Union[Type["HTMLParser"], Type["LexborHTMLParser"]]
# An *instance* of either parser.
Parser = Union["HTMLParser", "LexborHTMLParser"]


def preprocess_input(html, decode_errors='ignore'):
if isinstance(html, (str, unicode)):
bytes_html = html.encode('UTF-8', errors=decode_errors)
Expand All @@ -11,3 +17,91 @@ def preprocess_input(html, decode_errors='ignore'):
if html_len > MAX_HTML_INPUT_SIZE:
raise ValueError("The specified HTML input is too large to be processed (%d bytes)" % html_len)
return bytes_html, html_len


def do_create_tag(tag: str, parser_cls: ParserCls):
    """
    Build a single empty node for `tag` using the given parser class.

    The node is obtained by parsing the markup `<tag></tag>` as a fragment
    and taking the first resulting node.

    Raises:
        ValueError: if `tag` is empty.
    """
    if tag:
        markup = f"<{tag}></{tag}>"
        parsed_nodes = do_parse_fragment(markup, parser_cls)
        return parsed_nodes[0]
    raise ValueError("Tag name cannot be empty")


def get_fragment_type(
    html: str,
    parser_cls: ParserCls,
    tree: Optional[Parser] = None,
) -> Literal["document", "fragment", "head", "body", "head_and_body", "document_no_head", "document_no_body", "document_no_head_no_body"]:
    """
    Classify `html` by which of the `<html>`, `<head>`, and `<body>` tags it spells out.

    The check is a textual scan for opening tags, not a parse. A tag is only
    counted when its name is followed by whitespace, `/`, `>`, or the end of
    the input, so `<header>` is not mistaken for `<head>`. Matching is
    case-insensitive, so `<HTML>` and `<Body>` are recognised.

    NOTE: because this is a textual scan, tag-like text inside comments,
    scripts, or attribute values is counted as well.

    Args:
        html: The HTML source to inspect.
        parser_cls: Unused; retained so existing callers keep working.
        tree: Unused; retained so existing callers keep working.

    Returns:
        `"document"` and the `"document_no_*"` variants when `<html>` is present,
        `"head"`, `"body"`, or `"head_and_body"` when only those tags are
        present, and `"fragment"` when none of the three tags appear.
    """
    import re

    # The lookahead ensures the tag name ends here, so longer names that merely
    # start with html/head/body (e.g. <header>) are not counted.
    tag_re = re.compile(r"<(html|head|body)(?=[\s/>]|$)", re.IGNORECASE)

    seen = set()
    for match in tag_re.finditer(html):
        # Normalise case: IGNORECASE makes the pattern match "<HTML" too.
        seen.add(match.group(1).lower())
        if len(seen) == 3:
            # All three structural tags found; nothing more to learn.
            break

    # Keyed by (has_html, has_head, has_body).
    kinds = {
        (True, True, True): "document",
        (True, False, True): "document_no_head",
        (True, True, False): "document_no_body",
        (True, False, False): "document_no_head_no_body",
        (False, True, False): "head",
        (False, False, True): "body",
        (False, True, True): "head_and_body",
        (False, False, False): "fragment",
    }
    return kinds[("html" in seen, "head" in seen, "body" in seen)]


def do_parse_fragment(html: str, parser_cls: ParserCls):
    """
    Parse `html` with `parser_cls` and return the nodes that correspond to it.

    The input is stripped of surrounding whitespace and parsed as a full
    document. The `<head>` / `<body>` sections that the input did not spell
    out (as reported by `get_fragment_type`) are then removed, so the result
    contains none of the tags a full-document parse would add.

    Returns:
        A list of nodes: the root node for document-like input, the head
        and/or body node when only those tags were given, or otherwise the
        nodes yielded by iterating the head and then the body (text nodes
        included).
    """
    # NOTE(review): assumes tree.head and tree.body are never None after
    # parsing, including for empty input; confirm for both parser backends.
    source = html.strip()
    tree = parser_cls(source)
    kind = get_fragment_type(source, parser_cls, tree)

    # Detach whichever section the input did not contain; head goes first
    # when both are dropped.
    if kind in ("document_no_head", "document_no_head_no_body", "body"):
        tree.head.decompose(recursive=True)
    if kind in ("document_no_body", "document_no_head_no_body", "head"):
        tree.body.decompose(recursive=True)

    if kind in ("document", "document_no_head", "document_no_body", "document_no_head_no_body"):
        return [tree.root]
    if kind == "head":
        return [tree.head]
    if kind == "body":
        return [tree.body]
    if kind == "head_and_body":
        return [tree.head, tree.body]

    # Plain fragment: hand back the parsed content itself, without the wrappers.
    nodes = list(tree.head.iter(include_text=True))
    nodes.extend(tree.body.iter(include_text=True))
    return nodes
Loading

0 comments on commit de807b5

Please sign in to comment.