-
Notifications
You must be signed in to change notification settings - Fork 89
/
Copy pathresiliparse_parser.py
36 lines (30 loc) · 1.28 KB
/
resiliparse_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from typing import Optional

from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse import detect_encoding
from resiliparse.parse.html import HTMLTree
class HTMLParser:
    """
    HTML parser using Resiliparse.

    Thin wrapper offering two steps: building an ``HTMLTree`` from raw bytes
    (``get_html_tree``) and turning a tree into plain text (``html_to_text``).
    """

    def html_to_text(self, tree, **kwargs) -> str:
        """
        Convert HTML content to plain text using Resiliparse.

        Args:
            tree: HTML document handed to ``extract_plain_text`` as its first
                argument (presumably an ``HTMLTree`` such as the one returned
                by ``get_html_tree`` -- confirm against callers).
            **kwargs: Additional arguments passed to extract_plain_text:
                Refer here https://resiliparse.chatnoir.eu/en/latest/api/extract/html2text.html#resiliparse.extract.html2text.extract_plain_text for accepted arguments.

        Returns:
            str: The plain text produced by ``extract_plain_text``.
        """
        return extract_plain_text(tree, **kwargs)

    def get_html_tree(
        self, page: bytes, encoding: Optional[str] = None, **kwargs
    ) -> HTMLTree:
        """
        Get the HTML tree object for a raw HTML page.

        Args:
            page (bytes): Raw HTML content as bytes.
            encoding (str, optional): Specific character encoding to use.
                If None (or empty), it is auto-detected from ``page`` with
                ``detect_encoding``.
            **kwargs: Additional arguments passed to
                ``HTMLTree.parse_from_bytes``.

        Returns:
            HTMLTree: The tree parsed from ``page``.
        """
        # Any falsy encoding (None or "") triggers detection on the raw bytes.
        if not encoding:
            encoding = detect_encoding(page)
        return HTMLTree.parse_from_bytes(page, encoding, **kwargs)