Skip to content

Commit 007180d

Browse files
committed
Propose PDFRouterParser and Loader
1 parent b5221f2 commit 007180d

File tree

5 files changed

+98
-79
lines changed

5 files changed

+98
-79
lines changed

libs/community/langchain_community/document_loaders/parsers/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from langchain_community.document_loaders.parsers.pdf import (
3030
PDFMinerParser,
3131
PDFPlumberParser,
32+
PDFRouterParser,
3233
PyMuPDFParser,
3334
PyPDFium2Parser,
3435
PyPDFParser,
@@ -51,6 +52,7 @@
5152
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
5253
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
5354
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
55+
"PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
5456
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
5557
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
5658
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
@@ -76,6 +78,7 @@ def __getattr__(name: str) -> Any:
7678
"OpenAIWhisperParser",
7779
"PDFMinerParser",
7880
"PDFPlumberParser",
81+
"PDFRouterParser",
7982
"PyMuPDFParser",
8083
"PyPDFParser",
8184
"PyPDFium2Parser",

libs/community/langchain_community/document_loaders/parsers/pdf.py

+13-9
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
from __future__ import annotations
44

5-
import re
65
import html
76
import io
87
import logging
8+
import re
99
import threading
1010
import warnings
1111
from datetime import datetime
@@ -1670,6 +1670,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
16701670

16711671
yield from docs
16721672

1673+
16731674
class PDFRouterParser(BaseBlobParser):
16741675
"""
16751676
Load PDFs using different parsers based on the metadata of the PDF
@@ -1700,15 +1701,17 @@ class PDFRouterParser(BaseBlobParser):
17001701
```
17011702
"""
17021703

1704+
Routes = Sequence[
1705+
tuple[
1706+
str,
1707+
Mapping[str, Union[re.Pattern, str]],
1708+
BaseBlobParser,
1709+
]
1710+
]
1711+
17031712
def __init__(
17041713
self,
1705-
routes: list[
1706-
tuple[
1707-
str,
1708-
dict[str, Union[re.Pattern, str]],
1709-
BaseBlobParser,
1710-
]
1711-
],
1714+
routes: Routes,
17121715
*,
17131716
password: Optional[str] = None,
17141717
):
@@ -1736,7 +1739,8 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
17361739
import pypdf # noqa:F401
17371740
except ImportError:
17381741
raise ImportError(
1739-
"pypdf package not found, please install it with `pip install pypdf.six`"
1742+
"pypdf package not found, please install it with "
1743+
"`pip install pypdf.six`"
17401744
)
17411745
from pypdf import PdfReader
17421746

libs/community/langchain_community/document_loaders/pdf.py

+58-59
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@
2222
from urllib.parse import urlparse
2323

2424
import requests
25-
26-
from langchain_core.document_loaders import BaseBlobParser
2725
from langchain_core.documents import Document
2826
from langchain_core.utils import get_from_dict_or_env
2927

@@ -39,7 +37,7 @@
3937
PDFPlumberParser,
4038
PyMuPDFParser,
4139
PyPDFium2Parser,
42-
PyPDFParser, PDFRouterParser,
40+
PyPDFParser,
4341
)
4442
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
4543

@@ -1426,60 +1424,61 @@ def lazy_load(self) -> Iterator[Document]:
14261424
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
14271425
PagedPDFSplitter = PyPDFLoader
14281426

1429-
class PDFRouterLoader(BasePDFLoader):
1430-
"""
1431-
Load PDFs using different parsers based on the metadata of the PDF
1432-
or the body of the first page.
1433-
The routes are defined as a list of tuples, where each tuple contains
1434-
the name, a dictionary of metadata and regex pattern and the parser to use.
1435-
The special key "page1" is to search in the first page with a regexp.
1436-
Use the route in the correct order, as the first matching route is used.
1437-
Add a default route ("default", {}, parser) at the end to catch all PDFs.
1438-
1439-
Sample:
1440-
```python
1441-
from langchain_community.document_loaders import PyPDFLoader
1442-
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
1443-
from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
1444-
from langchain_community.document_loaders.parsers import PDFPlumberParser
1445-
routes = [
1446-
# Name, keys with regex, parser
1447-
("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
1448-
PyMuPDFParser()),
1449-
("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
1450-
("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
1451-
("defautl", {}, PyPDFium2Parser())
1452-
]
1453-
loader = PDFRouterLoader(filename, routes)
1454-
loader.load()
1455-
```
1456-
"""
1457-
1458-
def __init__(
1459-
self,
1460-
file_path: Union[str, Path],
1461-
*,
1462-
routes: list[
1463-
tuple[
1464-
str,
1465-
dict[str, Union[re.Pattern | str]],
1466-
BaseBlobParser,
1467-
]
1468-
],
1469-
password: Optional[str] = None,
1470-
):
1471-
"""Initialize with a file path."""
1472-
super().__init__(file_path)
1473-
self.parser = PDFRouterParser(routes, password=password)
1474-
14751427

1476-
def lazy_load(
1477-
self,
1478-
) -> Iterator[Document]:
1479-
if self.web_path:
1480-
blob = Blob.from_data(
1481-
open(self.file_path, "rb").read(), path=self.web_path
1482-
) # type: ignore[attr-defined]
1483-
else:
1484-
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
1485-
yield from self.parser.lazy_parse(blob)
1428+
# class PDFRouterLoader(BasePDFLoader):
1429+
# """
1430+
# Load PDFs using different parsers based on the metadata of the PDF
1431+
# or the body of the first page.
1432+
# The routes are defined as a list of tuples, where each tuple contains
1433+
# a name, a dict of metadata keys to regex patterns, and the parser to use.
1434+
# The special key "page1" matches a regexp against the first page's body.
1435+
# List the routes in priority order, as the first matching route is used.
1436+
# Add a default route ("default", {}, parser) at the end to catch all PDFs.
1437+
#
1438+
# Sample:
1439+
# ```python
1440+
# from langchain_community.document_loaders.pdf import PDFRouterLoader
1441+
# from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
1442+
# from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
1443+
# from langchain_community.document_loaders.parsers import PDFPlumberParser
1444+
# routes = [
1445+
# # Name, keys with regex, parser
1446+
# ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
1447+
# PyMuPDFParser()),
1448+
# ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
1449+
# ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
1450+
# PDFPlumberParser()),
1451+
# ("default", {}, PyPDFium2Parser())
1452+
# ]
1453+
# loader = PDFRouterLoader(filename, routes=routes)
1454+
# loader.load()
1455+
# ```
1456+
# """
1457+
#
1458+
# def __init__(
1459+
# self,
1460+
# file_path: Union[str, Path],
1461+
# *,
1462+
# routes: list[
1463+
# tuple[
1464+
# str,
1465+
# dict[str, Union[re.Pattern, str]],
1466+
# BaseBlobParser,
1467+
# ]
1468+
# ],
1469+
# password: Optional[str] = None,
1470+
# ):
1471+
# """Initialize with a file path."""
1472+
# super().__init__(file_path)
1473+
# self.parser = PDFRouterParser(routes, password=password)
1474+
#
1475+
# def lazy_load(
1476+
# self,
1477+
# ) -> Iterator[Document]:
1478+
# if self.web_path:
1479+
# blob = Blob.from_data(open(self.file_path, "rb").read(),
1480+
# path=self.web_path) # type: ignore[attr-defined]
1481+
# else:
1482+
# blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
1483+
# yield from self.parser.lazy_parse(blob)
1484+
# FIXME: PDFRouterLoader is commented out above; restore it or remove it.

libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py

+23-11
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import re
44
from pathlib import Path
5-
from typing import TYPE_CHECKING, Iterator
5+
from typing import TYPE_CHECKING, Iterator, Literal, Union, cast
66

77
import pytest
88

@@ -11,10 +11,12 @@
1111
from langchain_community.document_loaders.blob_loaders import Blob
1212
from langchain_community.document_loaders.parsers import (
1313
BaseImageBlobParser,
14-
PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
14+
PDFMinerParser,
15+
PDFPlumberParser,
16+
PDFRouterParser,
17+
PyMuPDFParser,
18+
PyPDFium2Parser,
1519
)
16-
from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
17-
PDFMinerParser
1820

1921
if TYPE_CHECKING:
2022
from PIL.Image import Image
@@ -315,9 +317,15 @@ def _analyze_image(self, img: Image) -> str:
315317
)
316318
_std_assert_with_parser(parser)
317319

320+
318321
def test_parser_router_parse() -> None:
319-
mode = "single"
320-
routes = [
322+
mode: Literal["single"] = "single"
323+
routes: PDFRouterParser.Routes = [
324+
(
325+
"Xdvipdfmx",
326+
{"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
327+
PDFMinerParser(mode=mode),
328+
),
321329
(
322330
"Microsoft",
323331
{"producer": "Microsoft", "creator": "Microsoft"},
@@ -331,10 +339,14 @@ def test_parser_router_parse() -> None:
331339
PDFMinerParser(mode=mode),
332340
),
333341
(
334-
"Xdvipdfmx",
335-
{"producer": "xdvipdfmx.*", "page1": "Hello"},
336-
PDFMinerParser(mode=mode),
342+
"default",
343+
cast(dict[str, Union[re.Pattern, str]], dict()),
344+
PyPDFium2Parser(mode=mode),
337345
),
338-
("default", {}, PyPDFium2Parser(mode=mode)),
339346
]
340-
_assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False)
347+
_assert_with_parser(
348+
PDFRouterParser(
349+
routes=routes,
350+
),
351+
splits_by_page=False,
352+
)

libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
1414
"OpenAIWhisperParser",
1515
"PyPDFParser",
1616
"PDFMinerParser",
17+
"PDFRouterParser",
1718
"PyMuPDFParser",
1819
"PyPDFium2Parser",
1920
"PDFPlumberParser",

0 commit comments

Comments
 (0)