Skip to content

Commit fdf5c9a

Browse files
committed
Propose PDFRouterParser and Loader
1 parent b5221f2 commit fdf5c9a

File tree

5 files changed

+30
-18
lines changed

libs/community/langchain_community/document_loaders/parsers/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from langchain_community.document_loaders.parsers.pdf import (
3030
PDFMinerParser,
3131
PDFPlumberParser,
32+
PDFRouterParser,
3233
PyMuPDFParser,
3334
PyPDFium2Parser,
3435
PyPDFParser,
@@ -51,6 +52,7 @@
5152
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
5253
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
5354
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
55+
"PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
5456
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
5557
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
5658
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
@@ -76,6 +78,7 @@ def __getattr__(name: str) -> Any:
7678
"OpenAIWhisperParser",
7779
"PDFMinerParser",
7880
"PDFPlumberParser",
81+
"PDFRouterParser",
7982
"PyMuPDFParser",
8083
"PyPDFParser",
8184
"PyPDFium2Parser",

libs/community/langchain_community/document_loaders/parsers/pdf.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
from __future__ import annotations
44

5-
import re
65
import html
76
import io
87
import logging
8+
import re
99
import threading
1010
import warnings
1111
from datetime import datetime
@@ -1670,6 +1670,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
16701670

16711671
yield from docs
16721672

1673+
16731674
class PDFRouterParser(BaseBlobParser):
16741675
"""
16751676
Load PDFs using different parsers based on the metadata of the PDF
@@ -1702,7 +1703,7 @@ class PDFRouterParser(BaseBlobParser):
17021703

17031704
def __init__(
17041705
self,
1705-
routes: list[
1706+
routes: Sequence[
17061707
tuple[
17071708
str,
17081709
dict[str, Union[re.Pattern, str]],
@@ -1736,7 +1737,8 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
17361737
import pypdf # noqa:F401
17371738
except ImportError:
17381739
raise ImportError(
1739-
"pypdf package not found, please install it with `pip install pypdf.six`"
1740+
"pypdf package not found, please install it with "
1741+
"`pip install pypdf.six`"
17401742
)
17411743
from pypdf import PdfReader
17421744

libs/community/langchain_community/document_loaders/pdf.py

+5-7
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
from urllib.parse import urlparse
2323

2424
import requests
25-
2625
from langchain_core.document_loaders import BaseBlobParser
2726
from langchain_core.documents import Document
2827
from langchain_core.utils import get_from_dict_or_env
@@ -37,9 +36,10 @@
3736
DocumentIntelligenceParser,
3837
PDFMinerParser,
3938
PDFPlumberParser,
39+
PDFRouterParser,
4040
PyMuPDFParser,
4141
PyPDFium2Parser,
42-
PyPDFParser, PDFRouterParser,
42+
PyPDFParser,
4343
)
4444
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
4545

@@ -1426,6 +1426,7 @@ def lazy_load(self) -> Iterator[Document]:
14261426
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
14271427
PagedPDFSplitter = PyPDFLoader
14281428

1429+
14291430
class PDFRouterLoader(BasePDFLoader):
14301431
"""
14311432
Load PDFs using different parsers based on the metadata of the PDF
@@ -1462,7 +1463,7 @@ def __init__(
14621463
routes: list[
14631464
tuple[
14641465
str,
1465-
dict[str, Union[re.Pattern | str]],
1466+
dict[str, Union[re.Pattern, str]],
14661467
BaseBlobParser,
14671468
]
14681469
],
@@ -1472,14 +1473,11 @@ def __init__(
14721473
super().__init__(file_path)
14731474
self.parser = PDFRouterParser(routes, password=password)
14741475

1475-
14761476
def lazy_load(
14771477
self,
14781478
) -> Iterator[Document]:
14791479
if self.web_path:
1480-
blob = Blob.from_data(
1481-
open(self.file_path, "rb").read(), path=self.web_path
1482-
) # type: ignore[attr-defined]
1480+
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined]
14831481
else:
14841482
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
14851483
yield from self.parser.lazy_parse(blob)

libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py

+16-8
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import re
44
from pathlib import Path
5-
from typing import TYPE_CHECKING, Iterator
5+
from typing import TYPE_CHECKING, Iterator, Literal
66

77
import pytest
88

@@ -11,10 +11,12 @@
1111
from langchain_community.document_loaders.blob_loaders import Blob
1212
from langchain_community.document_loaders.parsers import (
1313
BaseImageBlobParser,
14-
PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
14+
PDFMinerParser,
15+
PDFPlumberParser,
16+
PDFRouterParser,
17+
PyMuPDFParser,
18+
PyPDFium2Parser,
1519
)
16-
from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
17-
PDFMinerParser
1820

1921
if TYPE_CHECKING:
2022
from PIL.Image import Image
@@ -315,8 +317,9 @@ def _analyze_image(self, img: Image) -> str:
315317
)
316318
_std_assert_with_parser(parser)
317319

320+
318321
def test_parser_router_parse() -> None:
319-
mode = "single"
322+
mode: Literal["single"] = "single"
320323
routes = [
321324
(
322325
"Microsoft",
@@ -332,9 +335,14 @@ def test_parser_router_parse() -> None:
332335
),
333336
(
334337
"Xdvipdfmx",
335-
{"producer": "xdvipdfmx.*", "page1": "Hello"},
336-
PDFMinerParser(mode=mode),
338+
{"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
339+
# PDFMinerParser(mode=mode),
340+
PyPDFium2Parser(mode=mode),
341+
),
342+
(
343+
"default",
344+
dict(),
345+
PyPDFium2Parser(mode=mode),
337346
),
338-
("default", {}, PyPDFium2Parser(mode=mode)),
339347
]
340348
_assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False)

libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
1414
"OpenAIWhisperParser",
1515
"PyPDFParser",
1616
"PDFMinerParser",
17+
"PDFRouterParser",
1718
"PyMuPDFParser",
1819
"PyPDFium2Parser",
1920
"PDFPlumberParser",

0 commit comments

Comments
 (0)