Propose PDFRouterParser and Loader

pprados · pprados · commit 007180d990dd · 2025-04-16T09:03:34.000+02:00
diff --git a/libs/community/langchain_community/document_loaders/parsers/__init__.py b/libs/community/langchain_community/document_loaders/parsers/__init__.py
@@ -29,6 +29,7 @@
     from langchain_community.document_loaders.parsers.pdf import (
         PDFMinerParser,
         PDFPlumberParser,
+        PDFRouterParser,
         PyMuPDFParser,
         PyPDFium2Parser,
         PyPDFParser,
@@ -51,6 +52,7 @@
     "PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
     "PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
     "PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
+    "PDFRouterParser": "langchain_community.document_loaders.parsers.pdf",
     "PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
     "RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
     "TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
@@ -76,6 +78,7 @@ def __getattr__(name: str) -> Any:
     "OpenAIWhisperParser",
     "PDFMinerParser",
     "PDFPlumberParser",
+    "PDFRouterParser",
     "PyMuPDFParser",
     "PyPDFParser",
     "PyPDFium2Parser",
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -2,10 +2,10 @@
 
 from __future__ import annotations
 
-import re
 import html
 import io
 import logging
+import re
 import threading
 import warnings
 from datetime import datetime
@@ -1670,6 +1670,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
 
             yield from docs
 
+
 class PDFRouterParser(BaseBlobParser):
     """
     Load PDFs using different parsers based on the metadata of the PDF
@@ -1700,15 +1701,17 @@ class PDFRouterParser(BaseBlobParser):
     ```
     """
 
+    Routes = Sequence[
+        tuple[
+            str,
+            Mapping[str, Union[re.Pattern, str]],
+            BaseBlobParser,
+        ]
+    ]
+
     def __init__(
         self,
-        routes: list[
-            tuple[
-                str,
-                dict[str, Union[re.Pattern, str]],
-                BaseBlobParser,
-            ]
-        ],
+        routes: Routes,
         *,
         password: Optional[str] = None,
     ):
@@ -1736,7 +1739,8 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
             import pypdf  # noqa:F401
         except ImportError:
             raise ImportError(
-                "pypdf package not found, please install it with `pip install pypdf.six`"
+                "pypdf package not found, please install it with "
+                "`pip install pypdf.six`"
             )
         from pypdf import PdfReader
 
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
@@ -22,8 +22,6 @@
 from urllib.parse import urlparse
 
 import requests
-
-from langchain_core.document_loaders import BaseBlobParser
 from langchain_core.documents import Document
 from langchain_core.utils import get_from_dict_or_env
 
@@ -39,7 +37,7 @@
     PDFPlumberParser,
     PyMuPDFParser,
     PyPDFium2Parser,
-    PyPDFParser, PDFRouterParser,
+    PyPDFParser,
 )
 from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
 
@@ -1426,60 +1424,61 @@ def lazy_load(self) -> Iterator[Document]:
 # Legacy: only for backwards compatibility. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader
 
-class PDFRouterLoader(BasePDFLoader):
-    """
-    Load PDFs using different parsers based on the metadata of the PDF
-    or the body of the first page.
-    The routes are defined as a list of tuples, where each tuple contains
-    the name, a dictionary of metadata and regex pattern and the parser to use.
-    The special key "page1" is to search in the first page with a regexp.
-    Use the route in the correct order, as the first matching route is used.
-    Add a default route ("default", {}, parser) at the end to catch all PDFs.
-
-    Sample:
-    ```python
-    from langchain_community.document_loaders import PyPDFLoader
-    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
-    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
-    from langchain_community.document_loaders.parsers import PDFPlumberParser
-    routes = [
-        # Name, keys with regex, parser
-        ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
-        PyMuPDFParser()),
-        ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
-        ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()),
-        ("defautl", {}, PyPDFium2Parser())
-    ]
-    loader = PDFRouterLoader(filename, routes)
-    loader.load()
-    ```
-    """
-
-    def __init__(
-        self,
-        file_path: Union[str, Path],
-        *,
-        routes: list[
-            tuple[
-                str,
-                dict[str, Union[re.Pattern | str]],
-                BaseBlobParser,
-            ]
-        ],
-        password: Optional[str] = None,
-    ):
-        """Initialize with a file path."""
-        super().__init__(file_path)
-        self.parser = PDFRouterParser(routes, password=password)
-
 
-    def lazy_load(
-        self,
-    ) -> Iterator[Document]:
-        if self.web_path:
-            blob = Blob.from_data(
-                open(self.file_path, "rb").read(), path=self.web_path
-            )  # type: ignore[attr-defined]
-        else:
-            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
-        yield from self.parser.lazy_parse(blob)
+# class PDFRouterLoader(BasePDFLoader):
+#     """
+#     Load PDFs using different parsers based on the metadata of the PDF
+#     or the body of the first page.
+#     The routes are defined as a list of tuples, where each tuple contains
+#     the name, a dictionary of metadata and regex pattern and the parser to use.
+#     The special key "page1" is to search in the first page with a regexp.
+#     Use the route in the correct order, as the first matching route is used.
+#     Add a default route ("default", {}, parser) at the end to catch all PDFs.
+#
+#     Sample:
+#     ```python
+#     from langchain_community.document_loaders.pdf import PDFRouterLoader
+#     from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
+#     from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser
+#     from langchain_community.document_loaders.parsers import PDFPlumberParser
+#     routes = [
+#         # Name, keys with regex, parser
+#         ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"},
+#         PyMuPDFParser()),
+#         ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()),
+#         ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"},
+#         PDFPlumberParser()),
+#         ("default", {}, PyPDFium2Parser())
+#     ]
+#     loader = PDFRouterLoader(filename, routes=routes)
+#     loader.load()
+#     ```
+#     """
+#
+#     def __init__(
+#         self,
+#         file_path: Union[str, Path],
+#         *,
+#         routes: list[
+#             tuple[
+#                 str,
+#                 dict[str, Union[re.Pattern, str]],
+#                 BaseBlobParser,
+#             ]
+#         ],
+#         password: Optional[str] = None,
+#     ):
+#         """Initialize with a file path."""
+#         super().__init__(file_path)
+#         self.parser = PDFRouterParser(routes, password=password)
+#
+#     def lazy_load(
+#         self,
+#     ) -> Iterator[Document]:
+#         if self.web_path:
+#             blob = Blob.from_data(open(self.file_path, "rb").read(),
+#             path=self.web_path)  # type: ignore[attr-defined]
+#         else:
+#             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
+#         yield from self.parser.lazy_parse(blob)
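+# FIXME: PDFRouterLoader above is commented out; re-enable it (restoring the
+# BaseBlobParser and PDFRouterParser imports it needs) or delete it.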
diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -2,7 +2,7 @@
 
 import re
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterator
+from typing import TYPE_CHECKING, Iterator, Literal, Union, cast
 
 import pytest
 
@@ -11,10 +11,12 @@
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import (
     BaseImageBlobParser,
-    PDFPlumberParser, PyMuPDFParser, PyPDFium2Parser,
+    PDFMinerParser,
+    PDFPlumberParser,
+    PDFRouterParser,
+    PyMuPDFParser,
+    PyPDFium2Parser,
 )
-from langchain_community.document_loaders.parsers.pdf import PDFRouterParser, \
-    PDFMinerParser
 
 if TYPE_CHECKING:
     from PIL.Image import Image
@@ -315,9 +317,15 @@ def _analyze_image(self, img: Image) -> str:
     )
     _std_assert_with_parser(parser)
 
+
 def test_parser_router_parse() -> None:
-    mode = "single"
-    routes = [
+    mode: Literal["single"] = "single"
+    routes: PDFRouterParser.Routes = [
+        (
+            "Xdvipdfmx",
+            {"producer": re.compile(r"xdvipdfmx.*"), "page1": "Hello"},
+            PDFMinerParser(mode=mode),
+        ),
         (
             "Microsoft",
             {"producer": "Microsoft", "creator": "Microsoft"},
@@ -331,10 +339,14 @@ def test_parser_router_parse() -> None:
             PDFMinerParser(mode=mode),
         ),
         (
-            "Xdvipdfmx",
-            {"producer": "xdvipdfmx.*", "page1": "Hello"},
-            PDFMinerParser(mode=mode),
+            "default",
+            cast(dict[str, Union[re.Pattern, str]], dict()),
+            PyPDFium2Parser(mode=mode),
         ),
-        ("default", {}, PyPDFium2Parser(mode=mode)),
     ]
-    _assert_with_parser(PDFRouterParser(routes=routes), splits_by_page=False)
+    _assert_with_parser(
+        PDFRouterParser(
+            routes=routes,
+        ),
+        splits_by_page=False,
+    )
diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
@@ -14,6 +14,7 @@ def test_parsers_public_api_correct() -> None:
         "OpenAIWhisperParser",
         "PyPDFParser",
         "PDFMinerParser",
+        "PDFRouterParser",
         "PyMuPDFParser",
         "PyPDFium2Parser",
         "PDFPlumberParser",