|
22 | 22 | from urllib.parse import urlparse
|
23 | 23 |
|
24 | 24 | import requests
|
25 |
| - |
26 |
| -from langchain_core.document_loaders import BaseBlobParser |
27 | 25 | from langchain_core.documents import Document
|
28 | 26 | from langchain_core.utils import get_from_dict_or_env
|
29 | 27 |
|
|
39 | 37 | PDFPlumberParser,
|
40 | 38 | PyMuPDFParser,
|
41 | 39 | PyPDFium2Parser,
|
42 |
| - PyPDFParser, PDFRouterParser, |
| 40 | + PyPDFParser, |
43 | 41 | )
|
44 | 42 | from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
45 | 43 |
|
@@ -1426,60 +1424,61 @@ def lazy_load(self) -> Iterator[Document]:
|
1426 | 1424 | # Legacy: only for backwards compatibility. Use PyPDFLoader instead
|
1427 | 1425 | PagedPDFSplitter = PyPDFLoader
|
1428 | 1426 |
|
1429 |
| -class PDFRouterLoader(BasePDFLoader): |
1430 |
| - """ |
1431 |
| - Load PDFs using different parsers based on the metadata of the PDF |
1432 |
| - or the body of the first page. |
1433 |
| - The routes are defined as a list of tuples, where each tuple contains |
1434 |
| - the name, a dictionary of metadata and regex pattern and the parser to use. |
1435 |
| - The special key "page1" is to search in the first page with a regexp. |
1436 |
| - Use the route in the correct order, as the first matching route is used. |
1437 |
| - Add a default route ("default", {}, parser) at the end to catch all PDFs. |
1438 |
| -
|
1439 |
| - Sample: |
1440 |
| - ```python |
1441 |
| - from langchain_community.document_loaders import PyPDFLoader |
1442 |
| - from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser |
1443 |
| - from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser |
1444 |
| - from langchain_community.document_loaders.parsers import PDFPlumberParser |
1445 |
| - routes = [ |
1446 |
| - # Name, keys with regex, parser |
1447 |
| - ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"}, |
1448 |
| - PyMuPDFParser()), |
1449 |
| - ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()), |
1450 |
| - ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, PDFPlumberParser()), |
1451 |
| - ("defautl", {}, PyPDFium2Parser()) |
1452 |
| - ] |
1453 |
| - loader = PDFRouterLoader(filename, routes) |
1454 |
| - loader.load() |
1455 |
| - ``` |
1456 |
| - """ |
1457 |
| - |
1458 |
| - def __init__( |
1459 |
| - self, |
1460 |
| - file_path: Union[str, Path], |
1461 |
| - *, |
1462 |
| - routes: list[ |
1463 |
| - tuple[ |
1464 |
| - str, |
1465 |
| - dict[str, Union[re.Pattern | str]], |
1466 |
| - BaseBlobParser, |
1467 |
| - ] |
1468 |
| - ], |
1469 |
| - password: Optional[str] = None, |
1470 |
| - ): |
1471 |
| - """Initialize with a file path.""" |
1472 |
| - super().__init__(file_path) |
1473 |
| - self.parser = PDFRouterParser(routes, password=password) |
1474 |
| - |
1475 | 1427 |
|
1476 |
| - def lazy_load( |
1477 |
| - self, |
1478 |
| - ) -> Iterator[Document]: |
1479 |
| - if self.web_path: |
1480 |
| - blob = Blob.from_data( |
1481 |
| - open(self.file_path, "rb").read(), path=self.web_path |
1482 |
| - ) # type: ignore[attr-defined] |
1483 |
| - else: |
1484 |
| - blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] |
1485 |
| - yield from self.parser.lazy_parse(blob) |
| 1428 | +# class PDFRouterLoader(BasePDFLoader): |
| 1429 | +# """ |
| 1430 | +# Load PDFs using different parsers based on the metadata of the PDF |
| 1431 | +# or the body of the first page. |
| 1432 | +# The routes are defined as a list of tuples, where each tuple contains |
| 1433 | +# the name, a dictionary of metadata and regex pattern and the parser to use. |
| 1434 | +# The special key "page1" is to search in the first page with a regexp. |
| 1435 | +# Use the route in the correct order, as the first matching route is used. |
| 1436 | +# Add a default route ("default", {}, parser) at the end to catch all PDFs. |
| 1437 | +# |
| 1438 | +# Sample: |
| 1439 | +# ```python |
| 1440 | +# from langchain_community.document_loaders import PyPDFLoader |
| 1441 | +# from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser |
| 1442 | +# from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser |
| 1443 | +# from langchain_community.document_loaders.parsers import PDFPlumberParser |
| 1444 | +# routes = [ |
| 1445 | +# # Name, keys with regex, parser |
| 1446 | +# ("Microsoft", {"producer": "Microsoft", "creator": "Microsoft"}, |
| 1447 | +# PyMuPDFParser()), |
| 1448 | +# ("LibreOffice", {"producer": "LibreOffice", }, PDFPlumberParser()), |
| 1449 | +# ("Xdvipdfmx", {"producer": "xdvipdfmx.*", "page1":"Hello"}, |
| 1450 | +# PDFPlumberParser()), |
| 1451 | +# ("default", {}, PyPDFium2Parser()) |
| 1452 | +# ] |
| 1453 | +# loader = PDFRouterLoader(filename, routes=routes) |
| 1454 | +# loader.load() |
| 1455 | +# ``` |
| 1456 | +# """ |
| 1457 | +# |
| 1458 | +# def __init__( |
| 1459 | +# self, |
| 1460 | +# file_path: Union[str, Path], |
| 1461 | +# *, |
| 1462 | +# routes: list[ |
| 1463 | +# tuple[ |
| 1464 | +# str, |
| 1465 | +# dict[str, Union[re.Pattern, str]], |
| 1466 | +# BaseBlobParser, |
| 1467 | +# ] |
| 1468 | +# ], |
| 1469 | +# password: Optional[str] = None, |
| 1470 | +# ): |
| 1471 | +# """Initialize with a file path.""" |
| 1472 | +# super().__init__(file_path) |
| 1473 | +# self.parser = PDFRouterParser(routes, password=password) |
| 1474 | +# |
| 1475 | +# def lazy_load( |
| 1476 | +# self, |
| 1477 | +# ) -> Iterator[Document]: |
| 1478 | +# if self.web_path: |
| 1479 | +# blob = Blob.from_data(open(self.file_path, "rb").read(), |
| 1480 | +# path=self.web_path) # type: ignore[attr-defined] |
| 1481 | +# else: |
| 1482 | +# blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] |
| 1483 | +# yield from self.parser.lazy_parse(blob) |
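| 1484 | +# FIXME: PDFRouterLoader is disabled (commented out above); re-enabling it requires restoring the BaseBlobParser and PDFRouterParser imports. |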
0 commit comments