22
22
from urllib .parse import urlparse
23
23
24
24
import requests
25
-
26
25
from langchain_core .document_loaders import BaseBlobParser
27
26
from langchain_core .documents import Document
28
27
from langchain_core .utils import get_from_dict_or_env
37
36
DocumentIntelligenceParser ,
38
37
PDFMinerParser ,
39
38
PDFPlumberParser ,
39
+ PDFRouterParser ,
40
40
PyMuPDFParser ,
41
41
PyPDFium2Parser ,
42
- PyPDFParser , PDFRouterParser ,
42
+ PyPDFParser ,
43
43
)
44
44
from langchain_community .document_loaders .unstructured import UnstructuredFileLoader
45
45
@@ -1426,6 +1426,7 @@ def lazy_load(self) -> Iterator[Document]:
1426
1426
# Legacy: only for backwards compatibility. Use PyPDFLoader instead
1427
1427
PagedPDFSplitter = PyPDFLoader
1428
1428
1429
+
1429
1430
class PDFRouterLoader (BasePDFLoader ):
1430
1431
"""
1431
1432
Load PDFs using different parsers based on the metadata of the PDF
@@ -1462,7 +1463,7 @@ def __init__(
1462
1463
routes : list [
1463
1464
tuple [
1464
1465
str ,
1465
- dict [str , Union [re .Pattern | str ]],
1466
+ dict [str , Union [re .Pattern , str ]],
1466
1467
BaseBlobParser ,
1467
1468
]
1468
1469
],
@@ -1472,14 +1473,11 @@ def __init__(
1472
1473
super ().__init__ (file_path )
1473
1474
self .parser = PDFRouterParser (routes , password = password )
1474
1475
1475
-
1476
1476
def lazy_load(
    self,
) -> Iterator[Document]:
    """Lazily parse the PDF and yield the resulting documents.

    The file at ``self.file_path`` is wrapped in a ``Blob`` and handed to
    ``self.parser``. When ``self.web_path`` is set, the bytes are read from
    ``self.file_path`` and the blob's ``path`` is set to ``self.web_path``;
    otherwise the blob is created directly from ``self.file_path``.

    Yields:
        The documents produced by ``self.parser.lazy_parse`` for the blob.
    """
    if self.web_path:
        # Read through a context manager so the file handle is closed
        # deterministically instead of lingering until garbage collection.
        with open(self.file_path, "rb") as pdf_file:
            data = pdf_file.read()
        blob = Blob.from_data(data, path=self.web_path)  # type: ignore[attr-defined]
    else:
        blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
    yield from self.parser.lazy_parse(blob)
0 commit comments