Replace PyPDF2 with pypdf for PDF processing

- Update import from PyPDF2 to pypdf
- Change dependency to pypdf>=6.1.0
- Update all requirements files
- Remove PyPDF2 from lock file
- Use modern pypdf library
This commit is contained in:
yangdx
2025-11-11 01:38:09 +08:00
parent af5423919b
commit c434879c7a
5 changed files with 9 additions and 18 deletions

View File

@@ -1081,11 +1081,11 @@ async def pipeline_enqueue_file(
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
-if not pm.is_installed("pypdf2"): # type: ignore
-pm.install("pypdf2")
+if not pm.is_installed("pypdf"): # type: ignore
+pm.install("pypdf")
if not pm.is_installed("pycryptodome"): # type: ignore
pm.install("pycryptodome")
-from PyPDF2 import PdfReader # type: ignore
+from pypdf import PdfReader # type: ignore
from io import BytesIO
pdf_file = BytesIO(file)

View File

@@ -86,7 +86,7 @@ offline-docs = [
# Document processing dependencies
"openpyxl>=3.0.0,<4.0.0",
"pycryptodome>=3.0.0,<4.0.0",
"pypdf2>=3.0.0",
"pypdf>=6.1.0",
"python-docx>=0.8.11,<2.0.0",
"python-pptx>=0.6.21,<2.0.0",
]

View File

@@ -10,6 +10,6 @@
# Document processing dependencies (with version constraints matching pyproject.toml)
openpyxl>=3.0.0,<4.0.0
pycryptodome>=3.0.0,<4.0.0
-pypdf2>=3.0.0
+pypdf>=6.1.0
python-docx>=0.8.11,<2.0.0
python-pptx>=0.6.21,<2.0.0

View File

@@ -24,7 +24,7 @@ openpyxl>=3.0.0,<4.0.0
pycryptodome>=3.0.0,<4.0.0
pymilvus>=2.6.2,<3.0.0
pymongo>=4.0.0,<5.0.0
-pypdf2>=3.0.0
+pypdf>=6.1.0
python-docx>=0.8.11,<2.0.0
python-pptx>=0.6.21,<2.0.0
qdrant-client>=1.11.0,<2.0.0

15
uv.lock generated
View File

@@ -1981,7 +1981,7 @@ offline = [
{ name = "pycryptodome" },
{ name = "pymilvus" },
{ name = "pymongo" },
-{ name = "pypdf2" },
+{ name = "pypdf" },
{ name = "python-docx" },
{ name = "python-pptx" },
{ name = "qdrant-client" },
@@ -1992,7 +1992,7 @@ offline = [
offline-docs = [
{ name = "openpyxl" },
{ name = "pycryptodome" },
-{ name = "pypdf2" },
+{ name = "pypdf" },
{ name = "python-docx" },
{ name = "python-pptx" },
]
@@ -2071,7 +2071,7 @@ requires-dist = [
{ name = "pyjwt", marker = "extra == 'api'", specifier = ">=2.8.0,<3.0.0" },
{ name = "pymilvus", marker = "extra == 'offline-storage'", specifier = ">=2.6.2,<3.0.0" },
{ name = "pymongo", marker = "extra == 'offline-storage'", specifier = ">=4.0.0,<5.0.0" },
-{ name = "pypdf2", marker = "extra == 'offline-docs'", specifier = ">=3.0.0" },
+{ name = "pypdf", marker = "extra == 'offline-docs'", specifier = ">=6.1.0" },
{ name = "pypinyin" },
{ name = "pypinyin", marker = "extra == 'api'" },
{ name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
@@ -3977,15 +3977,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/fa/ed/494fd0cc1190a7c335e6958eeaee6f373a281869830255c2ed4785dac135/pypdf-6.1.3-py3-none-any.whl", hash = "sha256:eb049195e46f014fc155f566fa20e09d70d4646a9891164ac25fa0cbcfcdbcb5", size = 323863, upload-time = "2025-10-22T16:13:44.174Z" },
]
-[[package]]
-name = "pypdf2"
-version = "3.0.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" }
-wheels = [
-{ url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" },
-]
[[package]]
name = "pypinyin"
version = "0.55.0"