首先,手动下载一个软件(poppler for Windows),下载地址:https://github.com/oschwartz10612/poppler-windows/releases/tag/v24.08.0-0
否则会出现以下错误:
PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?
下载完成后解压,配置环境变量,并确认是否安装成功:
例如:解压到 C:\software\poppler-24.08.0;将 C:\software\poppler-24.08.0\Library\bin 添加到系统环境变量 Path 中;然后打开 cmd 命令行,输入 pdfinfo -v 验证(能输出版本信息即表示安装成功)。
完整代码:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
from pdf2image import convert_from_path
from PIL import Image
import numpy as np
def _content_box(gray_array, threshold):
    """Return the crop box of all pixels darker than *threshold*.

    The box is ``(left, upper, right, lower)`` with exclusive right/lower
    edges, i.e. the form expected by ``Image.crop``. Returns ``None`` when
    no pixel is below the threshold (a completely white page).
    """
    mask = gray_array < threshold
    # Project the mask onto each axis instead of materialising the
    # coordinates of every dark pixel; at 600 dpi that list is huge.
    rows = np.flatnonzero(mask.any(axis=1))
    cols = np.flatnonzero(mask.any(axis=0))
    if rows.size == 0:
        return None
    return int(cols[0]), int(rows[0]), int(cols[-1]) + 1, int(rows[-1]) + 1


def pdf_to_jpg(folder_path, output_path, dpi=600,
               poppler_path=r'C:\software\poppler-24.08.0\Library\bin',
               threshold=240):
    """Convert every PDF under *folder_path* to white-border-cropped JPEGs.

    The directory tree below *folder_path* is walked recursively and
    mirrored under *output_path*: ``<folder_path>/a/b.pdf`` is written to
    ``<output_path>/a/b.jpg``. A PDF with several pages produces
    ``b_1.jpg``, ``b_2.jpg``, ... so that pages no longer overwrite each
    other; a single-page PDF keeps the plain ``b.jpg`` name.

    Args:
        folder_path: Root folder that is searched for ``.pdf`` files
            (the extension is matched case-insensitively).
        output_path: Root folder for the JPEG files; created if missing.
        dpi: Rendering resolution passed to ``convert_from_path``.
        poppler_path: Folder containing the poppler binaries. If it is
            ``None`` or does not exist, poppler is looked up on PATH.
        threshold: Gray level (0-255) below which a pixel counts as
            content when computing the crop box.

    Returns:
        List of the JPEG file paths that were written, in creation order.
    """
    os.makedirs(output_path, exist_ok=True)

    # Fall back to the PATH lookup when the configured folder is absent,
    # so the function also works on machines with a different install dir.
    if poppler_path is not None and not os.path.isdir(poppler_path):
        poppler_path = None

    saved_files = []
    for root, _dirs, files in os.walk(folder_path):
        rel_dir = os.path.relpath(root, folder_path)
        target_dir = output_path if rel_dir == '.' else os.path.join(output_path, rel_dir)

        for file in files:
            stem, ext = os.path.splitext(file)
            if ext.lower() != '.pdf':
                continue

            os.makedirs(target_dir, exist_ok=True)
            images = convert_from_path(os.path.join(root, file),
                                       dpi=dpi,
                                       poppler_path=poppler_path)

            for page_no, image in enumerate(images, start=1):
                # 8-bit grayscale ("L" = luminance) is enough to tell
                # content from the white background.
                gray_array = np.array(image.convert("L"))
                box = _content_box(gray_array, threshold)
                # A blank page has no content box; keep it uncropped
                # instead of failing on an empty coordinate array.
                page_image = image if box is None else image.crop(box)

                if len(images) == 1:
                    jpg_name = f"{stem}.jpg"
                else:
                    jpg_name = f"{stem}_{page_no}.jpg"
                jpg_file = os.path.join(target_dir, jpg_name)
                page_image.save(jpg_file, 'JPEG')
                saved_files.append(jpg_file)
                print(f'Saved {jpg_file}')

    return saved_files
if __name__ == '__main__':
    # Folder holding the source PDF files and the folder that receives
    # the generated JPEG images.
    pdf_path = r'C:\datasets\D94_pdf'
    jpg_path = r'C:\datasets\D94_jpg'
    # Convert the PDFs to images.
    pdf_to_jpg(pdf_path, jpg_path)
参考链接:
python 去除图片白边_mob649e8167c4a3的技术博客_51CTO博客
Python学习笔记:PDF转图片 - Hider1214 - 博客园