准备环境
python3 -m pip install huggingface_hub
python3 -m pip install modelscope
python3 -m pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
下载需要的模型
import json
import os
import requests
from huggingface_hub import snapshot_download
def download_json(url, timeout=30):
    """Fetch *url* over HTTP and return the decoded JSON body.

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled
            connection, which would hang the whole setup script.

    Returns:
        The parsed JSON payload, as returned by ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()
def _version_tuple(version):
    """Turn a dotted version value into a tuple of ints for numeric comparison.

    Each dot-separated piece contributes its leading run of ASCII digits
    (0 when it has none). The result is padded with zeros to at least three
    components so that '1.0' and '1.0.0' compare equal.
    """
    numbers = []
    for piece in str(version).strip().split('.'):
        digits = ''
        for ch in piece:
            if ch not in '0123456789':
                break
            digits += ch
        numbers.append(int(digits) if digits else 0)
    numbers.extend([0] * (3 - len(numbers)))
    return tuple(numbers)


def download_and_modify_json(url, local_filename, modifications):
    """Create or refresh a local JSON config file and apply overrides to it.

    If *local_filename* exists and its 'config_version' is at least 1.0.0,
    the existing content is kept. Otherwise (file missing, or version older
    than 1.0.0 / absent) the template is downloaded from *url*. In both
    cases every key in *modifications* is then set on the data and the
    result is written back to *local_filename* as UTF-8 JSON.

    Args:
        url: Location of the JSON template to download when needed.
        local_filename: Path of the config file to read and (over)write.
        modifications: Mapping of top-level keys to the values to store.
    """
    data = None
    if os.path.exists(local_filename):
        # Read with the same encoding used for writing below, and close
        # the handle deterministically.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
        # Compare versions numerically; plain string comparison orders
        # versions lexicographically rather than by value.
        if _version_tuple(data.get('config_version', '0.0.0')) < (1, 0, 0):
            data = None  # outdated config: replace it with a fresh template

    if data is None:
        data = download_json(url)

    data.update(modifications)

    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
def main():
    """Download the model snapshots and write the magic-pdf config file.

    Steps:
      1. Download the files matching the listed patterns from the
         'opendatalab/PDF-Extract-Kit-1.0' repository.
      2. Download the '*.json' and '*.safetensors' files from the
         'hantian/layoutreader' repository.
      3. Create or update 'magic-pdf.json' in the user's home directory so
         that 'models-dir' and 'layoutreader-model-dir' point at the
         downloaded snapshot directories.
    """
    mineru_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    snapshot_dir = snapshot_download(
        'opendatalab/PDF-Extract-Kit-1.0',
        allow_patterns=mineru_patterns,
    )

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download(
        'hantian/layoutreader',
        allow_patterns=layoutreader_pattern,
    )

    # All requested patterns live under the 'models' sub-directory of the
    # snapshot; build the path with the platform's separator.
    model_dir = os.path.join(snapshot_dir, 'models')
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }
    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')


if __name__ == '__main__':
    main()
测试
wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
GPU加速
vim ~/magic-pdf.json
"device-mode": "cpu" -> "device-mode": "cuda",