| |
| """A command line tool for extracting text and images from PDF and |
| output it to plain text, html, xml or tags. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import logging |
| import sys |
| from string import Template |
| from typing import List, Optional |
|
|
| from pdf2zh import __version__, log |
| from pdf2zh.high_level import translate, download_remote_fonts |
| from pdf2zh.doclayout import OnnxModel, ModelInstance |
| import os |
|
|
| from pdf2zh.config import ConfigManager |
| from babeldoc.translation_config import TranslationConfig as YadtConfig |
| from babeldoc.high_level import async_translate as yadt_translate |
| from babeldoc.high_level import init as yadt_init |
| from babeldoc.main import create_progress_handler |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the pdf2zh command line.

    The parser takes zero or more positional ``files`` plus the general
    ``--version`` and ``--debug`` flags; every other option is registered in
    the "Parser" argument group.

    :return: a fully configured :class:`argparse.ArgumentParser`
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files",
        type=str,
        default=None,
        nargs="*",
        help="One or more paths to PDF files.",
    )
    parser.add_argument(
        "--version",
        "-v",
        action="version",
        version=f"pdf2zh v{__version__}",
    )
    parser.add_argument(
        "--debug",
        "-d",
        default=False,
        action="store_true",
        help="Use debug logging level.",
    )
    parse_params = parser.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )
    parse_params.add_argument(
        "--pages",
        "-p",
        type=str,
        help="The list of page numbers to parse.",
    )
    parse_params.add_argument(
        "--vfont",
        "-f",
        type=str,
        default="",
        help="The regex to match font name of formula.",
    )
    parse_params.add_argument(
        "--vchar",
        "-c",
        type=str,
        default="",
        help="The regex to match character of formula.",
    )
    parse_params.add_argument(
        "--lang-in",
        "-li",
        type=str,
        default="en",
        help="The code of source language.",
    )
    parse_params.add_argument(
        "--lang-out",
        "-lo",
        type=str,
        default="zh",
        help="The code of target language.",
    )
    parse_params.add_argument(
        "--service",
        "-s",
        type=str,
        default="google",
        help="The service to use for translation.",
    )
    parse_params.add_argument(
        "--output",
        "-o",
        type=str,
        default="",
        help="Output directory for files.",
    )
    parse_params.add_argument(
        "--thread",
        "-t",
        type=int,
        default=4,
        help="The number of threads to execute translation.",
    )
    parse_params.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interact with GUI.",
    )
    parse_params.add_argument(
        "--share",
        action="store_true",
        help="Enable Gradio Share",
    )
    parse_params.add_argument(
        "--flask",
        action="store_true",
        help="flask",
    )
    parse_params.add_argument(
        "--celery",
        action="store_true",
        help="celery",
    )
    parse_params.add_argument(
        "--authorized",
        type=str,
        nargs="+",
        help="user name and password.",
    )
    parse_params.add_argument(
        "--prompt",
        type=str,
        help="user custom prompt.",
    )
    parse_params.add_argument(
        "--compatible",
        "-cp",
        action="store_true",
        help="Convert the PDF file into PDF/A format to improve compatibility.",
    )
    parse_params.add_argument(
        "--onnx",
        type=str,
        help="custom onnx model path.",
    )
    parse_params.add_argument(
        "--serverport",
        type=int,
        help="custom WebUI port.",
    )
    parse_params.add_argument(
        "--dir",
        action="store_true",
        help="translate directory.",
    )
    parse_params.add_argument(
        "--config",
        type=str,
        help="config file.",
    )
    parse_params.add_argument(
        "--babeldoc",
        default=False,
        action="store_true",
        help="Use experimental backend babeldoc.",
    )
    parse_params.add_argument(
        "--skip-subset-fonts",
        action="store_true",
        help="Skip font subsetting. "
        "This option can improve compatibility "
        "but will increase the size of the output file.",
    )
    parse_params.add_argument(
        "--ignore-cache",
        action="store_true",
        help="Ignore cache and force retranslation.",
    )
    return parser
|
|
|
|
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse command line arguments and expand the ``--pages`` option.

    When ``--pages`` is given, its comma-separated entries (single 1-based
    page numbers or ``start-end`` ranges) are expanded into a list of 0-based
    page indices stored in ``pages``; the untouched original string is kept
    in ``raw_pages``. Without ``--pages`` the namespace is returned as parsed
    and no ``raw_pages`` attribute is added.

    :param args: argument list to parse, or ``None`` to let argparse read
        ``sys.argv``
    :return: the parsed namespace
    :raises ValueError: if a page entry is not an integer or a two-part range
    """
    namespace = create_parser().parse_args(args=args)

    page_spec = namespace.pages
    if not page_spec:
        return namespace

    zero_based: List[int] = []
    for token in page_spec.split(","):
        if "-" not in token:
            zero_based.append(int(token) - 1)
            continue
        # "a-b" is inclusive on both ends and 1-based on the command line.
        first, last = token.split("-")
        zero_based.extend(range(int(first) - 1, int(last)))

    namespace.raw_pages = page_spec
    namespace.pages = zero_based
    return namespace
|
|
|
|
def find_all_files_in_directory(directory_path):
    """
    Walk a directory tree and collect the path of every PDF file inside it.

    A file counts as a PDF when its name ends with ".pdf", compared
    case-insensitively.

    :param directory_path: str, the root directory to walk
    :return: list of PDF file paths, in the order ``os.walk`` yields them
    :raises ValueError: if ``directory_path`` is not an existing directory
    """
    if os.path.isdir(directory_path):
        return [
            os.path.join(parent, name)
            for parent, _, names in os.walk(directory_path)
            for name in names
            if name.lower().endswith(".pdf")
        ]
    raise ValueError(f"The provided path '{directory_path}' is not a directory.")
|
|
|
|
def main(args: Optional[List[str]] = None) -> int:
    """Run the pdf2zh command line interface.

    Configures logging, parses the arguments, loads the layout model and then
    dispatches to exactly one mode: the GUI (``--interactive``), the Flask
    backend (``--flask``), the Celery worker (``--celery``), the babeldoc
    backend (``--babeldoc``) or the default ``translate`` pipeline.

    :param args: argument list to parse, or ``None`` to let argparse read
        ``sys.argv``
    :return: process exit status (0, or whatever ``yadt_main`` returns)
    :raises ValueError: if the ``--prompt`` file cannot be read, or if
        ``--dir`` is given a path that is not a directory
    """
    from rich.logging import RichHandler

    logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])

    # Only let CRITICAL records through for these client libraries and stop
    # them from propagating to the root handler.
    for noisy_name in ("httpx", "openai", "httpcore", "http11"):
        noisy_logger = logging.getLogger(noisy_name)
        noisy_logger.setLevel("CRITICAL")
        noisy_logger.propagate = False

    parsed_args = parse_args(args)

    if parsed_args.config:
        ConfigManager.custome_config(parsed_args.config)

    if parsed_args.debug:
        log.setLevel(logging.DEBUG)

    if parsed_args.onnx:
        ModelInstance.value = OnnxModel(parsed_args.onnx)
    else:
        ModelInstance.value = OnnxModel.load_available()

    if parsed_args.interactive:
        from pdf2zh.gui import setup_gui

        if parsed_args.serverport:
            setup_gui(
                parsed_args.share, parsed_args.authorized, int(parsed_args.serverport)
            )
        else:
            setup_gui(parsed_args.share, parsed_args.authorized)
        return 0

    if parsed_args.flask:
        from pdf2zh.backend import flask_app

        flask_app.run(port=11008)
        return 0

    if parsed_args.celery:
        from pdf2zh.backend import celery_app

        celery_app.start(argv=sys.argv[2:])
        return 0

    # yadt_main reads the prompt file itself. Replacing the path with a
    # Template here made it try to open() the Template object, so every
    # "--babeldoc --prompt" run failed with "prompt error.". Only load the
    # template for the default backend.
    if parsed_args.prompt and not parsed_args.babeldoc:
        try:
            with open(parsed_args.prompt, "r", encoding="utf-8") as file:
                content = file.read()
            parsed_args.prompt = Template(content)
        except Exception as err:
            # Chain the cause so the real I/O or decoding error stays visible.
            raise ValueError("prompt error.") from err

    print(parsed_args)
    if parsed_args.babeldoc:
        return yadt_main(parsed_args)

    if parsed_args.dir:
        # With --dir the first positional argument is the directory to scan.
        parsed_args.files = find_all_files_in_directory(parsed_args.files[0])

    translate(model=ModelInstance.value, **vars(parsed_args))
    return 0
|
|
|
|
def yadt_main(parsed_args) -> int:
    """Translate the requested PDF files with the babeldoc backend.

    Resolves the input files (scanning a directory when ``--dir`` is set),
    initialises babeldoc, downloads the font for the target language, builds
    the translator named by ``--service`` (``name`` or ``name:model``) and
    then runs one babeldoc translation per file.

    :param parsed_args: namespace produced by ``parse_args``
    :return: 0 once every file has been processed
    :raises ValueError: if the prompt file cannot be read, if ``--service``
        names no known translator, or if ``--dir`` is given a path that is
        not a directory
    """
    import asyncio

    if parsed_args.dir:
        untranlate_file = find_all_files_in_directory(parsed_args.files[0])
    else:
        untranlate_file = parsed_args.files
    lang_in = parsed_args.lang_in
    lang_out = parsed_args.lang_out
    ignore_cache = parsed_args.ignore_cache
    outputdir = None
    if parsed_args.output:
        outputdir = parsed_args.output

    yadt_init()
    font_path = download_remote_fonts(lang_out.lower())

    # "--service name:model" selects a specific model of the service.
    param = parsed_args.service.split(":", 1)
    service_name = param[0]
    service_model = param[1] if len(param) > 1 else None

    envs = {}
    prompt = []

    if parsed_args.prompt:
        if isinstance(parsed_args.prompt, Template):
            # The caller already loaded the prompt file; calling open() on a
            # Template would raise and be reported as "prompt error.".
            prompt = parsed_args.prompt
        else:
            try:
                with open(parsed_args.prompt, "r", encoding="utf-8") as file:
                    content = file.read()
                prompt = Template(content)
            except Exception as err:
                # Chain the cause so the real I/O or decoding error is kept.
                raise ValueError("prompt error.") from err

    from pdf2zh.translator import (
        AzureOpenAITranslator,
        GoogleTranslator,
        BingTranslator,
        DeepLTranslator,
        DeepLXTranslator,
        OllamaTranslator,
        OpenAITranslator,
        ZhipuTranslator,
        ModelScopeTranslator,
        SiliconTranslator,
        GeminiTranslator,
        AzureTranslator,
        TencentTranslator,
        DifyTranslator,
        AnythingLLMTranslator,
        XinferenceTranslator,
        ArgosTranslator,
        GrokTranslator,
        GroqTranslator,
        DeepseekTranslator,
        OpenAIlikedTranslator,
        QwenMtTranslator,
    )

    translator_classes = [
        GoogleTranslator,
        BingTranslator,
        DeepLTranslator,
        DeepLXTranslator,
        OllamaTranslator,
        XinferenceTranslator,
        AzureOpenAITranslator,
        OpenAITranslator,
        ZhipuTranslator,
        ModelScopeTranslator,
        SiliconTranslator,
        GeminiTranslator,
        AzureTranslator,
        TencentTranslator,
        DifyTranslator,
        AnythingLLMTranslator,
        ArgosTranslator,
        GrokTranslator,
        GroqTranslator,
        DeepseekTranslator,
        OpenAIlikedTranslator,
        QwenMtTranslator,
    ]
    # Instantiate the first translator whose ``name`` matches --service.
    for translator_cls in translator_classes:
        if service_name == translator_cls.name:
            translator = translator_cls(
                lang_in,
                lang_out,
                service_model,
                envs=envs,
                prompt=prompt,
                ignore_cache=ignore_cache,
            )
            break
    else:
        raise ValueError("Unsupported translation service")

    # parse_args keeps the user's original --pages string in ``raw_pages``.
    # Pass it through unchanged: joining it with "," iterates the string one
    # character at a time ("1-3" became "1,-,3" and "12" became "1,2").
    pages = getattr(parsed_args, "raw_pages", "") or ""

    async def yadt_translate_coro(yadt_config):
        """Run one babeldoc translation and log its result when it finishes."""
        progress_context, progress_handler = create_progress_handler(yadt_config)

        with progress_context:
            async for event in yadt_translate(yadt_config):
                progress_handler(event)
                if yadt_config.debug:
                    logger.debug(event)
                if event["type"] == "finish":
                    result = event["translate_result"]
                    logger.info("Translation Result:")
                    logger.info(f" Original PDF: {result.original_pdf_path}")
                    logger.info(f" Time Cost: {result.total_seconds:.2f}s")
                    logger.info(f" Mono PDF: {result.mono_pdf_path or 'None'}")
                    logger.info(f" Dual PDF: {result.dual_pdf_path or 'None'}")
                    break

    for file in untranlate_file:
        # Drop quote characters that may surround the path.
        file = file.strip("\"'")
        yadt_config = YadtConfig(
            input_file=file,
            font=font_path,
            pages=pages,
            output_dir=outputdir,
            doc_layout_model=None,
            translator=translator,
            debug=parsed_args.debug,
            lang_in=lang_in,
            lang_out=lang_out,
            no_dual=False,
            no_mono=False,
            qps=parsed_args.thread,
        )
        asyncio.run(yadt_translate_coro(yadt_config))
    return 0
|
|
|
|
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|
|