Spaces:

gitdeem
/

dt

Running

App Files Files Community

dt / app /translate /pdf.py

gitdeem

Upload 96 files

4e9efe9 verified 4 months ago

raw

history blame contribute delete

29.2 kB

	import platform
	import tempfile
	import threading
	import traceback

	import fitz
	import re
	from . import to_translate
	from . import common
	import io
	import sys
	import time
	import datetime
	from docx import Document
	from docx.shared import Pt, RGBColor
	# import pdfkit
	import subprocess
	import base64
	import pdf2docx
	from . import word
	import copy
	from io import BytesIO
	from PIL import Image,ImageDraw
	import pytesseract
	import uuid
	from pdfdeal import Doc2X
	# from weasyprint import HTML
	import os
	from docx2pdf import convert
	import shutil
	# Point pytesseract at the Tesseract executable expected at /usr/local/bin/tesseract.
	pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'


	# -----word转pdf
	def docxtopdf(docx_path, pdf_path):
	    """Convert a DOCX file to PDF with the converter available on this OS.

	    On Windows, Microsoft Word is driven through COM (pywin32); on every
	    other platform the ``unoconv`` command-line tool is executed.

	    Args:
	        docx_path: Path of the source ``.docx`` file.
	        pdf_path: Path of the PDF to produce. Its parent directory is
	            created when it does not exist yet.

	    Raises:
	        Exception: On non-Windows systems when ``unoconv`` cannot be found.

	    A failing conversion is reported on stdout and not raised.
	    """
	    target_path_dir = os.path.dirname(pdf_path)
	    # An empty dirname means "current directory"; os.makedirs("") would raise.
	    if target_path_dir and not os.path.exists(target_path_dir):
	        os.makedirs(target_path_dir, mode=0o777, exist_ok=True)

	    if platform.system() == "Windows":
	        # Windows: automate Microsoft Word through pywin32.
	        try:
	            import win32com.client
	            word_app = win32com.client.Dispatch("Word.Application")
	            try:
	                word_app.Visible = False  # keep the Word window hidden
	                doc = word_app.Documents.Open(docx_path)
	                try:
	                    doc.SaveAs(pdf_path, FileFormat=17)  # 17 selects PDF output
	                finally:
	                    doc.Close()
	            finally:
	                # Always shut Word down, even when opening or saving failed.
	                word_app.Quit()
	            print("转换成功！")
	        except Exception as e:
	            print(f"Windows 方案转换失败: {e}")
	    else:
	        # Linux/macOS: shell out to unoconv. shutil.which() searches PATH
	        # (sys.path only affects imports), so /usr/local/bin is probed
	        # explicitly as a fallback location.
	        unoconv_path = shutil.which("unoconv") or shutil.which("unoconv", path="/usr/local/bin")
	        if unoconv_path is None:
	            raise Exception("未安装 unoconv，请先安装 unoconv 或 LibreOffice")
	        command = [unoconv_path, "-f", "pdf", "-o", pdf_path, docx_path]
	        print("{} -f pdf -o {} {}".format(unoconv_path, pdf_path, docx_path))
	        try:
	            # check=True makes a non-zero exit status raise CalledProcessError,
	            # so success is only reported when unoconv actually succeeded.
	            subprocess.run(command, check=True)
	            print("转换成功！")
	        except subprocess.CalledProcessError as e:
	            print(f"Linux 方案转换失败: {e}")

	def start11(trans):
	    """Legacy PDF translation entry point: Word round-trip with an HTML fallback.

	    The PDF is converted to DOCX and handed to ``word.start``. When that
	    succeeds, the translated DOCX is converted back to PDF and moved to
	    ``trans['target_file']``. Otherwise the embedded images and the
	    ``pdftohtml`` output are collected as text items, each item is passed to
	    a ``to_translate.get`` thread, and the items are written to an HTML file.

	    Args:
	        trans: Job dict. Keys read here: ``file_path``, ``uuid``,
	            ``target_file``, ``storage_path`` and ``threads``; ``html_path``
	            is written in the fallback path.

	    Returns:
	        True when the job completed, False when the ``threading.Event``
	        shared with the worker threads was set.
	    """
	    texts = []
	    src_pdf = fitz.open(trans['file_path'])
	    try:
	        start_time = datetime.datetime.now()
	        work_dir = os.path.dirname(trans['file_path'])
	        origin_docx_path = work_dir + "/" + trans['uuid'] + ".docx"
	        target_docx_path = work_dir + "/" + trans['uuid'] + "-translated.docx"
	        target_pdf_path = work_dir + "/" + trans['uuid'] + ".pdf"

	        pdftodocx(trans['file_path'], origin_docx_path)
	        # The Word translator works on a shallow copy so the PDF job dict
	        # keeps its own paths.
	        word_trans = copy.copy(trans)
	        word_trans['file_path'] = origin_docx_path
	        word_trans['target_file'] = target_docx_path
	        word_trans['run_complete'] = False
	        word_trans['extension'] = '.docx'
	        text_count = 0

	        if word.start(word_trans):
	            docxtopdf(target_docx_path, target_pdf_path)
	            shutil.move(target_pdf_path, trans['target_file'])
	            end_time = datetime.datetime.now()
	            spend_time = common.display_spend(start_time, end_time)
	            to_translate.complete(trans, text_count, spend_time)
	            return True

	        # Fallback: collect images and pdftohtml output as text items.
	        job_uuid = trans['uuid']  # local name chosen so the uuid module is not shadowed
	        html_path = trans['storage_path'] + '/uploads/' + job_uuid
	        trans['html_path'] = html_path

	        # Maximum number of worker threads. Zero is treated like "unset":
	        # with a limit of 0 the dispatch loop below could never start a thread.
	        threads = trans['threads']
	        if threads is None or int(threads) <= 0:
	            max_threads = 10
	        else:
	            max_threads = int(threads)

	        start_time = datetime.datetime.now()
	        read_page_images(src_pdf, texts)
	        text_count = 0
	        pdftohtml(trans['file_path'], html_path, texts)
	    finally:
	        # Release the PDF handle on every path, including the early return.
	        src_pdf.close()

	    max_run = min(max_threads, len(texts))
	    event = threading.Event()
	    before_active_count = threading.active_count()
	    run_index = 0  # index of the next text item to dispatch
	    while run_index < len(texts):
	        if threading.active_count() < max_run + before_active_count:
	            if event.is_set():
	                return False
	            worker = threading.Thread(target=to_translate.get, args=(trans, event, texts, run_index))
	            worker.start()
	            run_index += 1
	        else:
	            # At capacity: yield briefly instead of spinning on the CPU.
	            time.sleep(0.01)

	    # Poll until every item is flagged complete or the event is set.
	    while True:
	        if event.is_set():
	            return False
	        if all(text['complete'] for text in texts):
	            break
	        time.sleep(1)

	    write_to_html_file(html_path, texts)

	    end_time = datetime.datetime.now()
	    spend_time = common.display_spend(start_time, end_time)
	    to_translate.complete(trans, text_count, spend_time)
	    return True
	# ----------------------

	# Same assignment as the one made right after the imports; it sets the identical path again.
	pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'


	def start(trans):
	    """Translate a PDF by round-tripping it through Word.

	    Steps: convert the PDF to DOCX, run ``word.start`` on a copy of the job
	    that points at that DOCX, convert the translated DOCX back to PDF next
	    to the source file, move it to ``trans['target_file']`` and report
	    completion through ``to_translate.complete``.

	    Args:
	        trans: Job dict. Keys read here: ``file_path``, ``uuid`` and
	            ``target_file``.

	    Returns:
	        True when ``word.start`` succeeded and the PDF was delivered,
	        False when ``word.start`` returned a falsy value.
	    """
	    start_time = datetime.datetime.now()
	    work_dir = os.path.dirname(trans['file_path'])
	    origin_docx_path = os.path.join(work_dir, f"{trans['uuid']}.docx")
	    target_docx_path = os.path.join(work_dir, f"{trans['uuid']}-translated.docx")
	    origin_pdf_path = trans['file_path']  # source PDF
	    target_pdf_path = trans['target_file']  # final PDF location
	    # Intermediate PDF, produced next to the source before being moved.
	    target_pdf_path1 = os.path.join(work_dir, trans['uuid'] + ".pdf")

	    # PDF -> Word first.
	    pdftodocx(origin_pdf_path, origin_docx_path)
	    # The Word translator works on a shallow copy so the PDF job dict keeps
	    # its own paths.
	    word_trans = copy.copy(trans)
	    word_trans['file_path'] = origin_docx_path
	    word_trans['target_file'] = target_docx_path
	    word_trans['run_complete'] = False
	    word_trans['extension'] = '.docx'
	    text_count = 0

	    if not word.start(word_trans):
	        return False

	    docxtopdf(target_docx_path, target_pdf_path1)
	    shutil.move(target_pdf_path1, target_pdf_path)
	    end_time = datetime.datetime.now()
	    spend_time = common.display_spend(start_time, end_time)
	    print('pdf参数', trans)
	    to_translate.complete(trans, text_count, spend_time)
	    return True
	    # The HTML fallback that used to follow here was unreachable (both
	    # branches above return) and referenced names that are not defined in
	    # this function, so it has been removed.

	# -------------------------------
	# def read_to_html(pages):

	def read_page_html(pages, texts, trans):
	    """Collect per-page XHTML, or its embedded images, into ``texts``.

	    When ``is_scan_pdf(pages)`` is true, every base64 ``data:image`` URI
	    found in a page's XHTML is appended as an ``'image'`` item; otherwise the
	    page's whole XHTML is appended as a ``'text'`` item.

	    Args:
	        pages: Iterable of page objects exposing ``get_text``.
	        texts: List that receives the items through ``append_text``.
	        trans: Job dict; ``storage_path`` and ``uuid`` must be present.
	    """
	    # Both keys are looked up (so they must exist) although the values are unused.
	    _storage_path = trans['storage_path']
	    _job_uuid = trans['uuid']

	    scanned = is_scan_pdf(pages)
	    for page in pages:
	        markup = page.get_text("xhtml")
	        if scanned:
	            for data_uri in re.findall(r"(data:image/\w+;base64,[^\"]+)", markup):
	                append_text(data_uri, 'image', texts)
	        else:
	            append_text(markup, 'text', texts)

	def read_page_images(pages, texts):
	    """Append every base64 ``data:image`` URI found in the pages' XHTML.

	    Args:
	        pages: Iterable of page objects exposing ``get_text``.
	        texts: List that receives one ``'image'`` item per URI through
	            ``append_text``.
	    """
	    image_pattern = re.compile(r"(data:image/\w+;base64,[^\"]+)")
	    for page in pages:
	        for data_uri in image_pattern.findall(page.get_text("xhtml")):
	            append_text(data_uri, 'image', texts)

	def write_to_html_file(html_path, texts):
	    """Write the ``text`` of every item into one XHTML document.

	    Args:
	        html_path: Destination file; overwritten when it exists.
	        texts: Iterable of dicts. Each item's ``"text"`` value is written
	            verbatim in order; items without that key contribute nothing.
	    """
	    # The header declares charset utf-8, so the file is written as UTF-8
	    # regardless of the platform's default encoding.
	    with open(html_path, 'w', encoding='utf-8') as f:
	        f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1"></head><body>')
	        for item in texts:
	            f.write(item.get("text", ""))
	        f.write('</body></html>')

	def read_block_text(pages, texts):
	    """Merge page text blocks into paragraph strings and append them to ``texts``.

	    A block starts a new paragraph when its index 5 entry is 0 or when its
	    left/right edge moved more than 12 units relative to the previous
	    block; otherwise its text is concatenated to the running paragraph.
	    Newlines inside a block are removed.

	    Args:
	        pages: Iterable of page objects exposing ``get_text("blocks")``.
	        texts: List that receives the paragraphs through ``append_text``.
	    """
	    # The former debug dump to "test.html" followed by exit() terminated the
	    # process before any block was read; it has been removed.
	    text = ""
	    for page in pages:
	        last_x0 = 0
	        last_x1 = 0
	        for block in page.get_text("blocks"):
	            current_x0 = block[0]
	            current_x1 = block[2]
	            block_text = block[4].replace("\n", "")
	            if block[5] == 0 or abs(current_x1 - last_x1) > 12 or abs(current_x0 - last_x0) > 12:
	                # Paragraph boundary: emit what was accumulated so far.
	                append_text(text, "text", texts)
	                text = block_text
	            else:
	                text = text + block_text
	            last_x1 = current_x1
	            last_x0 = current_x0
	    # Flush the paragraph that was still being accumulated.
	    append_text(text, "text", texts)

	def write_block_text(pages, newpdf, texts):
	    """Write items from ``texts`` onto new pages at paragraph positions.

	    Blocks are grouped into paragraphs with the same rule as
	    ``read_block_text``: a new paragraph starts when the block's index 5
	    entry is 0 or its left/right edge moved more than 12 units. At each
	    boundary, and once more at the end, one item is popped from the front
	    of ``texts`` and inserted at the origin of the previous block. This
	    happens only when the paragraph accumulated so far passes
	    ``check_text`` and ``texts`` still has items.

	    Args:
	        pages: Iterable of source pages exposing ``rect`` and
	            ``get_text("blocks")``.
	        newpdf: Target document; one page is created per source page.
	        texts: List of dicts with a ``"text"`` entry; consumed from the front.
	    """
	    text = ""
	    new_page = None
	    last_x0 = 0
	    last_y0 = 0
	    for page in pages:
	        last_x0 = 0
	        last_x1 = 0
	        last_y0 = 0
	        new_page = newpdf.new_page(width=page.rect.width, height=page.rect.height)
	        for block in page.get_text("blocks"):
	            current_x0 = block[0]
	            current_y0 = block[1]
	            current_x1 = block[2]
	            block_text = block[4].replace("\n", "")
	            starts_paragraph = (
	                block[5] == 0
	                or abs(current_x1 - last_x1) > 12
	                or abs(current_x0 - last_x0) > 12
	            )
	            if starts_paragraph:
	                # Only consume an item for a paragraph that the reader would
	                # have emitted, and never pop from an empty list.
	                if texts and check_text(text):
	                    item = texts.pop(0)
	                    new_page.insert_text((last_x0, last_y0), item.get("text", ""), fontsize=12, fontname="Helvetica", overlay=False)
	                text = block_text
	            else:
	                text = text + block_text
	            last_x1 = current_x1
	            last_x0 = current_x0
	            last_y0 = current_y0
	    # Flush the last paragraph with its own item rather than reusing the
	    # one written for the previous paragraph.
	    if new_page is not None and texts and check_text(text):
	        item = texts.pop(0)
	        new_page.insert_text((last_x0, last_y0), item.get("text", ""), fontsize=12, overlay=False)

	def write_page_text(pages, newpdf, texts):
	    """Create one new page per source page and write the next item onto it.

	    For every source page a page of the same size is added to ``newpdf``.
	    When the source page's plain text passes ``check_text`` and ``texts`` is
	    not empty, the first item is popped and its ``"text"`` is inserted at
	    point (0, 0) with font size 12.

	    Args:
	        pages: Iterable of source pages exposing ``rect`` and ``get_text``.
	        newpdf: Target document exposing ``new_page``.
	        texts: List of dicts; consumed from the front.
	    """
	    for page in pages:
	        extracted = page.get_text("text")
	        blank_page = newpdf.new_page(width=page.rect.width, height=page.rect.height)
	        if not (check_text(extracted) and len(texts) > 0):
	            continue
	        translated = texts.pop(0).get("text", "")
	        blank_page.insert_text((0, 0), translated, fontsize=12, overlay=False)

	def read_row(pages, texts):
	    """Concatenate page text blocks and append the results to ``texts``.

	    A block whose index 5 entry is 0 starts a new run: the text gathered so
	    far is handed to ``append_text`` and accumulation restarts with that
	    block. Any other block is concatenated to the running text.

	    Args:
	        pages: Iterable of page objects exposing ``get_text("blocks")``.
	        texts: List that receives the runs through ``append_text``.
	    """
	    text = ""
	    for page in pages:
	        # Walk the page's text blocks.
	        for block in page.get_text("blocks"):
	            if block[5] == 0:
	                append_text(text, 'text', texts)
	                text = block[4]
	            else:
	                text = text + block[4]
	    # Flush the final run; without this the last accumulated text was
	    # silently dropped (read_block_text flushes the same way).
	    append_text(text, 'text', texts)

	def write_row(newpdf, texts, page_width, page_height):
	    """Add one page to ``newpdf`` and write every item at its block origin.

	    Args:
	        newpdf: Target document exposing ``new_page``.
	        texts: Iterable of dicts with ``"text"`` and ``"block"`` entries; the
	            first two values of ``"block"`` are used as the insertion point.
	        page_width: Width of the page to create.
	        page_height: Height of the page to create.

	    Each item's text is also printed to stdout.
	    """
	    canvas = newpdf.new_page(width=page_width, height=page_height)
	    for entry in texts:
	        print(entry['text'])
	        anchor = (entry['block'][0], entry['block'][1])
	        canvas.insert_text(anchor, entry['text'], fontsize=16)
	    return



	def append_text(text, content_type, texts):
	    """Append ``text`` to ``texts`` as a pending item when it passes ``check_text``.

	    Args:
	        text: Candidate content.
	        content_type: Value stored under the item's ``"type"`` key.
	        texts: List that receives ``{"text", "type", "complete": False}``.
	    """
	    if not check_text(text):
	        return
	    texts.append(dict(text=text, type=content_type, complete=False))


	def check_text(text):
	    """Return True when ``text`` is non-empty and not rejected by ``common.is_all_punc``.

	    Args:
	        text: Value to test; ``None`` and empty values yield False.
	    """
	    # Identity comparison is the idiomatic (and override-proof) None test.
	    return text is not None and len(text) > 0 and not common.is_all_punc(text)

	def draw_text_avoid_overlap(page, text, x, y, font_size):
	    """Insert ``text`` at (x, y), moving down until the target box is empty.

	    The box width is estimated as ``len(text) * font_size``. While
	    ``page.get_textbox`` reports existing text inside the box, ``y`` is
	    advanced by ``font_size + 1`` and the check is repeated.
	    """
	    estimated_width = len(text) * font_size
	    while page.get_textbox((x, y, x + estimated_width, y + font_size)):
	        # Something is already drawn here: try the next row down.
	        y += font_size + 1
	    page.insert_text((x, y), text, fontsize=font_size)


	def draw_table(page, table_data, x, y, width, cell_height):
	    """Draw ``table_data`` as a grid of equally wide cells with their text.

	    Args:
	        page: Target page exposing ``insert_text`` and ``draw_line``.
	        table_data: List of rows; every row is indexed up to the length of
	            the first row. An empty table draws nothing.
	        x: Left edge of the table.
	        y: Top edge of the table.
	        width: Total table width, split evenly between the columns.
	        cell_height: Height of every row.
	    """
	    if not table_data or not table_data[0]:
	        return

	    rows = len(table_data)
	    cols = len(table_data[0])
	    cell_width = width / cols
	    origin_x = x

	    for i, row in enumerate(table_data):
	        # Every row starts at the table's left edge, not at x = 0.
	        x = origin_x
	        for j in range(cols):
	            page.insert_text((x, y), row[j])
	            # Top and left edges are drawn for every cell so the inner
	            # grid lines exist on all rows.
	            page.draw_line((x, y), (x + cell_width, y), width=0.5)
	            page.draw_line((x, y), (x, y + cell_height), width=0.5)
	            # The right edge closes the last column.
	            if j == cols - 1:
	                page.draw_line((x + cell_width, y), (x + cell_width, y + cell_height), width=0.5)
	            # The bottom edge closes the last row.
	            if i == rows - 1:
	                page.draw_line((x, y + cell_height), (x + cell_width, y + cell_height), width=0.5)
	            x += cell_width
	        y += cell_height

	def wrap_text(text, width):
	    """Split ``text`` on single spaces and regroup the words into lines.

	    The current line is closed once splitting it on spaces yields at least
	    ``width`` parts. A word is added to the current line when the resulting
	    length stays within ``width * len(word)``; otherwise the line is closed
	    and the word starts a new one. Every word keeps a trailing space.

	    Returns:
	        The list of lines in order.
	    """
	    wrapped = []
	    current = ""
	    for token in text.split(' '):
	        if len(current.split(' ')) >= width:
	            wrapped.append(current)
	            current = ""
	        candidate = current + token + ' '
	        if len(candidate) <= width * len(token):
	            current = candidate
	        else:
	            wrapped.append(current)
	            current = token + ' '
	    if current:
	        wrapped.append(current)
	    return wrapped


	def is_paragraph(block):
	    """Return True when ``block`` has at least two lines of clearly different height.

	    Args:
	        block: Sequence of line objects exposing a ``height`` attribute.

	    Returns:
	        False for fewer than two lines; otherwise whether the tallest line
	        is more than 1.5 times as high as the shortest one.
	    """
	    if len(block) < 2:
	        return False
	    heights = [line.height for line in block]
	    return max(heights) / min(heights) > 1.5

	def is_next_line_continuation(page, current_line, next_line_index):
	    """Return True when the two given values differ by less than 0.1.

	    ``page`` is accepted but not used.
	    """
	    distance = abs(next_line_index - current_line)
	    return distance < 0.1

	def print_texts(texts):
	    """Print the ``"text"`` value of every item, one per line (``None`` when missing)."""
	    for entry in texts:
	        content = entry.get("text")
	        print(content)

	def is_scan_pdf(pages):
	    """Report whether the first page is image-only (no text, at least one image).

	    Only the first page is inspected, because the check returns during the
	    first iteration.
	    NOTE(review): confirm that deciding on the first page alone is intended.

	    Args:
	        pages: Iterable of page objects exposing ``get_text``.

	    Returns:
	        True when the first page has empty plain text and its XHTML holds at
	        least one base64 ``data:image`` URI; False otherwise, including
	        when ``pages`` is empty.
	    """
	    for page in pages:
	        html = page.get_text("xhtml")
	        images = re.findall(r"(data:image/\w+;base64,[^\"]+)", html)
	        text = page.get_text()
	        # The former debug prints dumped whole base64 payloads to stdout.
	        return text == "" and len(images) > 0
	    # No pages at all: answer explicitly instead of returning None.
	    return False

	def read_pdf_html(pages, texts, trans):
	    """Render every page to its own HTML file with the ``pdftohtml`` tool.

	    Args:
	        pages: Iterable with one entry per PDF page; only its length matters.
	        texts: Accepted for signature compatibility; not used.
	        trans: Job dict. ``file_path`` is the source PDF and ``html_path``
	            the output base name passed to ``pdftohtml``.

	    Raises:
	        Exception: When ``pdftohtml`` is not installed or the expected
	            ``<html_path>-<page>.html`` file was not produced.
	    """
	    # Pages are numbered from 1 here.
	    # NOTE(review): this assumes pdftohtml names its per-page output
	    # "<base>-<page>.html" with 1-based numbers — confirm against the
	    # installed version.
	    for page_number, _page in enumerate(pages, start=1):
	        target_html = "{}-{}.html".format(trans['html_path'], page_number)
	        if os.path.exists(target_html):
	            os.remove(target_html)
	        pdftohtml_path = shutil.which("pdftohtml")
	        if pdftohtml_path is None:
	            raise Exception("未安装pdftohtml")
	        # Command-line arguments must be strings; -f/-l restrict the run to
	        # this single page.
	        subprocess.run([
	            pdftohtml_path, "-c",
	            "-f", str(page_number),
	            "-l", str(page_number),
	            trans['file_path'], trans['html_path'],
	        ])
	        if not os.path.exists(target_html):
	            raise Exception("无法生成html")


	def pdftohtml(pdf_path, html_path, texts):
	    """Convert a PDF to one HTML file with ``pdftohtml`` and queue its content.

	    Args:
	        pdf_path: Source PDF.
	        html_path: Output base name; the result is expected at
	            ``<html_path>-html.html``.
	        texts: List that receives the file content as a ``'text'`` item
	            through ``append_text``.

	    Raises:
	        Exception: When ``pdftohtml`` is not installed or the expected
	            output file was not produced.
	    """
	    target_html = "{}-html.html".format(html_path)
	    if os.path.exists(target_html):
	        os.remove(target_html)
	    pdftohtml_path = shutil.which("pdftohtml")
	    if pdftohtml_path is None:
	        raise Exception("未安装pdftohtml")
	    subprocess.run([pdftohtml_path, "-c", "-s", pdf_path, html_path])
	    if not os.path.exists(target_html):
	        raise Exception("无法生成html")
	    # Read with an explicit encoding instead of the platform default.
	    # NOTE(review): assumes pdftohtml writes UTF-8 (no -enc option is passed) — confirm.
	    with open(target_html, 'r', encoding='utf-8') as f:
	        content = f.read()
	    # The former debug print of the whole document has been dropped.
	    append_text(content, 'text', texts)


	def pdftodocx(pdf_path, docx_path):
	    """Convert a PDF to DOCX with pdf2docx, falling back to ``pdf2docxNext``.

	    An existing file at ``docx_path`` is removed first. Both paths are
	    printed to stdout. When anything in the pdf2docx path raises, the
	    traceback is printed and ``pdf2docxNext`` is used instead.

	    Args:
	        pdf_path: Source PDF.
	        docx_path: DOCX file to create.
	    """
	    print(docx_path)
	    if os.path.exists(docx_path):
	        os.remove(docx_path)
	    print(pdf_path)
	    try:
	        cv = pdf2docx.Converter(pdf_path)
	        try:
	            # NOTE(review): debug_page(0) and end=1 look like debugging
	            # leftovers that restrict the conversion to the first page —
	            # confirm before widening the range.
	            cv.debug_page(0)
	            cv.convert(docx_path, start=0, end=1, multi_processing=False)
	        finally:
	            # Release the converter even when conversion fails.
	            cv.close()
	    except Exception:
	        print("error")
	        # Keep the failure reason visible before switching to the fallback.
	        traceback.print_exc()
	        pdf2docxNext(pdf_path, docx_path)

	def pdf2docxNext(pdf_path, docx_path):
	    """Fallback PDF -> DOCX conversion built from PyMuPDF text and image blocks.

	    Every text span becomes its own paragraph carrying the span's font size
	    and, when non-zero, its colour. Every image block is appended as a
	    picture, and a page break is added after each PDF page.

	    Args:
	        pdf_path: Source PDF.
	        docx_path: DOCX file to write.

	    Raises:
	        RuntimeError: When any step of the conversion fails. The original
	            exception is attached as the cause.
	    """
	    try:
	        doc = Document()
	        pdf_document = fitz.open(pdf_path)
	        try:
	            for page in pdf_document:
	                # Text with style information.
	                text_dict = page.get_text("dict")
	                for block in text_dict["blocks"]:
	                    if block["type"] == 0:  # text block
	                        for line in block["lines"]:
	                            for span in line["spans"]:
	                                run = doc.add_paragraph().add_run(span["text"])
	                                run.font.size = Pt(span["size"])
	                                font_color = span["color"]
	                                # Colour is a packed integer split into R, G, B;
	                                # 0 is left at the default.
	                                if font_color:
	                                    run.font.color.rgb = RGBColor(
	                                        (font_color >> 16) & 0xFF,
	                                        (font_color >> 8) & 0xFF,
	                                        font_color & 0xFF,
	                                    )
	                    elif block["type"] == 1:  # image block
	                        try:
	                            # In "dict" output the "image" entry already holds
	                            # the encoded image bytes; it is not an xref to
	                            # pass to extract_image().
	                            image_stream = BytesIO(block["image"])
	                            doc.add_picture(image_stream, width=None)
	                        except Exception:
	                            print("图片无法解析")
	                # One DOCX page per PDF page.
	                doc.add_page_break()
	            doc.save(docx_path)
	        finally:
	            pdf_document.close()
	    except Exception as e:
	        # raise("...") raised a plain string, which is itself a TypeError.
	        raise RuntimeError("pdf转docx失败") from e

	# 舍弃
	def docxtopdf6(docx_path, pdf_path):
	    """Convert DOCX to PDF through ``docx2pdf``'s ``convert``.

	    An existing PDF at ``pdf_path`` is deleted and the output directory is
	    created before converting. After the call the output file must exist.

	    Raises:
	        RuntimeError: When the old PDF cannot be removed, the directory
	            cannot be created, or the conversion fails or yields no file.
	    """
	    # Remove a stale output file first.
	    if os.path.exists(pdf_path):
	        try:
	            os.remove(pdf_path)
	        except Exception as e:
	            raise RuntimeError(f"无法删除旧PDF文件 {pdf_path}: {str(e)}")

	    # Make sure the output directory exists (default permissions).
	    target_dir = os.path.dirname(pdf_path)
	    if not os.path.exists(target_dir):
	        try:
	            os.makedirs(target_dir, exist_ok=True)
	        except Exception as e:
	            raise RuntimeError(f"无法创建目录 {target_dir}: {str(e)}")

	    # Hints appended to the error message, first match wins.
	    hints = (
	        ("No such file or directory", " (请检查输入文件路径)"),
	        ("Permission denied", " (权限不足)"),
	    )
	    try:
	        print(f"正在转换: {docx_path} → {pdf_path}")
	        convert(docx_path, pdf_path)
	        # A silent conversion that produced nothing is treated as a failure.
	        if not os.path.exists(pdf_path):
	            raise RuntimeError("转换成功但未生成预期输出文件")
	        print("转换完成")
	    except Exception as e:
	        detail = str(e)
	        error_msg = f"DOCX转PDF失败: {detail}"
	        for needle, hint in hints:
	            if needle in detail:
	                error_msg += hint
	                break
	        raise RuntimeError(error_msg)





	# 旧方案
	def docxtopdf11111(docx_path, pdf_path):
	    """Legacy DOCX -> PDF conversion through the ``unoconv`` command.

	    An existing file at ``pdf_path`` is removed, ``/usr/local/bin`` is
	    appended to ``sys.path``, the output directory is created when missing,
	    and ``unoconv`` is run. The command line and a final ``done`` marker are
	    printed; the command's exit status is not checked.

	    Raises:
	        Exception: When ``unoconv`` cannot be located.
	    """
	    if os.path.exists(pdf_path):
	        os.remove(pdf_path)

	    sys.path.append("/usr/local/bin")
	    converter = shutil.which("unoconv")
	    if converter is None:
	        raise Exception("未安装unoconv")

	    output_dir = os.path.dirname(pdf_path)
	    if not os.path.exists(output_dir):
	        os.makedirs(output_dir, mode=0o777, exist_ok=True)

	    print("{} -f pdf -o {} {}".format(converter, pdf_path, docx_path))
	    subprocess.run([converter, "-f", "pdf", "-o", pdf_path, docx_path])
	    print("done")

	def create_temp_file(suffix='.png'):
	    """Return a unique file path inside the system temporary directory.

	    Only the path is built; no file is created.

	    Args:
	        suffix: Extension appended to the random UUID file name.
	    """
	    # tempfile.gettempdir() resolves the platform's temp directory instead
	    # of assuming a POSIX-only "/tmp".
	    temp_dir = tempfile.gettempdir()
	    filename = f"{uuid.uuid4()}{suffix}"
	    return os.path.join(temp_dir, filename)

	def pdf_to_text_with_ocr(pdf_path, docx_path, origin_lang):
	    """OCR every page of a PDF with the Tesseract CLI and save the text as DOCX.

	    Each page is rendered, converted to greyscale, encoded as PNG and piped
	    to ``/usr/local/bin/tesseract``. The recognised text, with blank lines
	    and surrounding whitespace removed, becomes one 12 pt paragraph per
	    page. A page whose OCR run exits with a non-zero status contributes an
	    empty paragraph.

	    Args:
	        pdf_path: Source PDF.
	        docx_path: DOCX file to write.
	        origin_lang: Language code passed to Tesseract's ``-l`` option.
	    """
	    document = fitz.open(pdf_path)
	    try:
	        docx = Document()
	        for page_num in range(len(document)):
	            page = document.load_page(page_num)
	            pix = page.get_pixmap()
	            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
	            img = img.convert('L')  # greyscale

	            # Encode the page image as PNG in memory.
	            png_buffer = io.BytesIO()
	            img.save(png_buffer, format='PNG')
	            png_bytes = png_buffer.getvalue()

	            try:
	                # check=True raises CalledProcessError on a non-zero exit.
	                completed = subprocess.run(
	                    ['/usr/local/bin/tesseract', 'stdin', 'stdout', '-l', origin_lang, '--oem', '3', '--psm', '6'],
	                    input=png_bytes,
	                    stdout=subprocess.PIPE,
	                    stderr=subprocess.PIPE,
	                    check=True,
	                )
	                text = completed.stdout.decode('utf-8').strip()
	                # Drop empty lines and surrounding whitespace.
	                text = '\n'.join(line.strip() for line in text.splitlines() if line.strip())
	            except subprocess.CalledProcessError as e:
	                print(f"OCR处理页面 {page_num + 1} 时出错: {str(e)}")
	                text = ""  # keep an empty paragraph for the failed page

	            paragraph = docx.add_paragraph()
	            run = paragraph.add_run(text)
	            run.font.size = Pt(12)
	    finally:
	        # Release the PDF handle even when a page fails unexpectedly.
	        document.close()
	    docx.save(docx_path)

	def is_scanned_pdf(pdf_path):
	    """Guess whether a PDF is a scan by inspecting up to its first five pages.

	    Args:
	        pdf_path: Path of the PDF to inspect.

	    Returns:
	        False as soon as an inspected page contains text. True when a page
	        without text holds exactly one image larger than 90% of the page in
	        both dimensions, and also when no inspected page contains text
	        (which includes an empty document).
	    """
	    document = fitz.open(pdf_path)
	    try:
	        # The first few pages are normally enough to decide.
	        pages_to_check = min(5, len(document))
	        for page_num in range(pages_to_check):
	            page = document[page_num]

	            if page.get_text().strip():
	                return False

	            image_list = page.get_images()
	            # A single image spanning most of the page suggests a scan.
	            if len(image_list) == 1:
	                xref = image_list[0][0]
	                img = document.extract_image(xref)
	                if img:
	                    pix = fitz.Pixmap(img["image"])
	                    if pix.width > page.rect.width * 0.9 and pix.height > page.rect.height * 0.9:
	                        return True
	        # No text found anywhere: treat the document as scanned.
	        return True
	    finally:
	        # Close the document on every exit path, including exceptions.
	        document.close()

	def is_tesseract_installed():
	    """Return True when ``/usr/local/bin/tesseract`` is an executable file."""
	    candidate = "/usr/local/bin/tesseract"
	    if not os.path.isfile(candidate):
	        return False
	    return os.access(candidate, os.X_OK)

	def use_doc2x_revert_pdf_to_docx(dox2x_api_key, pdf_file, docx_path):
	    """Convert a PDF to DOCX through a ``Doc2X`` client.

	    Args:
	        dox2x_api_key: API key handed to the ``Doc2X`` constructor.
	        pdf_file: PDF passed to ``pdf2file``.
	        docx_path: Output path passed to ``pdf2file``.

	    Returns:
	        ``(True, success[0])`` when the first success entry is a non-empty
	        value, otherwise ``(False, failed[0]["error"])``.
	    """
	    converter = Doc2X(apikey=dox2x_api_key, debug=False)
	    success, failed, flag = converter.pdf2file(
	        pdf_file=pdf_file,
	        output_path=docx_path,
	        output_format="docx",
	    )
	    converted = len(success) > 0 and success[0] != ""
	    if not converted:
	        return (False, failed[0]["error"])
	    return (True, success[0])

	# def save_image(base64_data, path):
	# image_data = base64.b64decode(base64_data)
	# # 将字节数据写入内存中的文件对象
	# image_file = BytesIO(image_data)
	# # 从内存中的文件对象创建Image对象
	# image = Image.open(image_file)
	# # 保存图片到文件系统
	# image.save(path)