dt / app /translate /pdf.py
gitdeem's picture
Upload 96 files
4e9efe9 verified
import platform
import tempfile
import threading
import traceback
import fitz
import re
from . import to_translate
from . import common
import io
import sys
import time
import datetime
from docx import Document
from docx.shared import Pt, RGBColor
# import pdfkit
import subprocess
import base64
import pdf2docx
from . import word
import copy
from io import BytesIO
from PIL import Image,ImageDraw
import pytesseract
import uuid
from pdfdeal import Doc2X
# from weasyprint import HTML
import os
from docx2pdf import convert
import shutil
pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'
# -----word转pdf
def docxtopdf(docx_path, pdf_path):
# 如果目标 PDF 文件已存在,则删除
# if os.path.exists(pdf_path):
# os.remove(pdf_path)
# 确保目标目录存在
target_path_dir = os.path.dirname(pdf_path)
if not os.path.exists(target_path_dir):
os.makedirs(target_path_dir, mode=0o777, exist_ok=True)
# 根据操作系统选择方案
if platform.system() == "Windows":
# Windows 方案:使用 pywin32 调用 Microsoft Word
try:
import win32com.client
word = win32com.client.Dispatch("Word.Application")
word.Visible = False # 不显示 Word 界面
doc = word.Documents.Open(docx_path)
doc.SaveAs(pdf_path, FileFormat=17) # 17 是 PDF 格式
doc.Close()
word.Quit()
print("转换成功!")
except Exception as e:
print(f"Windows 方案转换失败: {e}")
else:
# Linux/macOS 方案:使用 unoconv
sys.path.append("/usr/local/bin") # 添加 unoconv 可能的路径
unoconv_path = shutil.which("unoconv")
if unoconv_path is None:
raise Exception("未安装 unoconv,请先安装 unoconv 或 LibreOffice")
try:
command = [unoconv_path, "-f", "pdf", "-o", pdf_path, docx_path]
print("{} -f pdf -o {} {}".format(unoconv_path, pdf_path, docx_path))
subprocess.run(command)
print("转换成功!")
except subprocess.CalledProcessError as e:
print(f"Linux 方案转换失败: {e}")
def start11(trans):
texts=[]
src_pdf = fitz.open(trans['file_path'])
# print(is_scan_pdf(src_pdf))
# exit()
# if is_scan_pdf(src_pdf):
start_time = datetime.datetime.now()
origin_docx_path=os.path.dirname(trans['file_path'])+"/"+trans['uuid']+".docx"
target_docx_path=os.path.dirname(trans['file_path'])+"/"+trans['uuid']+"-translated.docx"
target_pdf_path=os.path.dirname(trans['file_path'])+"/"+trans['uuid']+".pdf"
# target_pdf_path = trans['file_path']
# target_docx_path=re.sub(r"\.pdf",".docx",trans['target_file'], flags=re.I)
# pdf_path=re.sub(r"\.pdf",".docx",trans['file_path'], flags=re.I)
# print(target_pdf_path+"\n")
# print(trans['storage_path']+"\n")
# print(trans['target_file']+"\n")
# print(os.path.join(trans['storage_path'], trans['target_filepath'])+"\n")
pdftodocx(trans['file_path'], origin_docx_path)
word_trans=copy.copy(trans)
word_trans['file_path']=origin_docx_path
word_trans['target_file']=target_docx_path
word_trans['run_complete']=False
word_trans['extension']='.docx'
text_count=0
if word.start(word_trans):
# print("word done")
docxtopdf(target_docx_path, target_pdf_path)
shutil.move(target_pdf_path, trans['target_file'])
end_time = datetime.datetime.now()
spend_time=common.display_spend(start_time, end_time)
to_translate.complete(trans,text_count,spend_time)
return True
# return False
uuid=trans['uuid']
html_path=trans['storage_path']+'/uploads/'+uuid
trans['html_path']=html_path
# read_pdf_html(trans['file_path'], html_path)
# print(trans['storage_path']+'/uploads/pdf.html')
# exit()
# 允许的最大线程
# print(trans)
# wkhtmltopdf_bin=common.find_command_location("wkhtmltopdf")
threads=trans['threads']
if threads is None or int(threads)<0:
max_threads=10
else:
max_threads=int(threads)
# 当前执行的索引位置
run_index=0
start_time = datetime.datetime.now()
# print(f'Source pdf file: {} \n', trans['file_path'])
read_page_images(src_pdf, texts)
text_count=0
# translate.get_models()
# exit()
# read_page_html(src_pdf, texts, trans)
# read_pdf_html(src_pdf, texts, trans)
pdftohtml(trans['file_path'], html_path, texts)
src_pdf.close()
# print(texts)
# exit()
max_run=max_threads if len(texts)>max_threads else len(texts)
event=threading.Event()
before_active_count=threading.activeCount()
while run_index<=len(texts)-1:
if threading.activeCount()<max_run+before_active_count:
if not event.is_set():
# print("run_index:",run_index)
thread = threading.Thread(target=to_translate.get, args=(trans, event, texts, run_index))
thread.start()
run_index+=1
else:
return False
while True:
if event.is_set():
return False
complete=True
for text in texts:
if not text['complete']:
complete=False
if complete:
break
else:
time.sleep(1)
# print(texts)
write_to_html_file(html_path, texts)
# config = pdfkit.configuration(wkhtmltopdf="/usr/local/bin/wkhtmltopdf")
# with open(html_path) as f:
# pdfkit.from_file(f, trans['target_file'],options={"enable-local-file-access":True}, configuration=config)
# print(trans['target_file'])
end_time = datetime.datetime.now()
spend_time=common.display_spend(start_time, end_time)
to_translate.complete(trans, text_count, spend_time)
return True
# ----------------------
pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'
def start(trans):
texts = []
# src_pdf = fitz.open(trans['file_path'])
start_time = datetime.datetime.now()
origin_docx_path = os.path.join(os.path.dirname(trans['file_path']), f"{trans['uuid']}.docx")
target_docx_path = os.path.join(os.path.dirname(trans['file_path']), f"{trans['uuid']}-translated.docx")
origin_pdf_path = trans['file_path']# 原pdf路径 os.path.join(os.path.dirname(trans['file_path']), f"{trans['uuid']}.pdf")
target_pdf_path=trans['target_file'] # 目标pdf路径
# 中间PDF路径
target_pdf_path1 = os.path.join(os.path.dirname(trans['file_path']), trans['uuid'] + ".pdf")
# ---------
# origin_docx_path = os.path.dirname(trans['file_path']) + "/" + trans['uuid'] + ".docx"
# target_docx_path = os.path.dirname(trans['file_path']) + "/" + trans[
# 'uuid'] + "-translated.docx"
# target_pdf_path = os.path.dirname(trans['file_path']) + "/" + trans['uuid'] + ".pdf"
# --------
# target_docx_path=re.sub(r"\.pdf",".docx",trans['target_file'], flags=re.I)
# pdf_path=re.sub(r"\.pdf",".docx",trans['file_path'], flags=re.I)
# print(target_pdf_path+"\n")
# print(trans['storage_path']+"\n")
# print(trans['target_file']+"\n")
# print(os.path.join(trans['storage_path'], trans['target_filepath'])+"\n")
# 先PDF转Word
pdftodocx(origin_pdf_path, origin_docx_path)
word_trans = copy.copy(trans)
word_trans['file_path'] = origin_docx_path
word_trans['target_file'] = target_docx_path
word_trans['run_complete'] = False
word_trans['extension'] = '.docx'
text_count = 0
if word.start(word_trans):
docxtopdf(target_docx_path, target_pdf_path1)
# 移动
shutil.move(target_pdf_path1, target_pdf_path)
end_time = datetime.datetime.now()
spend_time = common.display_spend(start_time, end_time)
print('pdf参数',trans)
to_translate.complete(trans, text_count, spend_time)
return True
else:
return False
uuid = trans['uuid']
html_path = trans['storage_path'] + '/uploads/' + uuid
trans['html_path'] = html_path
# read_pdf_html(trans['file_path'], html_path)
# print(trans['storage_path']+'/uploads/pdf.html')
# exit()
# 允许的最大线程
# print(trans)
# wkhtmltopdf_bin=common.find_command_location("wkhtmltopdf")
threads = trans['threads']
if threads is None or int(threads) < 0:
max_threads = 10
else:
max_threads = int(threads)
# 当前执行的索引位置
run_index = 0
start_time = datetime.datetime.now()
# print(f'Source pdf file: {} \n', trans['file_path'])
read_page_images(src_pdf, texts)
text_count = 0
# translate.get_models()
# exit()
# read_page_html(src_pdf, texts, trans)
# read_pdf_html(src_pdf, texts, trans)
pdftohtml(trans['file_path'], html_path, texts)
src_pdf.close()
# print(texts)
# exit()
max_run = max_threads if len(texts) > max_threads else len(texts)
event = threading.Event()
before_active_count = threading.activeCount()
while run_index <= len(texts) - 1:
if threading.activeCount() < max_run + before_active_count:
if not event.is_set():
# print("run_index:",run_index)
thread = threading.Thread(target=translate.get,
args=(trans, event, texts, run_index))
thread.start()
run_index += 1
else:
return False
while True:
if event.is_set():
return False
complete = True
for text in texts:
if not text['complete']:
complete = False
if complete:
break
else:
time.sleep(1)
# print(texts)
write_to_html_file(html_path, texts)
# config = pdfkit.configuration(wkhtmltopdf="/usr/local/bin/wkhtmltopdf")
# with open(html_path) as f:
# pdfkit.from_file(f, trans['target_file'],options={"enable-local-file-access":True}, configuration=config)
# print(trans['target_file'])
end_time = datetime.datetime.now()
spend_time = common.display_spend(start_time, end_time)
to_translate.complete(trans, text_count, spend_time)
return True
# -------------------------------
# def read_to_html(pages):
def read_page_html(pages, texts, trans):
storage_path=trans['storage_path']
uuid=trans['uuid']
if is_scan_pdf(pages):
for index,page in enumerate(pages):
html=page.get_text("xhtml")
images=re.findall(r"(data:image/\w+;base64,[^\"]+)", html)
for i,image in enumerate(images):
append_text(image, 'image', texts)
else:
for index,page in enumerate(pages):
html=page.get_text("xhtml")
# images=re.findall(r"(data:image/\w+;base64,[^\"]+)", html)
# for i,image in enumerate(images):
append_text(html,'text', texts)
def read_page_images(pages, texts):
for index,page in enumerate(pages):
html=page.get_text("xhtml")
images=re.findall(r"(data:image/\w+;base64,[^\"]+)", html)
for i,image in enumerate(images):
append_text(image, 'image', texts)
def write_to_html_file(html_path,texts):
with open(html_path, 'w+') as f:
f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1"></head><body>')
for item in texts:
f.write(item.get("text", ""))
f.write('</body></html>')
f.close()
def read_block_text(pages,texts):
text=""
for page in pages:
last_x0=0
last_x1=0
html=page.get_text("html")
with open("test.html",'a+') as f:
f.write(html)
f.close()
exit()
for block in page.get_text("blocks"):
current_x1=block[2]
current_x0=block[0]
# 对于每个文本块,分行并读取
if block[5]==0 or abs(current_x1-last_x1)>12 or abs(current_x0-last_x0)>12:
append_text(text, "text", texts)
text=block[4].replace("\n","")
else:
text=text+(block[4].replace("\n",""))
last_x1=block[2]
last_x0=block[0]
append_text(text, "text", texts)
def write_block_text(pages,newpdf,texts):
text=""
for page in pages:
last_x0=0
last_x1=0
last_y0=0
new_page = newpdf.new_page(width=page.rect.width, height=page.rect.height)
font=fitz.Font("helv")
for block in page.get_text("blocks"):
current_x1=block[2]
current_x0=block[0]
current_y0=block[1]
# 对于每个文本块,分行并读取
if block[5]==0 or abs(current_x1-last_x1)>12 or abs(current_x0-last_x0)>12 and len(texts)>0:
item=texts.pop(0)
trans_text=item.get("text","")
new_page.insert_text((last_x0,last_y0), trans_text, fontsize=12,fontname="Helvetica", overlay=False)
text=block[4].replace("\n","")
else:
text=text+(block[4].replace("\n",""))
last_x1=block[2]
last_x0=block[0]
last_y0=block[1]
if check_text(text) and len(texts):
new_page.insert_text((last_x0,last_y0), trans_text, fontsize=12, overlay=False)
def write_page_text(pages,newpdf,texts):
for page in pages:
text=page.get_text("text")
new_page = newpdf.new_page(width=page.rect.width, height=page.rect.height)
if check_text(text) and len(texts)>0:
item=texts.pop(0)
text=item.get("text","")
new_page.insert_text((0,0), text, fontsize=12, overlay=False)
def read_row(pages,texts):
text=""
for page in pages:
# 获取页面的文本块
for block in page.get_text("blocks"):
# 对于每个文本块,分行并读取
if block[5]==0:
append_text(text, 'text', texts)
text=block[4]
else:
text=text+block[4]
def write_row(newpdf, texts, page_width, page_height):
text_count=0
new_page = newpdf.new_page(width=page_width, height=page_height)
for text in texts:
print(text['text'])
# draw_text_avoid_overlap(new_page, text['text'],text['block'][0],text['block'][1], 16)
new_page.insert_text((text['block'][0],text['block'][1]),text['text'], fontsize=16)
return
def append_text(text, content_type, texts):
if check_text(text):
# print(text)
texts.append({"text":text,"type":content_type, "complete":False})
def check_text(text):
return text!=None and len(text)>0 and not common.is_all_punc(text)
def draw_text_avoid_overlap(page, text, x, y, font_size):
"""
在指定位置绘制文本,避免与现有文本重叠。
"""
text_length = len(text) * font_size # 估算文本长度
while True:
text_box = page.get_textbox((x, y, x + text_length, y + font_size))
if not text_box:
break # 没有重叠的文本,退出循环
y += font_size + 1 # 移动到下一个位置
page.insert_text((x,y),text, fontsize=font_size)
def draw_table(page, table_data, x, y, width, cell_height):
# 表格的列数
cols = len(table_data[0])
rows = len(table_data)
# 绘制表格
for i in range(rows):
for j in range(cols):
# 文字写入
txt = table_data[i][j]
page.insert_text((x, y), txt)
# 绘制单元格边框 (仅边界线)
# 左边
page.draw_line((x, y),( x+width/cols, y), width=0.5)
# 上边
if i == 0:
page.draw_line((x, y), (x, y+cell_height), width=0.5)
# 右边
if j == cols-1:
page.draw_line((x+width/cols, y), (x+width/cols, y+cell_height), width=0.5)
# 下边
if i == rows-1:
page.draw_line((x, y+cell_height), (x+width/cols, y+cell_height), width=0.5)
# 移动到下一个单元格
x += width/cols
# 移动到下一行
x = 0
y += cell_height
def wrap_text(text, width):
words = text.split(' ')
lines = []
line = ""
for word in words:
if len(line.split(' ')) >= width:
lines.append(line)
line = ""
if len(line + word + ' ') <= width * len(word):
line += word + ' '
else:
lines.append(line)
line = word + ' '
if line:
lines.append(line)
return lines
def is_paragraph(block):
# 假设一个段落至少有两行
if len(block) < 2:
return False
# 假设一个段落的行间隔较大
if max([line.height for line in block]) / min([line.height for line in block]) > 1.5:
return True
return False
def is_next_line_continuation(page, current_line, next_line_index):
# 判断下一行是否是当前行的继续
return abs(next_line_index - current_line) < 0.1
def print_texts(texts):
for item in texts:
print(item.get("text"))
def is_scan_pdf(pages):
for index,page in enumerate(pages):
html=page.get_text("xhtml")
images=re.findall(r"(data:image/\w+;base64,[^\"]+)", html)
text=page.get_text()
print(images)
print(text)
if text=="" and len(images)>0:
return True
else:
return False
def read_pdf_html(pages, texts, trans):
for index,page in enumerate(pages):
target_html="{}-{}.html".format(trans['html_path'], page_index)
if os.path.exists(target_html):
os.remove(target_html)
dftohtml_path = shutil.which("pdftohtml")
if pdftohtml_path is None:
raise Exception("未安装pdftohtml")
subprocess.run([dftohtml_path,"-c","-l", page_index, trans['file_path'], trans['html_path']])
if not os.path.exists(target_html):
raise Exception("无法生成html")
# append_text(html,'text', texts)
def pdftohtml(pdf_path, html_path,texts):
target_html="{}-html.html".format(html_path)
if os.path.exists(target_html):
os.remove(target_html)
pdftohtml_path = shutil.which("pdftohtml")
if pdftohtml_path is None:
raise Exception("未安装pdftohtml")
subprocess.run([pdftohtml_path,"-c","-s", pdf_path, html_path])
if not os.path.exists(target_html):
raise Exception("无法生成html")
with open(target_html, 'r') as f:
content=f.read()
print(content)
append_text(content, 'text', texts)
def pdftodocx(pdf_path, docx_path):
print(docx_path)
if os.path.exists(docx_path):
os.remove(docx_path)
print(pdf_path)
try:
cv = pdf2docx.Converter(pdf_path)
cv.debug_page(0)
cv.convert(docx_path, start=0,end=1,multi_processing=False)
cv.close()
#exit()
except Exception as e:
print("error")
pdf2docxNext(pdf_path, docx_path)
def pdf2docxNext(pdf_path, docx_path):
try:
# 创建一个新的 DOCX 文档
doc = Document()
# 打开 PDF 文件
pdf_document = fitz.open(pdf_path)
# 遍历 PDF 的每一页
for page_num in range(len(pdf_document)):
page = pdf_document[page_num]
fonts=page.get_fonts()
# 提取文本
# 提取文本和样式信息
text_dict = page.get_text("dict")
# 遍历文本块
for block in text_dict["blocks"]:
if block["type"] == 0: # 只处理文本块
for line in block["lines"]:
for span in line["spans"]:
text = span["text"]
font_size = span["size"] # 字体大小
font_color = span["color"] # 字体颜色
# 创建段落
paragraph = doc.add_paragraph()
run = paragraph.add_run(text)
# 设置字体大小
run.font.size = Pt(font_size)
# 设置字体颜色
if font_color:
run.font.color.rgb = RGBColor(
(font_color >> 16) & 0xFF, # R
(font_color >> 8) & 0xFF, # G
font_color & 0xFF # B
)
elif block["type"] == 1:
# 提取图像
try:
img_index = block["image"]
base_image = pdf_document.extract_image(img_index)
image_bytes = base_image["image"]
image_ext = base_image["ext"]
# 将图像添加到 DOCX
image_stream = BytesIO(image_bytes)
doc.add_picture(image_stream, width=None) # 可以指定宽度
except Exception as e:
print("图片无法解析")
# 添加分页符
doc.add_page_break()
# 保存 DOCX 文件
doc.save(docx_path)
pdf_document.close()
except Exception as e:
raise("pdf转docx失败")
# 舍弃
def docxtopdf6(docx_path, pdf_path):
"""
使用 docx2pdf 库实现跨平台 DOCX 转 PDF
保持原始逻辑:删除已存在的PDF、创建目录、错误处理
"""
# 删除已存在的PDF文件(保留原始逻辑)
if os.path.exists(pdf_path):
try:
os.remove(pdf_path)
except Exception as e:
raise RuntimeError(f"无法删除旧PDF文件 {pdf_path}: {str(e)}")
# 创建输出目录(优化权限设置)
target_dir = os.path.dirname(pdf_path)
if not os.path.exists(target_dir):
try:
os.makedirs(target_dir, exist_ok=True) # 去除明确的 0o777 权限
except Exception as e:
raise RuntimeError(f"无法创建目录 {target_dir}: {str(e)}")
# 执行转换(替换核心实现)
try:
print(f"正在转换: {docx_path}{pdf_path}") # 保留日志输出
convert(docx_path, pdf_path) # 核心转换调用
# 验证转换结果
if not os.path.exists(pdf_path):
raise RuntimeError("转换成功但未生成预期输出文件")
print("转换完成") # 保留完成提示
except Exception as e:
# 增强错误信息
error_msg = f"DOCX转PDF失败: {str(e)}"
if "No such file or directory" in str(e):
error_msg += " (请检查输入文件路径)"
elif "Permission denied" in str(e):
error_msg += " (权限不足)"
raise RuntimeError(error_msg)
# 旧方案
def docxtopdf11111(docx_path, pdf_path):
if os.path.exists(pdf_path):
os.remove(pdf_path)
sys.path.append("/usr/local/bin")
unoconv_path = shutil.which("unoconv")
if unoconv_path is None:
raise Exception("未安装unoconv")
target_path_dir=os.path.dirname(pdf_path)
if not os.path.exists(target_path_dir):
os.makedirs(target_path_dir, mode=0o777, exist_ok=True)
# target_pdf = fitz.Document()
# target_pdf.new_page()
# target_pdf.save(pdf_path)
# target_pdf.close()
# subprocess.run([unoconv_path,"-f","pdf","-e","UTF-8","-o",target_path_dir, docx_path])
# subprocess.run([unoconv_path,"-f","pdf","-e","UTF-8","-o",target_path_dir, docx_path])
print("{} -f pdf -o {} {}".format(unoconv_path,pdf_path, docx_path))
# subprocess.run("{} -f pdf -o {} {}".format(unoconv_path, pdf_path, docx_path), shell=True)
command = [unoconv_path, "-f", "pdf", "-o", pdf_path, docx_path]
subprocess.run(command)
print("done")
def create_temp_file(suffix='.png'):
temp_dir = '/tmp' # 或者使用其他临时目录
filename = f"{uuid.uuid4()}{suffix}"
return os.path.join(temp_dir, filename)
def pdf_to_text_with_ocr(pdf_path, docx_path, origin_lang):
# if not is_tesseract_installed():
# raise Exception("Tesseract未安装,无法进行OCR")
document = fitz.open(pdf_path)
docx = Document()
for page_num in range(len(document)):
page = document.load_page(page_num)
pix = page.get_pixmap()
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
# 转换为灰度图像
img = img.convert('L')
# 将图像保存到内存中的字节流
img_byte_arr = io.BytesIO()
img.save(img_byte_arr, format='PNG')
img_byte_arr = img_byte_arr.getvalue()
try:
# 使用 Tesseract 命令行工具
process = subprocess.Popen(
['/usr/local/bin/tesseract', 'stdin', 'stdout', '-l', origin_lang, '--oem', '3', '--psm', '6'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
stdout, stderr = process.communicate(input=img_byte_arr)
if process.returncode != 0:
raise subprocess.CalledProcessError(process.returncode, process.args, stdout, stderr)
text = stdout.decode('utf-8').strip()
# 移除空行和多余的空格
text = '\n'.join(line.strip() for line in text.splitlines() if line.strip())
except subprocess.CalledProcessError as e:
print(f"OCR处理页面 {page_num + 1} 时出错: {str(e)}")
text = "" # 如果出错,使用空字符串
paragraph = docx.add_paragraph()
run = paragraph.add_run(text)
run.font.size = Pt(12)
document.close()
docx.save(docx_path)
def is_scanned_pdf(pdf_path):
document = fitz.open(pdf_path)
# 只检查前几页,通常足以判断
pages_to_check = min(5, len(document))
for page_num in range(pages_to_check):
page = document[page_num]
# 检查文本
text = page.get_text().strip()
if text:
document.close()
return False
# 检查图像
image_list = page.get_images()
if len(image_list) > 0:
# 如果页面只包含一个大图像,很可能是扫描件
if len(image_list) == 1:
xref = image_list[0][0]
img = document.extract_image(xref)
if img:
pix = fitz.Pixmap(img["image"])
# 如果图像覆盖了大部分页面,可能是扫描件
if pix.width > page.rect.width * 0.9 and pix.height > page.rect.height * 0.9:
document.close()
return True
document.close()
return True # 如果没有找到文本,默认认为是扫描件
def is_tesseract_installed():
tesseract_path = "/usr/local/bin/tesseract"
return os.path.isfile(tesseract_path) and os.access(tesseract_path, os.X_OK)
def use_doc2x_revert_pdf_to_docx(dox2x_api_key, pdf_file, docx_path):
client = Doc2X(apikey=dox2x_api_key,debug=False)
success, failed, flag = client.pdf2file(
pdf_file=pdf_file,
output_path=docx_path,
output_format="docx",
)
if len(success)>0 and success[0]!="":
return (True,success[0])
else:
return (False,failed[0]["error"])
# def save_image(base64_data, path):
# image_data = base64.b64decode(base64_data)
# # 将字节数据写入内存中的文件对象
# image_file = BytesIO(image_data)
# # 从内存中的文件对象创建Image对象
# image = Image.open(image_file)
# # 保存图片到文件系统
# image.sav/e(path)