|
|
import gradio as gr |
|
|
import re |
|
|
from docx import Document |
|
|
from docx.shared import Cm, Pt |
|
|
from docx.enum.style import WD_STYLE_TYPE |
|
|
from docx.oxml import OxmlElement |
|
|
from docx.oxml.ns import qn |
|
|
import tempfile |
|
|
import os |
|
|
|
|
|
def set_outline_level(paragraph, level: int = 0):
    """Write a single ``w:outlineLvl`` element into the paragraph properties.

    Any ``w:outlineLvl`` already present is removed first, so calling this
    function repeatedly on the same paragraph leaves exactly one element
    carrying the most recent value instead of accumulating duplicates.

    Args:
        paragraph: Object exposing its underlying XML element as ``_p``; that
            element must provide ``get_or_add_pPr()``.
        level: Value stored (as a string) in the element's ``w:val`` attribute.
    """
    pPr = paragraph._p.get_or_add_pPr()

    # Remove stale entries before adding the new one.
    for stale in pPr.findall(qn('w:outlineLvl')):
        pPr.remove(stale)

    outline = OxmlElement('w:outlineLvl')
    outline.set(qn('w:val'), str(level))
    pPr.append(outline)
|
|
|
|
|
def normalize_paragraph(text):
    """Flatten *text* onto one line and return it trimmed.

    Two substitutions run in order: every run of CR/LF characters becomes a
    single space, then every run of two or more whitespace characters becomes
    a single space. Leading and trailing whitespace is stripped at the end.
    A lone whitespace character that is not CR/LF (e.g. one tab) is kept.
    """
    substitutions = (
        (r'[\r\n]+', ' '),
        (r'\s{2,}', ' '),
    )
    flattened = text
    for pattern, replacement in substitutions:
        flattened = re.sub(pattern, replacement, flattened)
    return flattened.strip()
|
|
|
|
|
def process_paragraphs_with_cleanup(doc, combined_pattern):
    """Classify the paragraphs of *doc* and collapse runs of blank ones.

    Each paragraph's text is stripped and turned into a ``(kind, text)`` tuple:

    * ``('empty', '')`` for a blank paragraph; consecutive blank paragraphs
      yield only one entry (the "^p^p -> ^p" effect).
    * ``('heading', text)`` when ``combined_pattern`` is found anywhere in
      the stripped text.
    * ``('paragraph', text)`` otherwise.

    Args:
        doc: Object whose ``paragraphs`` attribute is an iterable of items
            with a ``text`` attribute.
        combined_pattern: Regular expression passed to ``re.search``.

    Returns:
        The list of ``(kind, text)`` tuples in document order.
    """
    collected = []

    for paragraph in doc.paragraphs:
        stripped = paragraph.text.strip()

        if not stripped:
            # Record a blank only if the previous entry is not already one.
            if not collected or collected[-1][0] != 'empty':
                collected.append(('empty', ''))
            continue

        kind = 'heading' if re.search(combined_pattern, stripped) else 'paragraph'
        collected.append((kind, stripped))

    return collected
|
|
|
|
|
def _apply_run_font(paragraph, font_name, size=None):
    """Set *font_name* (and optionally *size*) on every run of *paragraph*."""
    for run in paragraph.runs:
        run.font.name = font_name
        # Assigning font.name fills only the Latin slots of w:rFonts; CJK text
        # takes its typeface from the w:eastAsia attribute, so set it too.
        run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
        if size is not None:
            run.font.size = size


def rebuild_document(doc, content_list):
    """Append the classified content to *doc* with uniform formatting.

    Args:
        doc: Document object providing ``add_paragraph``.
        content_list: Iterable of ``(kind, text)`` tuples where ``kind`` is
            ``'heading'``, ``'paragraph'`` or ``'empty'``. Any other kind is
            ignored.

    For each kind:

    * ``'heading'``: added with the ``'Heading 1'`` style, starts on a new
      page, outline level 0, 16 pt 新細明體.
    * ``'paragraph'``: text is passed through ``normalize_paragraph``, then
      added with single line spacing, a 0.7 cm first-line indent and the
      新細明體 font.
    * ``'empty'``: an empty paragraph is added.
    """
    body_font = '新細明體'

    for kind, text in content_list:
        if kind == 'heading':
            heading = doc.add_paragraph(text, style='Heading 1')
            fmt = heading.paragraph_format
            fmt.page_break_before = True
            fmt.space_before = Cm(0)
            fmt.space_after = Cm(0.3)
            fmt.line_spacing = 1.0
            fmt.left_indent = Cm(0)
            fmt.first_line_indent = Cm(0)
            set_outline_level(heading, 0)
            _apply_run_font(heading, body_font, Pt(16))
        elif kind == 'paragraph':
            para = doc.add_paragraph(normalize_paragraph(text))
            fmt = para.paragraph_format
            fmt.space_before = Cm(0)
            fmt.space_after = Cm(0)
            fmt.line_spacing = 1.0
            fmt.left_indent = Cm(0)
            fmt.first_line_indent = Cm(0.7)
            _apply_run_font(para, body_font)
        elif kind == 'empty':
            doc.add_paragraph('')
|
|
|
|
|
def _parse_chapter_keywords(chapter_keywords):
    """Split the keyword string on ASCII/full-width commas, dropping blanks."""
    if not chapter_keywords:
        return []
    pieces = re.split(r'[,,]', chapter_keywords)
    return [piece.strip() for piece in pieces if piece.strip()]


def _build_chapter_pattern(keywords):
    """Return one alternation regex matching "第 <numeral> <keyword>" per keyword."""
    # 零/〇/兩 are included so numbers such as 一百零一 are recognised.
    numerals = '0-9零〇一二兩三四五六七八九十百千萬壹貳參肆伍陸柒捌玖拾佰仟IVXLCDMivxlcdm'
    # Keywords come from user input: escape them so they match literally.
    return '|'.join(
        rf'第\s*[{numerals}]+\s*{re.escape(keyword)}' for keyword in keywords
    )


def format_docx(file, chapter_keywords, remove_empty_paragraphs=True):
    """Reformat an uploaded .docx file and save the result to a temp file.

    Paragraphs are classified as headings (text containing
    "第 <numeral> <keyword>"), body paragraphs or blanks, all existing
    top-level paragraphs are removed, and the content is rebuilt with uniform
    formatting via ``rebuild_document``.

    Args:
        file: Uploaded file; either a filesystem path or an object whose
            ``name`` attribute is the path. ``None`` means nothing uploaded.
        chapter_keywords: Comma-separated chapter keywords (ASCII or
            full-width commas), e.g. ``"章,節,話"``. Blank entries are ignored.
        remove_empty_paragraphs: When true, every blank paragraph is dropped;
            otherwise runs of blanks are kept as a single blank paragraph.

    Returns:
        ``(output_path, status_message)``. ``output_path`` is ``None`` when
        input validation fails or processing raises; the message then
        describes the problem.
    """
    if file is None:
        return None, "請上傳一個 Word 文件"

    keywords = _parse_chapter_keywords(chapter_keywords)
    if not keywords:
        return None, "請輸入章節分段方式(例如:章,節,話)"

    try:
        # Accept both a plain path and a file-like wrapper exposing `.name`.
        source_path = file if isinstance(file, (str, os.PathLike)) else file.name
        doc = Document(source_path)

        # Make sure the style used for headings exists in this document.
        styles = doc.styles
        if 'Heading 1' not in styles:
            heading_style = styles.add_style('Heading 1', WD_STYLE_TYPE.PARAGRAPH)
            heading_style.base_style = styles['Normal']
            heading_style.font.bold = True
            heading_style.font.size = Pt(16)

        combined_pattern = _build_chapter_pattern(keywords)
        content_list = process_paragraphs_with_cleanup(doc, combined_pattern)

        if remove_empty_paragraphs:
            content_list = [item for item in content_list if item[0] != 'empty']

        # Clear the existing top-level paragraphs before rebuilding.
        for p in doc.paragraphs:
            p._element.getparent().remove(p._element)

        rebuild_document(doc, content_list)

        # mkstemp creates the file atomically, unlike the deprecated mktemp.
        fd, output_path = tempfile.mkstemp(suffix='.docx')
        os.close(fd)
        try:
            doc.save(output_path)
        except Exception:
            # Do not leave an empty temp file behind when saving fails.
            os.remove(output_path)
            raise

        empty_status = "已移除所有空段落" if remove_empty_paragraphs else "保留單個空段落"
        return output_path, f"✅ 處理完成!找到章節關鍵字:{', '.join(keywords)},{empty_status}"
    except Exception as e:
        # UI boundary: report the failure as a status message instead of raising.
        return None, f"❌ 處理失敗:{str(e)}"
|
|
|
|
|
def create_interface():
    """Build and return the Gradio Blocks UI for the formatter.

    Layout: an HTML header, then one row with two equal columns. The left
    column holds the .docx upload, the chapter-keyword textbox, the
    "remove empty paragraphs" checkbox and the start button; the right column
    holds the status textbox and the download file component. Clicking the
    button calls ``format_docx`` with the three inputs and routes its result
    to the download and status components, in that order.
    """
    header_html = """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>📄 Word 文件格式化工具</h1>
            <p>自動格式化您的 Word 文件,設定章節樣式和分頁</p>
        </div>
        """
    soft_theme = gr.themes.Soft()

    with gr.Blocks(title="Word 文件格式化工具", theme=soft_theme) as demo:
        gr.HTML(header_html)

        with gr.Row():
            # Left column: inputs and the trigger button.
            with gr.Column(scale=1):
                upload_box = gr.File(
                    label="上傳 Word 文件 (.docx)",
                    file_types=[".docx"],
                    file_count="single",
                )
                keyword_box = gr.Textbox(
                    label="章節分段方式",
                    placeholder="章,節,話",
                    value="章,節,話",
                )
                strip_blank_toggle = gr.Checkbox(
                    label="移除空段落",
                    value=True,
                    info="勾選時會移除所有空段落,取消勾選時會保留單個空段落(^p^p -> ^p)",
                )
                run_button = gr.Button("🔄 開始處理", variant="primary", size="lg")

            # Right column: outputs.
            with gr.Column(scale=1):
                status_box = gr.Textbox(label="處理狀態", interactive=False, lines=3)
                result_file = gr.File(label="下載處理後的文件", interactive=False)

        run_button.click(
            fn=format_docx,
            inputs=[upload_box, keyword_box, strip_blank_toggle],
            outputs=[result_file, status_box],
        )

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all interfaces at
    # port 7860; share=True additionally requests a Gradio share link.
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
    )