Spaces:

dseditor
/

Docfixer

Sleeping

App Files Files Community

Docfixer / app.py

dseditor

Upload app.py

f06ed39 verified 5 months ago

raw

history blame contribute delete

6.07 kB

	import gradio as gr
	import re
	from docx import Document
	from docx.shared import Cm, Pt
	from docx.enum.style import WD_STYLE_TYPE
	from docx.oxml import OxmlElement
	from docx.oxml.ns import qn
	import tempfile
	import os

	def set_outline_level(paragraph, level: int = 0):
	    """Write an outline level (``w:outlineLvl``) into a paragraph's properties.

	    Args:
	        paragraph: Object exposing ``_p.get_or_add_pPr()`` (a python-docx
	            paragraph in this file).
	        level: Value stored in the element's ``w:val`` attribute; written
	            as ``str(level)``.

	    The function is safe to call repeatedly on the same paragraph: an
	    existing ``w:outlineLvl`` child is updated in place instead of a second
	    one being appended.
	    """
	    pPr = paragraph._p.get_or_add_pPr()
	    outline = pPr.find(qn('w:outlineLvl'))
	    if outline is None:
	        # No outline level yet: create the element and attach it to w:pPr.
	        outline = OxmlElement('w:outlineLvl')
	        pPr.append(outline)
	    outline.set(qn('w:val'), str(level))

	def normalize_paragraph(text):
	    """Flatten a paragraph onto one line and squeeze repeated whitespace.

	    Runs of CR/LF characters become a single space, any run of two or more
	    whitespace characters becomes a single space, and the result is
	    stripped at both ends.
	    """
	    single_line = re.sub(r'[\r\n]+', ' ', text)
	    squeezed = re.sub(r'\s{2,}', ' ', single_line)
	    return squeezed.strip()

	def process_paragraphs_with_cleanup(doc, combined_pattern):
	    """Classify each paragraph and collapse consecutive blank ones.

	    Args:
	        doc: Object whose ``paragraphs`` attribute yields items with a
	            ``text`` attribute.
	        combined_pattern: Regex handed to ``re.search`` to recognise
	            heading paragraphs.

	    Returns:
	        A list of ``(kind, text)`` tuples where ``kind`` is ``'heading'``,
	        ``'paragraph'`` or ``'empty'``. A run of blank paragraphs yields a
	        single ``('empty', '')`` entry (the ``^p^p -> ^p`` effect).
	    """
	    entries = []
	    last_was_blank = False

	    for paragraph in doc.paragraphs:
	        stripped = paragraph.text.strip()

	        if stripped:
	            label = 'heading' if re.search(combined_pattern, stripped) else 'paragraph'
	            entries.append((label, stripped))
	            last_was_blank = False
	            continue

	        # Blank paragraph: keep it only when the previous one was not blank.
	        if not last_was_blank:
	            entries.append(('empty', ''))
	        last_was_blank = True

	    return entries

	def _apply_run_font(run, font_name, size=None):
	    """Set *font_name* (and optionally *size*) on a run, including CJK text."""
	    run.font.name = font_name
	    # python-docx's font.name only fills the ascii/hAnsi slots of w:rFonts;
	    # East Asian characters use the w:eastAsia slot, so set it explicitly or
	    # Chinese text keeps the document's default font.
	    run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
	    if size is not None:
	        run.font.size = size


	def rebuild_document(doc, content_list):
	    """Append formatted paragraphs to *doc* from a classified content list.

	    Args:
	        doc: Document object providing ``add_paragraph``.
	        content_list: Iterable of ``(kind, text)`` tuples. ``'heading'``
	            entries become 'Heading 1' paragraphs starting on a new page,
	            ``'paragraph'`` entries become normalized body paragraphs with
	            a 0.7 cm first-line indent, and ``'empty'`` entries become
	            blank paragraphs. Any other kind is ignored.
	    """
	    for kind, text in content_list:
	        if kind == 'heading':
	            heading = doc.add_paragraph(text, style='Heading 1')
	            fmt = heading.paragraph_format
	            fmt.page_break_before = True
	            fmt.space_before = Cm(0)
	            fmt.space_after = Cm(0.3)
	            fmt.line_spacing = 1.0
	            fmt.left_indent = Cm(0)
	            fmt.first_line_indent = Cm(0)
	            set_outline_level(heading, 0)
	            for run in heading.runs:
	                _apply_run_font(run, '新細明體', Pt(16))
	        elif kind == 'paragraph':
	            clean_text = normalize_paragraph(text)
	            para = doc.add_paragraph(clean_text)
	            fmt = para.paragraph_format
	            fmt.space_before = Cm(0)
	            fmt.space_after = Cm(0)
	            fmt.line_spacing = 1.0
	            fmt.left_indent = Cm(0)
	            fmt.first_line_indent = Cm(0.7)
	            for run in para.runs:
	                _apply_run_font(run, '新細明體')
	        elif kind == 'empty':
	            # Blank paragraphs are only present when the caller chose to keep them.
	            doc.add_paragraph('')

	def _parse_chapter_keywords(chapter_keywords):
	    """Split the user's keyword list on ',', '，' or '、', dropping blank entries."""
	    parts = (piece.strip() for piece in re.split(r'[,，、]', chapter_keywords))
	    return [piece for piece in parts if piece]


	def _build_chapter_pattern(keywords):
	    """Return one regex alternation matching ``第<number><keyword>`` for every keyword."""
	    numerals = '0-9一二三四五六七八九十百千萬壹貳參肆伍陸柒捌玖拾佰仟萬IVXLCDMivxlcdm'
	    # \s* (not \s) so both "第一章" and "第 1 章" match; keywords are escaped
	    # because they come straight from user input.
	    return '|'.join(rf'第\s*[{numerals}]+\s*{re.escape(k)}' for k in keywords)


	def format_docx(file, chapter_keywords, remove_empty_paragraphs=True):
	    """Reformat an uploaded .docx: detect chapter headings and restyle all paragraphs.

	    Args:
	        file: Uploaded file, either a path (``str`` / ``os.PathLike``) or an
	            object with a ``name`` attribute holding the path. ``None``
	            yields a prompt to upload a file.
	        chapter_keywords: Separator-delimited chapter words (e.g. "章,節,話").
	            ASCII commas, full-width commas and '、' are accepted.
	        remove_empty_paragraphs: When true, every blank paragraph is
	            dropped; otherwise runs of blank paragraphs are collapsed to one.

	    Returns:
	        ``(output_path, status_message)``. ``output_path`` is ``None`` when
	        input validation fails or processing raises.
	    """
	    if file is None:
	        return None, "請上傳一個 Word 文件"

	    keywords = _parse_chapter_keywords(chapter_keywords)
	    if not keywords:
	        # Covers empty input as well as input made only of separators, which
	        # would otherwise build a pattern matching any "第<number>" text.
	        return None, "請輸入章節分段方式（例如：章,節,話）"

	    try:
	        # Newer Gradio versions may pass a plain path instead of a file wrapper.
	        source_path = file if isinstance(file, (str, os.PathLike)) else file.name
	        doc = Document(source_path)

	        # Make sure a 'Heading 1' style exists before paragraphs reference it.
	        styles = doc.styles
	        if 'Heading 1' not in styles:
	            heading_style = styles.add_style('Heading 1', WD_STYLE_TYPE.PARAGRAPH)
	            heading_style.base_style = styles['Normal']
	            heading_style.font.bold = True
	            heading_style.font.size = Pt(16)

	        combined_pattern = _build_chapter_pattern(keywords)

	        # Classify paragraphs and collapse consecutive blank ones.
	        content_list = process_paragraphs_with_cleanup(doc, combined_pattern)

	        if remove_empty_paragraphs:
	            content_list = [item for item in content_list if item[0] != 'empty']

	        # Remove the original paragraphs; doc.paragraphs is a fresh list, so
	        # removing elements while looping over it is safe.
	        for p in doc.paragraphs:
	            p._element.getparent().remove(p._element)

	        rebuild_document(doc, content_list)

	        # mktemp() only returns a name and is race-prone; create the file
	        # atomically and keep it so the caller can serve it for download.
	        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
	            output_path = tmp.name
	        doc.save(output_path)

	        empty_status = "已移除所有空段落" if remove_empty_paragraphs else "保留單個空段落"
	        return output_path, f"✅ 處理完成！找到章節關鍵字：{', '.join(keywords)}，{empty_status}"
	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of raising.
	        return None, f"❌ 處理失敗：{str(e)}"

	def create_interface():
	    """Assemble the Gradio Blocks UI and connect the button to ``format_docx``.

	    Layout: a centered HTML banner, then one row with two columns. The
	    left column holds the upload, keyword box, checkbox and button; the
	    right column holds the status box and the download slot.

	    Returns:
	        The constructed ``gr.Blocks`` object; launching is left to the caller.
	    """
	    banner_html = """
	<div style="text-align: center; margin-bottom: 20px;">
	<h1>📄 Word 文件格式化工具</h1>
	<p>自動格式化您的 Word 文件，設定章節樣式和分頁</p>
	</div>
	"""

	    with gr.Blocks(title="Word 文件格式化工具", theme=gr.themes.Soft()) as app:
	        gr.HTML(banner_html)

	        with gr.Row():
	            # Left column: inputs and the trigger button.
	            with gr.Column(scale=1):
	                upload = gr.File(
	                    label="上傳 Word 文件 (.docx)",
	                    file_types=[".docx"],
	                    file_count="single",
	                )
	                keywords_box = gr.Textbox(
	                    label="章節分段方式",
	                    placeholder="章,節,話",
	                    value="章,節,話",
	                )
	                drop_blank_box = gr.Checkbox(
	                    label="移除空段落",
	                    value=True,
	                    info="勾選時會移除所有空段落，取消勾選時會保留單個空段落（^p^p -> ^p）",
	                )
	                run_button = gr.Button("🔄 開始處理", variant="primary", size="lg")

	            # Right column: results.
	            with gr.Column(scale=1):
	                status_box = gr.Textbox(label="處理狀態", interactive=False, lines=3)
	                result_file = gr.File(label="下載處理後的文件", interactive=False)

	        # format_docx returns (path, message), hence the output order below.
	        handler_inputs = [upload, keywords_box, drop_blank_box]
	        handler_outputs = [result_file, status_box]
	        run_button.click(
	            fn=format_docx,
	            inputs=handler_inputs,
	            outputs=handler_outputs,
	        )

	    return app

	if __name__ == "__main__":
	    # Script entry point: build the UI, then serve it on all interfaces at
	    # port 7860 with Gradio's share option enabled.
	    demo = create_interface()
	    demo.launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	        share=True,
	    )