import json
import os
import re
import tempfile
import unicodedata
from typing import Dict, Optional

import fitz  # PyMuPDF
import gradio as gr
import requests
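
# Note (added): `fitz` is the import name of the PyMuPDF package, so the
# runtime environment is assumed to provide something like:
#   pip install gradio requests pymupdf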


def clean_text(text: str) -> str:
    """Clean text: normalize Unicode and collapse extra whitespace."""
    if not text:
        return ""

    # Normalize full-width/compatibility characters, then collapse whitespace.
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'\s+', ' ', text.strip())
    return text
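
# Illustrative example (added, not in the original source):
#   clean_text("Ｈｅｌｌｏ　  世界\n")  ->  "Hello 世界"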


def extract_title_from_collection(collection_name: str) -> str:
    """Derive a concise title from the collection name."""
    keywords_to_remove = ['研究', '論文集', '期刊', '學報', '彙編', '全刊', '下載']

    title = collection_name
    for keyword in keywords_to_remove:
        title = title.replace(keyword, '')

    # Drop digits and anything that is neither a word character nor a CJK character.
    title = re.sub(r'\d+', '', title)
    title = re.sub(r'[^\w\u4e00-\u9fff]', '', title)

    return clean_text(title) if title.strip() else collection_name
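
# Example (added; uses the default collection name shown in the UI below):
#   extract_title_from_collection("刑事政策與犯罪防治研究36") -> "刑事政策與犯罪防治"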


def download_pdf(url: str) -> Optional[str]:
    """Download a PDF and return the path of a temporary file, or None on failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, stream=True, timeout=30)
        response.raise_for_status()

        # Write the downloaded bytes to a temporary file; the caller deletes it.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(response.content)
        temp_file.close()

        return temp_file.name
    except Exception as e:
        print(f"下載PDF失敗: {e}")
        return None
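
# Usage sketch (added; URL is a placeholder): callers are expected to remove
# the temporary file themselves, as the processing functions below do.
#   path = download_pdf("https://example.com/paper.pdf")
#   if path:
#       try:
#           info = extract_pdf_content(path)
#       finally:
#           os.unlink(path)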


def extract_pdf_content(pdf_path: str) -> Dict[str, str]:
    """Extract title, author and abstract from the first pages of a PDF."""
    try:
        doc = fitz.open(pdf_path)

        # Only the first three pages are read; that is where the title, author
        # and abstract normally appear.
        full_text = ""
        for page_num in range(min(3, len(doc))):
            page = doc[page_num]
            text = page.get_text()
            full_text += text + "\n"

        doc.close()

        # Normalize Unicode but keep the line breaks: the extractors below rely
        # on line structure, so collapsing whitespace here (as clean_text does)
        # would break them.
        full_text = unicodedata.normalize('NFKC', full_text)

        title = extract_title(full_text)
        author = extract_author(full_text)
        abstract = extract_abstract(full_text)

        return {
            "title": title,
            "author": author,
            "abstract": abstract
        }
    except Exception as e:
        print(f"提取PDF內容失敗: {e}")
        return {"title": "", "author": "", "abstract": ""}


def extract_title(text: str) -> str:
    """Extract a likely title from the extracted text."""
    lines = text.split('\n')

    # Lines containing these keywords are unlikely to be the title.
    skip_keywords = ['頁', 'page', '目錄', '內容', '摘要', 'abstract', '關鍵詞']

    for line in lines[:20]:
        line = line.strip()
        if 5 < len(line) < 100:
            if not any(keyword in line.lower() for keyword in skip_keywords):
                if not re.match(r'^\d+\.?\d*$', line):
                    return line

    # Fallback: the first non-trivial line, truncated.
    for line in lines:
        line = line.strip()
        if line and len(line) > 3:
            return line[:80]

    return "未知標題"


def extract_author(text: str) -> str:
    """Extract a likely author from the extracted text."""
    # Look for explicit author markers first.
    author_patterns = [
        r'作者[::]\s*([^\n]+)',
        r'著者[::]\s*([^\n]+)',
        r'by\s+([^\n]+)',
        r'撰稿[::]\s*([^\n]+)',
    ]

    for pattern in author_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            author = clean_text(match.group(1))
            return author[:50]

    # Fallback: a short line that looks like Chinese names separated by 、.
    chinese_name_pattern = r'[\u4e00-\u9fff]{2,4}(?:\s*、\s*[\u4e00-\u9fff]{2,4})*'

    lines = text.split('\n')
    for line in lines[:30]:
        line = line.strip()
        if len(line) < 100:
            matches = re.findall(chinese_name_pattern, line)
            if matches and len(line) < 50:
                return line

    return "未知作者"


def extract_abstract(text: str) -> str:
    """Extract a likely abstract from the extracted text."""
    abstract_keywords = ['摘要', 'abstract', '內容摘要', '研究摘要']

    text_lines = text.split('\n')
    abstract_start = -1

    # Find the line where the abstract section starts.
    for i, line in enumerate(text_lines):
        line_lower = line.lower().strip()
        if any(keyword in line_lower for keyword in abstract_keywords):
            abstract_start = i
            break

    if abstract_start == -1:
        # No explicit abstract heading: take a slice near the top of the text.
        abstract_text = ' '.join(text_lines[5:15])
    else:
        # Take the lines that follow the abstract heading.
        abstract_lines = text_lines[abstract_start + 1:abstract_start + 15]
        abstract_text = ' '.join(abstract_lines)

    abstract_text = clean_text(abstract_text)
    if len(abstract_text) > 500:
        abstract_text = abstract_text[:500] + "..."

    return abstract_text if abstract_text else "無摘要資訊"


def process_json_data(json_input: str, manual_abstract: str = "") -> str:
    """Process the input JSON records and fill in the missing fields."""
    try:
        data = json.loads(json_input)

        # Accept a single object as well as a list of objects.
        if not isinstance(data, list):
            data = [data]

        processed_data = []

        for item in data:
            collection_name = item.get("論文集名稱", "")
            author = item.get("作者", "")
            download_url = item.get("下載位置", "")

            processed_item = {
                "論文集名稱": collection_name,
                "作者": author,
                "下載位置": download_url,
                "名稱": "",
                "摘要": ""
            }

            use_manual_abstract = manual_abstract.strip()

            if download_url:
                print(f"正在處理: {collection_name}")
                pdf_path = download_pdf(download_url)

                if pdf_path:
                    try:
                        extracted_info = extract_pdf_content(pdf_path)

                        # Prefer the title found in the PDF; strip the collection
                        # name from it when it is embedded there.
                        pdf_title = extracted_info.get("title", "")
                        if pdf_title and pdf_title != "未知標題":
                            collection_base = extract_title_from_collection(collection_name)
                            if collection_base not in pdf_title:
                                processed_item["名稱"] = pdf_title
                            else:
                                processed_item["名稱"] = pdf_title.replace(collection_base, "").strip()
                        else:
                            processed_item["名稱"] = extract_title_from_collection(collection_name)

                        # Only overwrite the author when the original one is
                        # missing or is the generic editorial credit.
                        pdf_author = extracted_info.get("author", "")
                        if pdf_author and pdf_author != "未知作者":
                            if not author or author == "犯罪防治研究中心彙編":
                                processed_item["作者"] = pdf_author

                        if use_manual_abstract:
                            processed_item["摘要"] = use_manual_abstract
                        else:
                            processed_item["摘要"] = extracted_info.get("abstract", "無摘要資訊")

                    finally:
                        # Remove the temporary PDF regardless of the outcome.
                        if os.path.exists(pdf_path):
                            os.unlink(pdf_path)
                else:
                    processed_item["名稱"] = extract_title_from_collection(collection_name)
                    processed_item["摘要"] = use_manual_abstract if use_manual_abstract else "無法獲取摘要資訊"
            else:
                processed_item["名稱"] = extract_title_from_collection(collection_name)
                processed_item["摘要"] = use_manual_abstract if use_manual_abstract else "無下載位置,無法提取摘要"

            processed_data.append(processed_item)

        return json.dumps(processed_data, ensure_ascii=False, indent=2)

    except Exception as e:
        return f"處理錯誤: {str(e)}"
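
# Programmatic usage sketch (added; mirrors the default example in the UI,
# with the download URL left empty so nothing is fetched):
#   sample = '[{"論文集名稱": "刑事政策與犯罪防治研究36", "作者": "犯罪防治研究中心彙編", "下載位置": ""}]'
#   print(process_json_data(sample))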


def generate_filename_from_collection(collection_name: str) -> str:
    """Generate a safe filename from the collection name."""
    if not collection_name:
        return "processed_data.json"

    # Replace characters that are not word characters, CJK characters or '-'.
    safe_name = re.sub(r'[^\w\u4e00-\u9fff\-]', '_', collection_name)

    # Collapse runs of underscores and trim them from both ends.
    safe_name = re.sub(r'_+', '_', safe_name).strip('_')

    # Keep the filename reasonably short.
    if len(safe_name) > 50:
        safe_name = safe_name[:50]

    if not safe_name.endswith('.json'):
        safe_name += '.json'

    return safe_name
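
# Example (added; uses the default collection name from the UI below):
#   generate_filename_from_collection("刑事政策與犯罪防治研究36") -> "刑事政策與犯罪防治研究36.json"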


def save_json_file(json_data: str, filename: Optional[str] = None,
                   auto_filename: bool = False,
                   collection_name: Optional[str] = None) -> tuple:
    """Save the JSON string to a file, optionally deriving the filename automatically."""
    try:
        if auto_filename and collection_name:
            filename = generate_filename_from_collection(collection_name)
        elif not filename:
            filename = "processed_data.json"

        if not filename.endswith('.json'):
            filename += '.json'

        # Sanitize the filename once more before writing.
        filename = re.sub(r'[^\w\u4e00-\u9fff\-_\.]', '_', filename)

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(json_data)

        return f"文件已保存: {filename}", filename
    except Exception as e:
        return f"保存失敗: {str(e)}", None


def process_pdf_urls(urls_text: str, manual_abstract: str = "") -> str:
    """Process a list of PDF URLs (one per line) and extract information from each."""
    try:
        urls = [url.strip() for url in urls_text.strip().split('\n') if url.strip()]

        if not urls:
            return "請輸入至少一個PDF網址"

        processed_data = []
        use_manual_abstract = manual_abstract.strip()

        for i, url in enumerate(urls, 1):
            print(f"正在處理第 {i}/{len(urls)} 個PDF: {url}")

            pdf_path = download_pdf(url)

            if pdf_path:
                try:
                    extracted_info = extract_pdf_content(pdf_path)

                    item = {
                        "名稱": extracted_info.get("title", f"PDF文件 {i}"),
                        "作者": extracted_info.get("author", "未知作者"),
                        "摘要": use_manual_abstract if use_manual_abstract else extracted_info.get("abstract", "無摘要資訊"),
                        "下載位置": url,
                        "論文集名稱": f"直接處理PDF {i}"
                    }

                    processed_data.append(item)
                finally:
                    # Always remove the temporary PDF, even if extraction fails.
                    if os.path.exists(pdf_path):
                        os.unlink(pdf_path)
            else:
                # Download failed: record a placeholder entry so the URL is not lost.
                item = {
                    "名稱": f"無法下載的PDF {i}",
                    "作者": "未知作者",
                    "摘要": use_manual_abstract if use_manual_abstract else "PDF下載失敗,無法提取摘要",
                    "下載位置": url,
                    "論文集名稱": f"處理失敗 {i}"
                }
                processed_data.append(item)

        return json.dumps(processed_data, ensure_ascii=False, indent=2)

    except Exception as e:
        return f"處理錯誤: {str(e)}"
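
# Gradio UI: two tabs, "JSON資料處理" (enrich pasted JSON records) and
# "PDF網址直接處理" (build records straight from PDF URLs).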


with gr.Blocks(title="PDF資料處理器", theme=gr.themes.Soft()) as demo:
    with gr.Tabs():
        with gr.TabItem("JSON資料處理"):
            with gr.Row():
                with gr.Column(scale=1):
                    json_input = gr.Textbox(
                        label="輸入JSON資料",
                        placeholder="請貼上您的JSON資料...",
                        lines=8,
                        value='[\n {\n "論文集名稱": "刑事政策與犯罪防治研究36",\n "作者": "犯罪防治研究中心彙編",\n "下載位置": "https://www.cprc.moj.gov.tw/media/20213330/3_36%E6%9C%9F%E5%85%A8%E5%88%8A%E4%B8%8B%E8%BC%89.pdf?mediaDL=true"\n }\n]'
                    )

                    manual_abstract1 = gr.Textbox(
                        label="手動輸入摘要 (選填)",
                        placeholder="如果有摘要,請在此輸入。留空則自動從PDF中提取。",
                        lines=4
                    )

                    with gr.Row():
                        auto_filename1 = gr.Checkbox(
                            label="自動使用論文集名稱作為檔名",
                            value=True
                        )

                    filename_input1 = gr.Textbox(
                        label="自訂文件名 (僅在未勾選自動檔名時使用)",
                        placeholder="例: processed_papers.json",
                        value="processed_papers.json",
                        visible=False
                    )

                    process_json_btn = gr.Button("處理JSON資料", variant="primary", size="lg")

                with gr.Column(scale=2):
                    output_json1 = gr.Textbox(
                        label="處理結果",
                        lines=18,
                        show_copy_button=True
                    )

                    save_status1 = gr.Textbox(
                        label="保存狀態",
                        lines=2
                    )

                    download_file1 = gr.File(
                        label="下載處理後的文件",
                        visible=False
                    )

        with gr.TabItem("PDF網址直接處理"):
            with gr.Row():
                with gr.Column(scale=1):
                    pdf_urls_input = gr.Textbox(
                        label="輸入PDF網址",
                        placeholder="請輸入PDF網址,每行一個...\n\n例如:\nhttps://example.com/paper1.pdf\nhttps://example.com/paper2.pdf\nhttps://example.com/paper3.pdf",
                        lines=8,
                        value="https://www.cprc.moj.gov.tw/media/20213330/3_36%E6%9C%9F%E5%85%A8%E5%88%8A%E4%B8%8B%E8%BC%89.pdf?mediaDL=true"
                    )

                    manual_abstract2 = gr.Textbox(
                        label="手動輸入摘要 (選填)",
                        placeholder="如果有摘要,請在此輸入。留空則自動從PDF中提取。",
                        lines=4
                    )

                    with gr.Row():
                        auto_filename2 = gr.Checkbox(
                            label="自動使用第一個PDF標題作為檔名",
                            value=True
                        )

                    filename_input2 = gr.Textbox(
                        label="自訂文件名 (僅在未勾選自動檔名時使用)",
                        placeholder="例: pdf_extracted_data.json",
                        value="pdf_extracted_data.json",
                        visible=False
                    )

                    process_urls_btn = gr.Button("處理PDF網址", variant="primary", size="lg")

                with gr.Column(scale=2):
                    output_json2 = gr.Textbox(
                        label="處理結果",
                        lines=18,
                        show_copy_button=True
                    )

                    save_status2 = gr.Textbox(
                        label="保存狀態",
                        lines=2
                    )

                    download_file2 = gr.File(
                        label="下載處理後的文件",
                        visible=False
                    )

    def process_and_save_json(json_input, manual_abstract, auto_filename, custom_filename):
        # Run the JSON pipeline, then save the result to disk.
        result = process_json_data(json_input, manual_abstract)

        # Read the collection name from the raw input so it can drive the filename.
        collection_name = ""
        try:
            data = json.loads(json_input)
            if isinstance(data, list) and len(data) > 0:
                collection_name = data[0].get("論文集名稱", "")
            elif isinstance(data, dict):
                collection_name = data.get("論文集名稱", "")
        except Exception:
            pass

        save_msg, actual_filename = save_json_file(
            result,
            custom_filename if not auto_filename else None,
            auto_filename,
            collection_name
        )

        if actual_filename and "已保存" in save_msg:
            return result, save_msg, gr.update(visible=True, value=actual_filename)
        else:
            return result, save_msg, gr.update(visible=False)

    def process_and_save_urls(urls_input, manual_abstract, auto_filename, custom_filename):
        # Run the URL pipeline, then save the result to disk.
        result = process_pdf_urls(urls_input, manual_abstract)

        # When auto-naming, reuse the first extracted title as the filename base.
        title_for_filename = ""
        if auto_filename:
            try:
                data = json.loads(result)
                if isinstance(data, list) and len(data) > 0:
                    title_for_filename = data[0].get("名稱", "")
            except Exception:
                pass

        save_msg, actual_filename = save_json_file(
            result,
            custom_filename if not auto_filename else None,
            auto_filename,
            title_for_filename
        )

        if actual_filename and "已保存" in save_msg:
            return result, save_msg, gr.update(visible=True, value=actual_filename)
        else:
            return result, save_msg, gr.update(visible=False)

    def toggle_filename_input1(auto_filename):
        return gr.update(visible=not auto_filename)

    def toggle_filename_input2(auto_filename):
        return gr.update(visible=not auto_filename)

    # Show the custom-filename field only when automatic naming is unchecked.
    auto_filename1.change(
        toggle_filename_input1,
        inputs=[auto_filename1],
        outputs=[filename_input1]
    )

    auto_filename2.change(
        toggle_filename_input2,
        inputs=[auto_filename2],
        outputs=[filename_input2]
    )

    process_json_btn.click(
        process_and_save_json,
        inputs=[json_input, manual_abstract1, auto_filename1, filename_input1],
        outputs=[output_json1, save_status1, download_file1]
    )

    process_urls_btn.click(
        process_and_save_urls,
        inputs=[pdf_urls_input, manual_abstract2, auto_filename2, filename_input2],
        outputs=[output_json2, save_status2, download_file2]
    )

    gr.Markdown("## 基本功能:JSON處理 | PDF網址處理 | 自動檔名 | 手動摘要")


if __name__ == "__main__":
    demo.launch()