"""Gradio app that fills in paper metadata (title, author, abstract) from PDFs.

Two workflows are exposed as tabs:

* JSON processing: each input record names a collection, an author and a
  download URL; the PDF is downloaded and the missing fields are extracted.
* Direct URL processing: one PDF URL per line, each turned into a record.

All extraction is heuristic and based on the text of the first few pages.
"""
import json
import os
import re
import tempfile
import unicodedata
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import fitz  # PyMuPDF
import gradio as gr
import requests

# Only the first pages are scanned; title/author/abstract live at the front.
_MAX_PAGES = 3
_DOWNLOAD_CHUNK_SIZE = 8192
_DEFAULT_FILENAME = "processed_data.json"

_COLLECTION_NOISE_WORDS = ['研究', '論文集', '期刊', '學報', '彙編', '全刊', '下載']
_TITLE_SKIP_KEYWORDS = ['頁', 'page', '目錄', '內容', '摘要', 'abstract', '關鍵詞']
_AUTHOR_PATTERNS = [
    re.compile(r'作者[::]\s*([^\n]+)', re.IGNORECASE),
    re.compile(r'著者[::]\s*([^\n]+)', re.IGNORECASE),
    # \b keeps "by" from matching inside words such as "nearby".
    re.compile(r'\bby\s+([^\n]+)', re.IGNORECASE),
    re.compile(r'撰稿[::]\s*([^\n]+)', re.IGNORECASE),
]
# A line made up solely of 2-4 character CJK names separated by 、 or a comma.
_NAME_LINE = re.compile(
    r'[\u4e00-\u9fff]{2,4}(?:\s*[、,]\s*[\u4e00-\u9fff]{2,4})*'
)
# Every heading variant the app recognises contains one of these two tokens.
_ABSTRACT_HEADING = re.compile(r'摘要|abstract', re.IGNORECASE)

_SAMPLE_PDF_URL = (
    "https://www.cprc.moj.gov.tw/media/20213330/"
    "3_36%E6%9C%9F%E5%85%A8%E5%88%8A%E4%B8%8B%E8%BC%89.pdf?mediaDL=true"
)
_SAMPLE_JSON = (
    '[\n {\n "論文集名稱": "刑事政策與犯罪防治研究36",\n'
    ' "作者": "犯罪防治研究中心彙編",\n'
    ' "下載位置": "' + _SAMPLE_PDF_URL + '"\n }\n]'
)


def clean_text(text: str) -> str:
    """Return *text* NFKC-normalized with whitespace runs collapsed.

    Newlines count as whitespace, so the result is always a single line.
    Empty or ``None`` input yields ``""``.
    """
    if not text:
        return ""
    normalized = unicodedata.normalize('NFKC', text)
    return re.sub(r'\s+', ' ', normalized.strip())


def _clean_lines(text: str) -> str:
    """Clean every line of *text* separately, dropping lines that end up empty.

    Unlike ``clean_text`` this keeps the line structure, which the title,
    author and abstract heuristics depend on.
    """
    cleaned = (clean_text(line) for line in (text or "").splitlines())
    return "\n".join(line for line in cleaned if line)


def _remove_file(path: str) -> None:
    """Delete a temporary file, reporting (not raising) on failure."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass  # already gone, nothing to clean up
    except OSError as e:
        print(f"清理臨時文件失敗: {e}")


def extract_title_from_collection(collection_name: str) -> str:
    """Derive a short title from a collection name.

    Generic words (journal, proceedings, download, ...), digits and
    punctuation are stripped. If nothing is left, the original name is
    returned unchanged; empty or ``None`` input yields ``""``.
    """
    if not collection_name:
        return ""
    title = collection_name
    for keyword in _COLLECTION_NOISE_WORDS:
        title = title.replace(keyword, '')
    title = re.sub(r'\d+', '', title)
    title = re.sub(r'[^\w\u4e00-\u9fff]', '', title)
    return clean_text(title) if title.strip() else collection_name


def download_pdf(url: str) -> Optional[str]:
    """Download *url* into a temporary ``.pdf`` file.

    Returns:
        The path of the temporary file (the caller must delete it), or
        ``None`` when the download fails for any reason.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    temp_path = None
    try:
        # The response is a context manager so the connection is always
        # released; the body is streamed to disk instead of held in memory.
        with requests.get(url, headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
                temp_path = temp_file.name
                for chunk in response.iter_content(chunk_size=_DOWNLOAD_CHUNK_SIZE):
                    if chunk:
                        temp_file.write(chunk)
        return temp_path
    except Exception as e:  # best-effort boundary: any failure means "no PDF"
        print(f"下載PDF失敗: {e}")
        if temp_path:
            # Do not leave a partially written file behind.
            _remove_file(temp_path)
        return None


def extract_pdf_content(pdf_path: str) -> Dict[str, str]:
    """Extract title, author and abstract from the first pages of a PDF.

    Returns:
        A dict with the keys ``"title"``, ``"author"`` and ``"abstract"``.
        All three values are ``""`` when the PDF cannot be read.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            page_count = min(_MAX_PAGES, len(doc))
            pages = [doc[page_num].get_text() for page_num in range(page_count)]
        finally:
            doc.close()

        # Clean line by line: collapsing the whole text into one line would
        # defeat the line-based heuristics below.
        full_text = _clean_lines("\n".join(pages))

        return {
            "title": extract_title(full_text),
            "author": extract_author(full_text),
            "abstract": extract_abstract(full_text),
        }
    except Exception as e:  # best-effort boundary: report and return blanks
        print(f"提取PDF內容失敗: {e}")
        return {"title": "", "author": "", "abstract": ""}


def extract_title(text: str) -> str:
    """Guess the document title from newline-separated *text*.

    The first of the leading 20 lines that is 6-99 characters long, contains
    no header/footer keyword and is not a bare number wins. Otherwise the
    first line longer than 3 characters is used (cut to 80 characters), and
    ``"未知標題"`` is returned when there is none.
    """
    lines = [line.strip() for line in text.split('\n')]

    for line in lines[:20]:
        if not 5 < len(line) < 100:
            continue
        lowered = line.lower()
        if any(keyword in lowered for keyword in _TITLE_SKIP_KEYWORDS):
            continue
        if re.match(r'^\d+\.?\d*$', line):  # a bare (page) number
            continue
        return line

    for line in lines:
        if len(line) > 3:
            return line[:80]

    return "未知標題"


def extract_author(text: str) -> str:
    """Guess the author(s) from newline-separated *text*.

    An explicit marker ("作者:", "著者:", "by ", "撰稿:") is preferred and its
    value is cut to 50 characters. Failing that, the first of the leading 30
    lines that consists only of CJK names is returned. ``"未知作者"`` is
    returned when nothing matches.
    """
    for pattern in _AUTHOR_PATTERNS:
        match = pattern.search(text)
        if match:
            author = clean_text(match.group(1))
            if author:
                return author[:50]

    for line in text.split('\n')[:30]:
        candidate = line.strip()
        # The whole line must be names; a partial match would also accept
        # titles and ordinary sentences.
        if len(candidate) < 50 and _NAME_LINE.fullmatch(candidate):
            return candidate

    return "未知作者"


def extract_abstract(text: str) -> str:
    """Extract an abstract from newline-separated *text*.

    The abstract is the text following the first line that contains "摘要"
    or "abstract" (case-insensitive): the remainder of that line plus the
    next 14 lines. Without such a heading, lines 6-15 of the text are used.
    The result is collapsed to one line and cut to 500 characters plus
    ``"..."``; ``"無摘要資訊"`` is returned when it would be empty.
    """
    lines = text.split('\n')

    for index, line in enumerate(lines):
        heading = _ABSTRACT_HEADING.search(line)
        if heading:
            # Keep whatever follows the heading on the same line
            # (e.g. "摘要:本文…"), minus the separating punctuation.
            remainder = line[heading.end():].lstrip(' ::')
            body = [remainder] + lines[index + 1:index + 15]
            break
    else:
        body = lines[5:15]

    abstract_text = clean_text(' '.join(body))
    if len(abstract_text) > 500:
        abstract_text = abstract_text[:500] + "..."

    return abstract_text if abstract_text else "無摘要資訊"


def _process_collection_item(item: dict, manual_abstract: str) -> Dict[str, str]:
    """Build one output record for ``process_json_data``.

    Raises:
        ValueError: if *item* is not a JSON object.
    """
    if not isinstance(item, dict):
        raise ValueError("每個項目必須是JSON物件")

    collection_name = item.get("論文集名稱", "")
    author = item.get("作者", "")
    download_url = item.get("下載位置", "")
    collection_base = extract_title_from_collection(collection_name)

    processed_item = {
        "論文集名稱": collection_name,
        "作者": author,
        "下載位置": download_url,
        "名稱": collection_base,
        "摘要": "",
    }

    if not download_url:
        processed_item["摘要"] = manual_abstract or "無下載位置,無法提取摘要"
        return processed_item

    print(f"正在處理: {collection_name}")
    pdf_path = download_pdf(download_url)
    if not pdf_path:
        processed_item["摘要"] = manual_abstract or "無法獲取摘要資訊"
        return processed_item

    try:
        extracted_info = extract_pdf_content(pdf_path)
    finally:
        _remove_file(pdf_path)

    pdf_title = extracted_info.get("title", "")
    if pdf_title and pdf_title != "未知標題":
        if collection_base and collection_base in pdf_title:
            # Avoid repeating the collection name, but never end up with an
            # empty name when the title is exactly the collection name.
            trimmed = pdf_title.replace(collection_base, "").strip()
            processed_item["名稱"] = trimmed or pdf_title
        else:
            processed_item["名稱"] = pdf_title

    pdf_author = extracted_info.get("author", "")
    if pdf_author and pdf_author != "未知作者":
        # Only replace a missing author or the generic "compiled by" one.
        if not author or author == "犯罪防治研究中心彙編":
            processed_item["作者"] = pdf_author

    processed_item["摘要"] = (
        manual_abstract or extracted_info.get("abstract") or "無摘要資訊"
    )
    return processed_item


def process_json_data(json_input: str, manual_abstract: str = "") -> str:
    """Fill in the missing fields of every record in *json_input*.

    Args:
        json_input: JSON text holding one object or a list of objects with
            the keys "論文集名稱", "作者" and "下載位置".
        manual_abstract: if non-blank, used as the abstract of every record
            instead of the one extracted from the PDF.

    Returns:
        Pretty-printed JSON (a list of records with the added keys "名稱"
        and "摘要"), or a message starting with ``"處理錯誤: "`` on failure.
    """
    try:
        data = json.loads(json_input)
        if not isinstance(data, list):
            data = [data]

        use_manual_abstract = (manual_abstract or "").strip()
        processed_data = [
            _process_collection_item(item, use_manual_abstract) for item in data
        ]
        return json.dumps(processed_data, ensure_ascii=False, indent=2)
    except Exception as e:  # UI boundary: report the problem as text
        return f"處理錯誤: {str(e)}"


def generate_filename_from_collection(collection_name: str) -> str:
    """Turn a collection name into a safe ``.json`` file name.

    Characters other than word characters, CJK and ``-`` become ``_``, runs
    of ``_`` are merged and the stem is limited to 50 characters. Falls back
    to ``"processed_data.json"`` when no usable character remains.
    """
    if not collection_name:
        return _DEFAULT_FILENAME

    safe_name = re.sub(r'[^\w\u4e00-\u9fff\-]', '_', collection_name)
    safe_name = re.sub(r'_+', '_', safe_name).strip('_')
    safe_name = safe_name[:50]

    if not safe_name:
        # A name made only of symbols must not become a bare ".json".
        return _DEFAULT_FILENAME
    if not safe_name.endswith('.json'):
        safe_name += '.json'
    return safe_name


def save_json_file(json_data: str, filename: str = None,
                   auto_filename: bool = False,
                   collection_name: str = None) -> tuple:
    """Write *json_data* to a file in the current working directory.

    Args:
        json_data: the text to write (UTF-8).
        filename: explicit file name; used unless an automatic name applies.
        auto_filename: derive the name from *collection_name* when given.
        collection_name: source of the automatic file name.

    Returns:
        ``(status message, file name)``; the file name is ``None`` when
        saving failed.
    """
    try:
        if auto_filename and collection_name:
            filename = generate_filename_from_collection(collection_name)
        else:
            filename = (filename or "").strip() or _DEFAULT_FILENAME

        if not filename.endswith('.json'):
            filename += '.json'

        # Path separators are replaced too, so the file cannot leave the
        # working directory.
        filename = re.sub(r'[^\w\u4e00-\u9fff\-_\.]', '_', filename)

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(json_data)

        return f"文件已保存: {filename}", filename
    except (OSError, TypeError, ValueError) as e:
        return f"保存失敗: {str(e)}", None


def process_pdf_urls(urls_text: str, manual_abstract: str = "") -> str:
    """Download each PDF listed in *urls_text* and extract its metadata.

    Args:
        urls_text: PDF URLs, one per line; blank lines are ignored.
        manual_abstract: if non-blank, used as the abstract of every record.

    Returns:
        Pretty-printed JSON (a list with one record per URL, including URLs
        that failed to download), a prompt when no URL was given, or a
        message starting with ``"處理錯誤: "`` on failure.
    """
    try:
        urls = [url.strip() for url in (urls_text or "").split('\n') if url.strip()]
        if not urls:
            return "請輸入至少一個PDF網址"

        processed_data = []
        use_manual_abstract = (manual_abstract or "").strip()

        for i, url in enumerate(urls, 1):
            print(f"正在處理第 {i}/{len(urls)} 個PDF: {url}")

            pdf_path = download_pdf(url)
            if not pdf_path:
                processed_data.append({
                    "名稱": f"無法下載的PDF {i}",
                    "作者": "未知作者",
                    "摘要": use_manual_abstract or "PDF下載失敗,無法提取摘要",
                    "下載位置": url,
                    "論文集名稱": f"處理失敗 {i}",
                })
                continue

            try:
                extracted_info = extract_pdf_content(pdf_path)
            finally:
                _remove_file(pdf_path)

            # The keys always exist but may be "" after a failed extraction,
            # so fall back with `or` rather than a dict.get default.
            processed_data.append({
                "名稱": extracted_info.get("title") or f"PDF文件 {i}",
                "作者": extracted_info.get("author") or "未知作者",
                "摘要": (use_manual_abstract
                       or extracted_info.get("abstract")
                       or "無摘要資訊"),
                "下載位置": url,
                "論文集名稱": f"直接處理PDF {i}",
            })

        return json.dumps(processed_data, ensure_ascii=False, indent=2)
    except Exception as e:  # UI boundary: report the problem as text
        return f"處理錯誤: {str(e)}"


def _first_record_value(json_text: str, key: str) -> str:
    """Return the string stored under *key* in the first record of *json_text*.

    Returns ``""`` when the text is not JSON, holds no record, or the value
    is not a string.
    """
    try:
        data = json.loads(json_text)
    except (TypeError, ValueError):
        return ""
    if isinstance(data, list):
        data = data[0] if data else None
    if not isinstance(data, dict):
        return ""
    value = data.get(key, "")
    return value if isinstance(value, str) else ""


def _is_processed_result(result: str) -> bool:
    """Tell a real result (a JSON list) apart from an error/prompt message."""
    try:
        return isinstance(json.loads(result), list)
    except (TypeError, ValueError):
        return False


def _save_and_offer_download(result: str, auto_filename: bool,
                             custom_filename: str,
                             name_source: str) -> Tuple[str, str, dict]:
    """Save *result* and build the three outputs of a "process" button.

    Error and prompt messages are shown but never written to disk.
    """
    if not _is_processed_result(result):
        return result, "處理失敗,未保存文件", gr.update(visible=False)

    save_msg, actual_filename = save_json_file(
        result,
        custom_filename if not auto_filename else None,
        auto_filename,
        name_source,
    )

    if actual_filename and "已保存" in save_msg:
        return result, save_msg, gr.update(visible=True, value=actual_filename)
    return result, save_msg, gr.update(visible=False)


def process_and_save_json(json_input, manual_abstract, auto_filename, custom_filename):
    """Click handler of the JSON tab: process, save and offer the download.

    The automatic file name comes from "論文集名稱" of the first input record.
    Returns ``(result text, save status, update for the download widget)``.
    """
    result = process_json_data(json_input, manual_abstract)
    collection_name = _first_record_value(json_input, "論文集名稱")
    return _save_and_offer_download(
        result, auto_filename, custom_filename, collection_name
    )


def process_and_save_urls(urls_input, manual_abstract, auto_filename, custom_filename):
    """Click handler of the URL tab: process, save and offer the download.

    The automatic file name comes from "名稱" of the first output record.
    Returns ``(result text, save status, update for the download widget)``.
    """
    result = process_pdf_urls(urls_input, manual_abstract)
    title_for_filename = ""
    if auto_filename:
        title_for_filename = _first_record_value(result, "名稱")
    return _save_and_offer_download(
        result, auto_filename, custom_filename, title_for_filename
    )


def toggle_filename_input1(auto_filename):
    """Show the custom file name box of the JSON tab only when auto is off."""
    return gr.update(visible=not auto_filename)


def toggle_filename_input2(auto_filename):
    """Show the custom file name box of the URL tab only when auto is off."""
    return gr.update(visible=not auto_filename)


# Gradio interface
with gr.Blocks(title="PDF資料處理器", theme=gr.themes.Soft()) as demo:

    with gr.Tabs():
        # JSON processing tab
        with gr.TabItem("JSON資料處理"):
            with gr.Row():
                with gr.Column(scale=1):
                    json_input = gr.Textbox(
                        label="輸入JSON資料",
                        placeholder="請貼上您的JSON資料...",
                        lines=8,
                        value=_SAMPLE_JSON,
                    )

                    manual_abstract1 = gr.Textbox(
                        label="手動輸入摘要 (選填)",
                        placeholder="如果有摘要,請在此輸入。留空則自動從PDF中提取。",
                        lines=4,
                    )

                    with gr.Row():
                        auto_filename1 = gr.Checkbox(
                            label="自動使用論文集名稱作為檔名",
                            value=True,
                        )
                        filename_input1 = gr.Textbox(
                            label="自訂文件名 (僅在未勾選自動檔名時使用)",
                            placeholder="例: processed_papers.json",
                            value="processed_papers.json",
                            visible=False,
                        )

                    process_json_btn = gr.Button(
                        "處理JSON資料", variant="primary", size="lg"
                    )

                with gr.Column(scale=2):
                    output_json1 = gr.Textbox(
                        label="處理結果",
                        lines=18,
                        show_copy_button=True,
                    )

                    save_status1 = gr.Textbox(
                        label="保存狀態",
                        lines=2,
                    )

                    download_file1 = gr.File(
                        label="下載處理後的文件",
                        visible=False,
                    )

        # Direct PDF URL processing tab
        with gr.TabItem("PDF網址直接處理"):
            with gr.Row():
                with gr.Column(scale=1):
                    pdf_urls_input = gr.Textbox(
                        label="輸入PDF網址",
                        placeholder="請輸入PDF網址,每行一個...\n\n例如:\nhttps://example.com/paper1.pdf\nhttps://example.com/paper2.pdf\nhttps://example.com/paper3.pdf",
                        lines=8,
                        value=_SAMPLE_PDF_URL,
                    )

                    manual_abstract2 = gr.Textbox(
                        label="手動輸入摘要 (選填)",
                        placeholder="如果有摘要,請在此輸入。留空則自動從PDF中提取。",
                        lines=4,
                    )

                    with gr.Row():
                        auto_filename2 = gr.Checkbox(
                            label="自動使用第一個PDF標題作為檔名",
                            value=True,
                        )
                        filename_input2 = gr.Textbox(
                            label="自訂文件名 (僅在未勾選自動檔名時使用)",
                            placeholder="例: pdf_extracted_data.json",
                            value="pdf_extracted_data.json",
                            visible=False,
                        )

                    process_urls_btn = gr.Button(
                        "處理PDF網址", variant="primary", size="lg"
                    )

                with gr.Column(scale=2):
                    output_json2 = gr.Textbox(
                        label="處理結果",
                        lines=18,
                        show_copy_button=True,
                    )

                    save_status2 = gr.Textbox(
                        label="保存狀態",
                        lines=2,
                    )

                    download_file2 = gr.File(
                        label="下載處理後的文件",
                        visible=False,
                    )

    # Checkbox events: reveal the custom file name box when auto is off.
    auto_filename1.change(
        toggle_filename_input1,
        inputs=[auto_filename1],
        outputs=[filename_input1],
    )

    auto_filename2.change(
        toggle_filename_input2,
        inputs=[auto_filename2],
        outputs=[filename_input2],
    )

    # JSON processing button
    process_json_btn.click(
        process_and_save_json,
        inputs=[json_input, manual_abstract1, auto_filename1, filename_input1],
        outputs=[output_json1, save_status1, download_file1],
    )

    # PDF URL processing button
    process_urls_btn.click(
        process_and_save_urls,
        inputs=[pdf_urls_input, manual_abstract2, auto_filename2, filename_input2],
        outputs=[output_json2, save_status2, download_file2],
    )

    gr.Markdown("## 基本功能:JSON處理 | PDF網址處理 | 自動檔名 | 手動摘要")


if __name__ == "__main__":
    demo.launch()