yzweak committed on
Commit ec3d86e · 0 Parent(s)

Initial commit

.gitattributes ADDED
@@ -0,0 +1 @@
+ *.pt filter=lfs diff=lfs merge=lfs -text
.vscode/settings.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "python-envs.defaultEnvManager": "ms-python.python:conda",
+     "python-envs.defaultPackageManager": "ms-python.python:conda",
+     "python-envs.pythonProjects": []
+ }
README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ title: AutoPR
+ emoji: 🚀
+ colorFrom: blue
+ colorTo: green
+ sdk: gradio
+ sdk_version: 4.29.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ # PRAgent: A Modular Agentic Framework for Paper-to-PR Conversion
+
+ As the volume of peer-reviewed research surges, scholars increasingly rely on social platforms for discovery, while authors invest significant effort in promotion to sustain visibility and citations. This project addresses that challenge.
+
+ To that end, we developed **PRAgent**, a modular agentic framework that automatically transforms research papers into promotional posts optimized for specific social media platforms.
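
For reference, here is a minimal local-run sketch. This is hypothetical usage, not part of this commit; it assumes the Space's dependencies (`gradio`, `openai`, `pytesseract`, etc.) are installed and mirrors the first two steps of `process_pdf()` in `app.py`:

```python
# Hypothetical local usage sketch -- not part of this commit.
# The Gradio UI itself starts with:  python app.py
import asyncio

from pragent.backend.text_pipeline import pipeline as run_text_extraction
from pragent.backend.figure_table_pipeline import run_figure_extraction

async def main() -> None:
    # "paper.pdf" is a placeholder input path.
    await run_text_extraction("paper.pdf", "paper.txt")       # Step 1: plain-text extraction
    paired_dir = run_figure_extraction("paper.pdf", ".work")  # Step 2: figure/caption pairing
    print("Paired figure/caption assets in:", paired_dir)

asyncio.run(main())
```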
app.py ADDED
@@ -0,0 +1,506 @@
+ # app.py
+
+ import gradio as gr
+ import asyncio
+ import os
+ import shutil
+ from pathlib import Path
+ import time
+ import json
+ import traceback
+ from typing import List, Dict, Tuple, Optional
+
+ from pragent.backend.text_pipeline import pipeline as run_text_extraction
+ from pragent.backend.figure_table_pipeline import run_figure_extraction
+ from pragent.backend.blog_pipeline import generate_text_blog, generate_final_post
+ from pragent.backend.agents import setup_client, call_text_llm_api
+
+ import base64
+ import mimetypes
+ import re
+
+
+ # --- New module: formats a Markdown post into structured JSON ---
+
+ FORMAT_PROMPT_TEMPLATE = '''
+ You are an expert in structuring social media content. Your task is to convert a post written in Markdown format into a structured JSON format. The JSON structure depends on the target platform.
+
+ **Platform:** {platform}
+ **Markdown Content:**
+ ---
+ {markdown_text}
+ ---
+
+ **Instructions:**
+ {platform_instructions}
+ '''
+
+ TWITTER_INSTRUCTIONS = '''
+ Convert the content into a JSON array representing a Twitter thread. Each element in the array is a tweet object.
+ - Each tweet object must have a "text" key. The text should be plain text, without any Markdown formatting (e.g., no `*`, `#`, `[]()`).
+ - If a tweet is associated with an image, add an "image_index" key with the corresponding zero-based index from the provided asset list. For example, if the first image in the Markdown `![...](img_0.png)` is used, its index is 0.
+ - Ensure the thread flows logically. Split the text into multiple tweets if necessary.
+
+ **Asset List (for reference):**
+ {asset_list}
+
+ **JSON Output Format:**
+ [
+     {{ "text": "Text of the first tweet.", "image_index": 0 }},
+     {{ "text": "Text of the second tweet." }},
+     {{ "text": "Text of the third tweet.", "image_index": 1 }}
+ ]
+ '''
+
+ XIAOHONGSHU_INSTRUCTIONS = '''
+ Convert the content into a single JSON object for a Xiaohongshu post.
+ - The JSON object must have a "title" key. Extract the main title from the Markdown (usually the first H1/H2 heading). The title should be plain text.
+ - The JSON object must have a "body" key containing the main text content, with emojis. The body text should be plain text, without any Markdown formatting (e.g., no `*`, `#`, `[]()`).
+ - The JSON object must have an "image_indices" key, which is an array of all image indexes used in the post, in the order they appear.
+
+ **Asset List (for reference):**
+ {asset_list}
+
+ **JSON Output Format:**
+ {{
+     "title": "Your Catchy Title Here",
+     "body": "The full body text of the post...",
+     "image_indices": [0, 1, 2, 3]
+ }}
+ '''
+
+ def image_to_base64(path: str) -> str:
+     """Read an image file and convert it to a Base64 data-URL string."""
+     try:
+         # Guess the MIME type from the file path
+         mime_type, _ = mimetypes.guess_type(path)
+         if mime_type is None:
+             if path.lower().endswith(".png"):
+                 mime_type = "image/png"
+             else:
+                 mime_type = "image/jpeg"
+
+         with open(path, "rb") as image_file:
+             encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+         return f"data:{mime_type};base64,{encoded_string}"
+     except Exception as e:
+         print(f"[!] Error converting image to base64: {e}")
+         return ""
+
+ LOGO_PATH = "pragent/logo/logo.png"
+ LOGO_BASE64 = ""
+ if os.path.exists(LOGO_PATH):
+     LOGO_BASE64 = image_to_base64(LOGO_PATH)
+ else:
+     print(f"[!] Warning: Logo file not found at {LOGO_PATH}")
+
+ async def format_post_for_display(
+     markdown_text: str,
+     assets: Optional[List[Dict]],
+     platform: str,
+     client,
+     model: str
+ ) -> Optional[Dict]:
+     """
+     Use an LLM to convert a Markdown-formatted post into structured JSON for display in the UI.
+     """
+     if platform == 'twitter':
+         instructions = TWITTER_INSTRUCTIONS
+     elif platform == 'xiaohongshu':
+         instructions = XIAOHONGSHU_INSTRUCTIONS
+     else:
+         return None
+
+     asset_str = "No assets provided."
+     if assets:
+         asset_str = "\n".join([f"- Index {i}: {asset['dest_name']}" for i, asset in enumerate(assets)])
+
+     prompt = FORMAT_PROMPT_TEMPLATE.format(
+         platform=platform.capitalize(),
+         markdown_text=markdown_text,
+         platform_instructions=instructions.format(asset_list=asset_str),
+     )
+
+     system_prompt = "You are a content formatting expert. Output only valid JSON."
+     response_str = ""
+     try:
+         response_str = await call_text_llm_api(client, system_prompt, prompt, model)
+         json_str = None
+
+         match = re.search(r"```(?:json)?\s*([\s\S]+?)\s*```", response_str)
+         if match:
+             json_str = match.group(1)
+         else:
+             json_str = response_str
+         return json.loads(json_str.strip())
+
+     except Exception as e:
+         print(f"[!] Error formatting post for display: {e}")
+         traceback.print_exc()
+         return None
+
+
+ # --- Gradio UI rendering helpers ---
+
+ def render_twitter_thread(thread_data: List[Dict], assets: List[str]) -> str:
+     html_parts = []
+     for i, tweet in enumerate(thread_data):
+         text_html = tweet.get("text", "").replace("\n", "<br>")
+         image_html = ""
+         if "image_index" in tweet and tweet["image_index"] < len(assets):
+             img_idx = tweet["image_index"]
+             img_path = assets[img_idx]
+             base64_string = image_to_base64(img_path)
+             image_html = f'<div class="tweet-image-container"><img src="{base64_string}" class="tweet-image"></div>'
+
+         tweet_html = f'''
+         <div class="tweet-row">
+             <div class="avatar-container">
+                 <img src="{LOGO_BASE64}" class="avatar">
+             </div>
+             <div class="tweet-content">
+                 <div class="user-info">
+                     <strong>PRAgent</strong> <span>@pr_agent</span>
+                 </div>
+                 <div class="tweet-text">{text_html}</div>
+                 {image_html}
+             </div>
+         </div>
+         '''
+         html_parts.append(tweet_html)
+
+     return "".join(html_parts)
+
+ def render_xiaohongshu_post(post_data: Dict, assets: List[str]) -> str:
+     """V6 - Final Version: Returns ONLY pure HTML structure."""
+     title_html = f"<h2 class='xhs-title'>{post_data.get('title', '')}</h2>"
+     body_text = post_data.get('body', '').replace('\n', '<br>')
+     body_html = f"<div class='xhs-body'>{body_text}</div>"
+
+     gallery_html = ""
+     if "image_indices" in post_data and post_data["image_indices"]:
+         image_indices = post_data["image_indices"]
+         # Fix: Remove duplicate indices to prevent the carousel showing duplicate images, while preserving order.
+         unique_indices = list(dict.fromkeys(image_indices))
+         valid_assets = [assets[i] for i in unique_indices if i < len(assets)]
+
+         if valid_assets:
+             # We still need a unique ID for the observer to find it
+             carousel_id = f"carousel_{int(time.time() * 1000)}"
+
+             slides_html = ""
+             for i, img_path in enumerate(valid_assets):
+                 base64_string = image_to_base64(img_path)
+                 slides_html += f'<div class="carousel-slide"><div class="carousel-numbertext">{i + 1} / {len(valid_assets)}</div><img src="{base64_string}"></div>'
+
+             arrows_html = ""
+             if len(valid_assets) > 1:
+                 arrows_html = '<a class="prev">&#10094;</a><a class="next">&#10095;</a>'
+
+             gallery_html = f'<div class="carousel-container" id="{carousel_id}">{slides_html}{arrows_html}</div>'
+
+     return f"{gallery_html}{title_html}{body_html}"
+
+ # --- Main processing flow ---
+
+ async def process_pdf(
+     pdf_file,
+     text_api_key,
+     vision_api_key,
+     base_url,
+     text_model,
+     vision_model,
+     platform,
+     language,
+     progress=gr.Progress(track_tqdm=True)
+ ):
+     # Use text_api_key for vision_api_key if it's not provided
+     vision_api_key = vision_api_key or text_api_key
+
+     if not all([pdf_file, text_api_key, vision_api_key, base_url, text_model, vision_model, platform, language]):
+         raise gr.Error("Please fill in all required fields and upload a PDF.")
+
+     work_dir = None
+     try:
+         # 1. Create a temporary working directory
+         session_id = f"session_{int(time.time())}"
+         work_dir = Path(".temp_output") / session_id
+         work_dir.mkdir(parents=True, exist_ok=True)
+
+         pdf_path = Path(work_dir) / Path(pdf_file.name).name
+         shutil.copy(pdf_file.name, pdf_path)
+         final_assets = []
+
+         # Step 1: extract text
+         yield gr.update(value="🚀 **Processing...** Please wait.", visible=True), gr.update(value="", visible=False), gr.update(visible=False)
+         progress(0.1, desc="Step 1/5: Extracting text from PDF...")
+         txt_output_path = work_dir / "paper.txt"
+         await run_text_extraction(str(pdf_path), str(txt_output_path))
+         if not txt_output_path.exists():
+             raise gr.Error("Failed to extract text from the PDF.")
+
+         # Step 2: extract figures
+         progress(0.3, desc="Step 2/5: Extracting figures from PDF...")
+         extraction_work_dir = work_dir / "figure_extraction"
+         extraction_work_dir.mkdir()
+         paired_dir = run_figure_extraction(str(pdf_path), str(extraction_work_dir))
+         if not paired_dir or not any(Path(paired_dir).iterdir()):
+             raise gr.Error("Failed to extract any figures from the PDF.")
+
+         # Step 3: generate the structured draft
+         progress(0.5, desc="Step 3/5: Generating structured text draft...")
+         blog_draft, source_paper_text = await generate_text_blog(
+             txt_path=str(txt_output_path),
+             api_key=text_api_key,
+             text_api_base=base_url,
+             model=text_model,
+             language=language
+         )
+         if not blog_draft or blog_draft.startswith("Error:"):
+             raise gr.Error(f"Failed to generate blog draft: {blog_draft}")
+
+         # Step 4: generate the final Markdown with figure analysis
+         progress(0.7, desc="Step 4/5: Generating final post with vision analysis...")
+         final_post_md, assets_info = await generate_final_post(
+             blog_draft=blog_draft,
+             source_paper_text=source_paper_text,
+             assets_dir=paired_dir,
+             text_api_key=text_api_key,
+             vision_api_key=vision_api_key,
+             text_api_base=base_url,
+             vision_api_base=base_url,
+             text_model=text_model,
+             vision_model=vision_model,
+             platform=platform,
+             language=language,
+             post_format='rich'
+         )
+         if not final_post_md or final_post_md.startswith("Error:"):
+             raise gr.Error(f"Failed to generate final post: {final_post_md}")
+
+         # Save the final Markdown and images into a separate "post" folder for zipping
+         post_content_dir = work_dir / "post"
+         post_content_dir.mkdir()
+
+         if assets_info:
+             for asset in assets_info:
+                 # Copy the image into post_content_dir
+                 dest_path = post_content_dir / Path(asset['src_path']).name
+                 shutil.copy(asset['src_path'], dest_path)
+                 # The path for rendering needs to be the absolute path to the copied file
+                 absolute_path_str = str(dest_path.resolve()).replace('\\', '/')
+                 final_assets.append(absolute_path_str)
+
+         # Save the Markdown
+         (post_content_dir / "post.md").write_text(final_post_md, encoding='utf-8')
+
+         # Step 5: format as JSON
+         progress(0.9, desc="Step 5/5: Formatting for rich display...")
+         async with setup_client(text_api_key, base_url) as client:
+             structured_data = await format_post_for_display(
+                 final_post_md, assets_info, platform, client, text_model
+             )
+         if not structured_data:
+             raise gr.Error("Failed to format post for display.")
+
+         # Save structured_data
+         (post_content_dir / "post.json").write_text(json.dumps(structured_data, indent=2, ensure_ascii=False), encoding='utf-8')
+
+         # Render the final UI
+         if platform == 'twitter':
+             final_html = render_twitter_thread(structured_data, final_assets)
+         else:  # xiaohongshu
+             final_html = render_xiaohongshu_post(structured_data, final_assets)
+
+         # Create the ZIP archive
+         zip_filename_base = f"PRAgent_post_{platform}_{session_id}"
+         zip_path = shutil.make_archive(
+             base_name=str(work_dir / zip_filename_base),
+             format='zip',
+             root_dir=str(work_dir),
+             base_dir="post"
+         )
+
+         # Use gr.update(value=...) to update the gr.HTML component
+         yield gr.update(value="✅ **Done!**"), gr.update(value=final_html, visible=True), gr.update(value=zip_path, visible=True)
+
+     except Exception as e:
+         traceback.print_exc()
+         error_html = f"<h2>Error:</h2><pre>{traceback.format_exc()}</pre>"
+         yield gr.update(value=f"❌ An error occurred: {e}"), gr.update(value=error_html, visible=True), gr.update(visible=False)
+     finally:
+         # Cleanup is disabled to prevent race conditions with Gradio's reloader
+         # and to allow inspection of generated files.
+         pass
+         # if work_dir and work_dir.exists():
+         #     shutil.rmtree(work_dir)
+
+
+ # --- Gradio app UI definition ---
+
+ # Custom CSS
+ CUSTOM_CSS = '''
+ /* --- Twitter Style --- */
+ .tweet-row {
+     display: flex;
+     align-items: flex-start;
+     padding: 16px;
+     border: 1px solid #e1e8ed;
+     border-radius: 15px;
+     margin-bottom: 12px;
+     background-color: #ffffff;
+ }
+ .avatar-container {
+     flex-shrink: 0;
+     margin-right: 12px;
+ }
+ .avatar {
+     width: 48px;
+     height: 48px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+ .tweet-content {
+     width: 100%;
+ }
+ .user-info {
+     font-size: 15px;
+     font-weight: bold;
+ }
+ .user-info span {
+     color: #536471;
+     font-weight: normal;
+ }
+ .tweet-text {
+     font-size: 15px;
+     line-height: 1.5;
+     color: #0f1419;
+     margin-top: 4px;
+     word-wrap: break-word;
+ }
+ .tweet-image-container {
+     margin-top: 12px;
+ }
+ .tweet-image {
+     width: 100%;
+     border-radius: 15px;
+     border: 1px solid #ddd;
+     display: block;
+ }
+
+ /* --- Xiaohongshu Style --- */
+ .xhs-title { font-size: 20px; font-weight: bold; color: #333; margin-bottom: 10px; }
+ .xhs-body { font-size: 16px; line-height: 1.8; color: #555; word-wrap: break-word; }
+
+ #output_container {
+     border: 2px dashed #ccc;
+     padding: 20px;
+     min-height: 100px;
+     border-radius: 15px;
+ }
+ .carousel-container { position: relative; max-width: 100%; margin: auto; overflow: hidden; border-radius: 10px; }
+ .carousel-slide { display: none; animation: fade 0.5s ease-in-out; }
+ .carousel-slide:first-child { display: block; }
+ .carousel-slide img { width: 100%; display: block; }
+ .prev, .next { cursor: pointer; position: absolute; top: 50%; width: auto; padding: 16px; margin-top: -22px; color: white; font-weight: bold; font-size: 20px; transition: 0.3s ease; border-radius: 0 3px 3px 0; user-select: none; background-color: rgba(0,0,0,0.3); }
+ .next { right: 0; border-radius: 3px 0 0 3px; }
+ .prev:hover, .next:hover { background-color: rgba(0,0,0,0.6); }
+ .carousel-numbertext { color: #f2f2f2; font-size: 12px; padding: 8px 12px; position: absolute; top: 0; background-color: rgba(0,0,0,0.5); border-radius: 0 0 5px 0; }
+ @keyframes fade { from {opacity: .4} to {opacity: 1} }
+ '''
+
+ ACTIVATE_CAROUSEL_JS = '''
+ () => {
+     // Use a small 100ms delay to ensure Gradio has finished updating the HTML DOM
+     setTimeout(() => {
+         const container = document.getElementById('output_container');
+         if (container) {
+             const carousel = container.querySelector('.carousel-container');
+             // Check if a carousel exists and hasn't been initialized yet
+             if (carousel && !carousel.dataset.initialized) {
+                 console.log("PRAgent Carousel Script: JS listener has found and is activating the carousel ->", carousel.id);
+
+                 let slideIndex = 1;
+                 const slides = carousel.getElementsByClassName("carousel-slide");
+                 const prevButton = carousel.querySelector(".prev");
+                 const nextButton = carousel.querySelector(".next");
+                 if (slides.length === 0) return;
+
+                 const showSlides = () => {
+                     if (slideIndex > slides.length) { slideIndex = 1; }
+                     if (slideIndex < 1) { slideIndex = slides.length; }
+                     for (let i = 0; i < slides.length; i++) {
+                         slides[i].style.display = "none";
+                     }
+                     slides[slideIndex - 1].style.display = "block";
+                 };
+
+                 if (prevButton) {
+                     prevButton.addEventListener('click', () => { slideIndex--; showSlides(); });
+                 }
+                 if (nextButton) {
+                     nextButton.addEventListener('click', () => { slideIndex++; showSlides(); });
+                 }
+
+                 showSlides(); // Show the first slide
+                 carousel.dataset.initialized = 'true'; // Mark as initialized to prevent re-activation
+             }
+         }
+     }, 100);
+ }
+ '''
+
+ with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
+     gr.Markdown("# 🚀 PRAgent: Paper to Social Media Post")
+     gr.Markdown("Upload a research paper PDF, and I will generate a social media post for Twitter or Xiaohongshu, complete with images and platform-specific styling.")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             pdf_upload = gr.File(label="Upload PDF Paper", file_types=[".pdf"])
+
+             with gr.Accordion("Advanced Settings", open=True):
+                 text_api_key_input = gr.Textbox(label="Text API Key", type="password", placeholder="Required: sk-...")
+                 vision_api_key_input = gr.Textbox(label="Vision API Key (Optional)", type="password", placeholder="Optional: If not provided, Text API Key will be used")
+                 base_url_input = gr.Textbox(label="API Base URL")
+                 text_model_input = gr.Textbox(label="Text Model")
+                 vision_model_input = gr.Textbox(label="Vision Model")
+
+             platform_select = gr.Radio(["twitter", "xiaohongshu"], label="Target Platform", value="twitter")
+             language_select = gr.Radio([("English", "en"), ("Chinese", "zh")], label="Language", value="en")
+
+             generate_btn = gr.Button("✨ Generate Post", variant="primary")
+
+         with gr.Column(scale=2):
+             status_text = gr.Markdown("Idle. Please upload a file and click generate.", visible=True)
+             output_container = gr.HTML(elem_id="output_container")
+             download_button = gr.File(label="Download Post & Images", visible=False)
+
+     # Bind the button click event
+     click_event = generate_btn.click(
+         fn=process_pdf,
+         inputs=[
+             pdf_upload,
+             text_api_key_input,
+             vision_api_key_input,
+             base_url_input,
+             text_model_input,
+             vision_model_input,
+             platform_select,
+             language_select
+         ],
+         outputs=[status_text, output_container, download_button]
+     )
+
+     # Chain a .then() event that runs the JavaScript once the previous event succeeds
+     click_event.then(
+         fn=None,  # no Python function needs to run here
+         inputs=None,
+         outputs=None,
+         js=ACTIVATE_CAROUSEL_JS  # keep the JS in its own event
+     )
+
+ if __name__ == "__main__":
+     # Create the hidden temp directory
+     Path(".temp_output").mkdir(exist_ok=True)
+     demo.launch()
pragent/backend/__init__.py ADDED
File without changes
pragent/backend/agents.py ADDED
@@ -0,0 +1,176 @@
+ # agents.py
+
+ import base64
+ from openai import AsyncOpenAI
+ from contextlib import asynccontextmanager
+ from typing import List, Dict, AsyncIterator, Optional, Any, Tuple
+ from tqdm.asyncio import tqdm
+ import tiktoken
+
+
+ def _prepare_extra_body(model_name: str, disable_qwen_thinking: bool) -> Optional[Dict[str, Any]]:
+     if "qwen3" in model_name.lower() and disable_qwen_thinking:
+         tqdm.write("[*] 'disable_thinking' mode enabled for the Qwen3 model.")
+         return {"chat_template_kwargs": {"enable_thinking": False}}
+     return None
+
+ @asynccontextmanager
+ async def setup_client(api_key: str, base_url: str) -> AsyncIterator[AsyncOpenAI]:
+     """Create and properly dispose of the API client via an async context manager."""
+     client = None
+     if not api_key or "sk-" not in api_key:
+         tqdm.write("[!] Error: API key is invalid or not set.")
+         yield None
+         return
+
+     try:
+         tqdm.write("[*] Initializing API client...")
+         client = AsyncOpenAI(api_key=api_key, base_url=base_url, timeout=300.0)
+         yield client
+     except Exception as e:
+         tqdm.write(f"[!] Error initializing the AsyncOpenAI client: {e}")
+         yield None
+     finally:
+         if client:
+             tqdm.write("[*] Closing API client connection...")
+             await client.close()
+             tqdm.write("[*] API client closed.")
+
+ def encode_image_to_base64(image_path: str) -> str:
+     """Read an image file and return its Base64-encoded contents."""
+     try:
+         with open(image_path, "rb") as image_file:
+             return base64.b64encode(image_file.read()).decode('utf-8')
+     except Exception as e:
+         tqdm.write(f"[!] Failed to encode image {image_path}: {e}")
+         return ""
+
+
+ async def call_text_llm_api(local_client: AsyncOpenAI, system_prompt: str, user_prompt: str, model: str, disable_qwen_thinking: bool = False) -> str:
+     """Asynchronously call a text-only LLM API."""
+     if not local_client: return "Error: API client is not configured."
+     try:
+         extra_body = _prepare_extra_body(model, disable_qwen_thinking)
+         completion = await local_client.chat.completions.create(
+             model=model,
+             messages=[
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt}
+             ],
+             extra_body=extra_body  # apply extra_body
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         return f"Error: text API call failed - {e}"
+
+
+ async def call_multimodal_llm_api(local_client: AsyncOpenAI, system_prompt: str, user_prompt_parts: list, model: str, disable_qwen_thinking: bool = False) -> str:
+     """Asynchronously call a multimodal LLM API."""
+     if not local_client: return "Error: API client is not configured."
+     try:
+         extra_body = _prepare_extra_body(model, disable_qwen_thinking)
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_prompt_parts}
+         ]
+         completion = await local_client.chat.completions.create(
+             model=model,
+             messages=messages,
+             max_tokens=2048,
+             extra_body=extra_body  # apply extra_body
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         return f"Error: multimodal API call failed - {e}"
+
+ class BlogGeneratorAgent:
+
+     def __init__(self, prompt_template: str, model: str):
+         self.prompt_template = prompt_template
+         self.model = model
+         self.system_prompt = "You are a top-tier science and technology blogger and popular science writer."
+
+     async def run(self, local_client: AsyncOpenAI, paper_text: str, disable_qwen_thinking: bool = False) -> str:
+         user_prompt = self.prompt_template.format(paper_text=paper_text)
+         return await call_text_llm_api(local_client, self.system_prompt, user_prompt, self.model, disable_qwen_thinking)
+
+ class FigureDescriberAgent:
+     def __init__(self, model: str):
+         self.model = model
+         self.system_prompt = "You are an expert academic analyst. Your task is to provide a detailed explanation of the provided image, using its original caption as context. Describe what the figure shows, what its main takeaway is, and how it supports the paper's argument. Be clear, comprehensive, and ready for a blog post."
+
+     async def run(self, local_client: AsyncOpenAI, figure_path: str, caption_path: str, disable_qwen_thinking: bool = False) -> str:
+         base64_figure = encode_image_to_base64(figure_path)
+         base64_caption_img = encode_image_to_base64(caption_path)
+         if not all([base64_figure, base64_caption_img]):
+             return "Error: could not encode one or more images."
+
+         user_prompt = [
+             {"type": "text", "text": "Please analyze this figure and its accompanying caption. Provide a detailed, blog-ready description."},
+             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_figure}", "detail": "high"}},
+             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_caption_img}", "detail": "low"}}
+         ]
+         return await call_multimodal_llm_api(local_client, self.system_prompt, user_prompt, self.model, disable_qwen_thinking)
+
+ class BlogIntegratorAgent:
+     def __init__(self, prompt_template: str, model: str):
+         self.prompt_template = prompt_template
+         self.model = model
+         self.system_prompt = "You are a master science communicator and blogger. Your task is to transform a dry academic text into an engaging blog post, weaving in figures and tables to tell a compelling story."
+
+     async def run(self, local_client: AsyncOpenAI, blog_text: str, items_with_descriptions: List[Dict], source_text: str, disable_qwen_thinking: bool = False) -> str:
+         items_list_str = []
+         for i, item in enumerate(items_with_descriptions):
+             placeholder = f"[FIGURE_PLACEHOLDER_{i}]"
+             description = item['description']
+             items_list_str.append(f"### Figure {i} (Placeholder: {placeholder})\n**Type**: {item['type']}\n**Description**: {description}\n---")
+
+         user_prompt = self.prompt_template.format(
+             source_text=source_text,
+             blog_text=blog_text,
+             items_list_str="\n".join(items_list_str)
+         )
+         return await call_text_llm_api(local_client, self.system_prompt, user_prompt, self.model, disable_qwen_thinking)
+
+
+ async def call_text_llm_api_with_token_count(
+     local_client: AsyncOpenAI,
+     system_prompt: str,
+     user_prompt: str,
+     model: str,
+     disable_qwen_thinking: bool = False
+ ) -> Tuple[str, int]:
+     """
+     Calls the text LLM API and returns the content and the 'think' token count.
+     """
+     if not local_client:
+         return "Error: API client is not configured.", 0
+     try:
+         params = {
+             "model": model,
+             "messages": [
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt}
+             ]
+         }
+         extra_body = _prepare_extra_body(model, disable_qwen_thinking)
+         if extra_body:
+             params["extra_body"] = extra_body
+
+         completion = await local_client.chat.completions.create(**params)
+
+         content = completion.choices[0].message.content or ""
+         reasoning_content = getattr(completion.choices[0].message, 'reasoning_content', None)
+
+         think_token_count = 0
+         if reasoning_content and isinstance(reasoning_content, str):
+             try:
+                 encoding = tiktoken.encoding_for_model(model)
+             except KeyError:
+                 encoding = tiktoken.get_encoding("cl100k_base")
+             think_token_count = len(encoding.encode(reasoning_content))
+
+         return content, think_token_count
+
+     except Exception as e:
+         return f"Error: text API call failed - {e}", 0
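
A brief usage note: `setup_client` yields `None` on a bad key or failed initialization, so callers should guard before use. A minimal sketch of composing these helpers (the endpoint, key, and model name below are placeholders, not part of this commit):

```python
import asyncio

from pragent.backend.agents import setup_client, call_text_llm_api

async def demo() -> None:
    # Placeholder key/endpoint/model; setup_client rejects keys without "sk-".
    async with setup_client("sk-placeholder", "https://api.example.com/v1") as client:
        if client is None:
            return  # invalid key or client initialization failure
        reply = await call_text_llm_api(
            client,
            "You are a concise assistant.",           # system prompt
            "Summarize self-attention in one line.",  # user prompt
            "gpt-4o",                                 # placeholder model name
        )
        print(reply)

asyncio.run(demo())
```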
pragent/backend/blog_pipeline.py ADDED
@@ -0,0 +1,401 @@
+ # pragent/backend/blog_pipeline.py
+
+ from tqdm.asyncio import tqdm
+ import asyncio
+ from pathlib import Path
+ from typing import Tuple, List, Dict, Optional
+ from openai import AsyncOpenAI
+ import re
+ import os
+ import json
+ # ADDED FOR OCR & CACHE SAFETY: New imports for OCR
+ import pytesseract
+ from PIL import Image
+
+ from pragent.backend.agents import setup_client, BlogGeneratorAgent, FigureDescriberAgent, BlogIntegratorAgent, call_text_llm_api, call_text_llm_api_with_token_count
+ from pragent.backend.data_loader import load_plain_text, load_paired_image_paths
+ from pragent.backend.text_processor import summarize_long_text
+ from .prompts import (
+     TEXT_GENERATOR_PROMPT, TEXT_GENERATOR_PROMPT_CHINESE,
+     TWITTER_RICH_TEXT_PROMPT_ENGLISH, TWITTER_TEXT_ONLY_PROMPT_ENGLISH,
+     TWITTER_RICH_TEXT_PROMPT_CHINESE, TWITTER_TEXT_ONLY_PROMPT_CHINESE,
+     XIAOHONGSHU_PROMPT_ENGLISH, XIAOHONGSHU_PROMPT_CHINESE,
+     XIAOHONGSHU_TEXT_ONLY_PROMPT_ENGLISH, XIAOHONGSHU_TEXT_ONLY_PROMPT_CHINESE,
+     BASELINE_PROMPT_ENGLISH, BASELINE_PROMPT_CHINESE,
+     GENERIC_RICH_PROMPT_CHINESE, GENERIC_RICH_PROMPT_ENGLISH,
+     GENERIC_TEXT_ONLY_PROMPT_CHINESE, GENERIC_TEXT_ONLY_PROMPT_ENGLISH,
+     BASELINE_FEWSHOT_PROMPT_ENGLISH, BASELINE_FEWSHOT_PROMPT_CHINESE
+ )
+
+ TOKEN_THRESHOLD = 8000
+
+ PROMPT_MAPPING = {
+     ('twitter', 'rich', 'en'): TWITTER_RICH_TEXT_PROMPT_ENGLISH,
+     ('twitter', 'text_only', 'en'): TWITTER_TEXT_ONLY_PROMPT_ENGLISH,
+     ('twitter', 'rich', 'zh'): TWITTER_RICH_TEXT_PROMPT_CHINESE,
+     ('twitter', 'text_only', 'zh'): TWITTER_TEXT_ONLY_PROMPT_CHINESE,
+     ('xiaohongshu', 'rich', 'en'): XIAOHONGSHU_PROMPT_ENGLISH,
+     ('xiaohongshu', 'rich', 'zh'): XIAOHONGSHU_PROMPT_CHINESE,
+     ('xiaohongshu', 'text_only', 'en'): XIAOHONGSHU_TEXT_ONLY_PROMPT_ENGLISH,
+     ('xiaohongshu', 'text_only', 'zh'): XIAOHONGSHU_TEXT_ONLY_PROMPT_CHINESE,
+     ('generic', 'rich', 'en'): GENERIC_RICH_PROMPT_ENGLISH,
+     ('generic', 'text_only', 'en'): GENERIC_TEXT_ONLY_PROMPT_ENGLISH,
+     ('generic', 'rich', 'zh'): GENERIC_RICH_PROMPT_CHINESE,
+     ('generic', 'text_only', 'zh'): GENERIC_TEXT_ONLY_PROMPT_CHINESE,
+ }
+
+
+ # ADDED FOR OCR & CACHE SAFETY: Asynchronous OCR helper function
+ async def ocr_image_to_text(image_path: str) -> str:
+     """
+     Performs OCR on an image file to extract text asynchronously.
+     """
+     if not Path(image_path).exists():
+         return ""
+     try:
+         # pytesseract is a blocking library, so we run it in a thread pool
+         loop = asyncio.get_running_loop()
+         text = await loop.run_in_executor(
+             None,
+             lambda: pytesseract.image_to_string(Image.open(image_path))
+         )
+         return text.strip()
+     except Exception as e:
+         tqdm.write(f"[!] OCR failed for {image_path}: {e}")
+         return ""
+
+
+ async def generate_text_blog(
+     txt_path: str, api_key: str, text_api_base: str, model: str, language: str,
+     disable_qwen_thinking: bool = False, ablation_mode: str = "none"
+ ) -> Tuple[str, str]:
+     """
+     Generates a structured, factual blog DRAFT in the specified language. (Stage 1)
+     """
+     async with setup_client(api_key, text_api_base) as client:
+         if not client:
+             return "Error: API client configuration failed.", None
+
+         paper_text = await load_plain_text(txt_path)
+         if not paper_text:
+             return "Error: Could not load text file.", None
+
+         text_for_generation = ""
+         if len(paper_text) > TOKEN_THRESHOLD:
+             if ablation_mode == 'no_hierarchical_summary':
+                 tqdm.write(f"[*] ABLATION (no_hierarchical_summary): Truncating text to {TOKEN_THRESHOLD} characters.")
+                 text_for_generation = paper_text[:TOKEN_THRESHOLD]
+             else:
+                 summarized_text = await summarize_long_text(
+                     paper_text,
+                     model,
+                     client,
+                     disable_qwen_thinking=disable_qwen_thinking
+                 )
+                 if summarized_text.startswith("Error:"):
+                     summarized_text = paper_text[:TOKEN_THRESHOLD]
+                 text_for_generation = summarized_text
+         else:
+             text_for_generation = paper_text
+
+         if ablation_mode in ['no_logical_draft', 'stage2']:
+             ablation_reason = "no_logical_draft" if ablation_mode != 'stage2' else 'stage2'
+             tqdm.write(f"[*] ABLATION ({ablation_reason}): Skipping structured draft generation.")
+             return text_for_generation, text_for_generation
+
+         draft_prompt = TEXT_GENERATOR_PROMPT_CHINESE if language == 'zh' else TEXT_GENERATOR_PROMPT
+         generator = BlogGeneratorAgent(draft_prompt, model)
+         generated_blog_draft = await generator.run(
+             client,
+             text_for_generation,
+             disable_qwen_thinking=disable_qwen_thinking
+         )
+         return generated_blog_draft, text_for_generation
+
+
+ async def generate_final_post(
+     blog_draft: str,
+     source_paper_text: str,
+     assets_dir: Optional[str],
+     text_api_key: str,
+     vision_api_key: str,
+     text_api_base: str,
+     vision_api_base: str,
+     vision_model: str,
+     text_model: str,
+     platform: str,
+     language: str,
+     post_format: str,
+     description_cache_dir: Optional[str] = None,
+     pdf_hash: Optional[str] = None,
+     disable_qwen_thinking: bool = False,
+     ablation_mode: str = "none"
+ ) -> Optional[Tuple[str, Optional[List[Dict]]]]:
+     effective_platform = platform
+     if ablation_mode == 'no_platform_adaptation':
+         tqdm.write(f"[*] ABLATION (no_platform_adaptation): Using generic prompts instead of '{platform}' specific prompts.")
+         effective_platform = 'generic'
+
+     prompt_format = 'rich' if post_format == 'description_only' else post_format
+     prompt_key = (effective_platform, prompt_format, language)
+     selected_prompt = PROMPT_MAPPING.get(prompt_key)
+
+     if not selected_prompt:
+         tqdm.write(f"[!] Warning: No prompt found for configuration: {prompt_key}. Falling back to generic prompt.")
+         generic_fallback_key = ('generic', prompt_format, language)
+         selected_prompt = PROMPT_MAPPING.get(generic_fallback_key)
+         if not selected_prompt:
+             return f"Error: No prompt found for configuration: {prompt_key} or generic fallback.", None
+
+     tqdm.write(f"\n--- Generating final post for: Platform='{effective_platform}', Format='{post_format}', Language='{language}' ---")
+
+     items_with_descriptions = []
+     if post_format in ['rich', 'description_only'] and assets_dir and Path(assets_dir).is_dir():
+         all_items = load_paired_image_paths(Path(assets_dir))
+         all_items = all_items[:50]  # Limit to first 50 items to avoid overloading the model
+         if all_items:
+             cache_file_path = None
+             if description_cache_dir and pdf_hash:
+                 sanitized_model_name = re.sub(r'[\\/:"*?<>|]', '_', vision_model)
+                 cache_dir = Path(description_cache_dir) / pdf_hash
+                 cache_dir.mkdir(parents=True, exist_ok=True)
+                 cache_file_path = cache_dir / f"{sanitized_model_name}.json"
+
+             if cache_file_path and cache_file_path.exists() and ablation_mode not in ['no_visual_analysis', 'stage2']:
+                 tqdm.write(f"[✓] Cache hit! Loading all descriptions from {cache_file_path}")
+                 with cache_file_path.open('r', encoding='utf-8') as f:
+                     items_with_descriptions = json.load(f)
+
+             else:
+                 # MODIFIED: Trigger this ablation also for 'stage2'
+                 if ablation_mode in ['no_visual_analysis', 'stage2']:
+                     ablation_reason = "no_visual_analysis" if ablation_mode != 'stage2' else 'stage2'
+                     tqdm.write(f"[*] ABLATION ({ablation_reason}): Using OCR on caption images instead of vision model.")
+                     temp_items_with_desc = []
+
+                     ocr_tasks = [ocr_image_to_text(item['caption_path']) for item in all_items]
+                     ocr_results = await asyncio.gather(*ocr_tasks)
+
+                     for i, item in enumerate(all_items):
+                         caption_content = ocr_results[i]
+                         if caption_content:
+                             item['description'] = caption_content
+                             temp_items_with_desc.append(item)
+                     items_with_descriptions = temp_items_with_desc
+                 else:
+                     # Full pipeline: use vision model
+                     tqdm.write(f"--- Cache miss. Describing {len(all_items)} new figures using model '{vision_model}'... ---")
+                     async with setup_client(vision_api_key, vision_api_base) as vision_client:
+                         if not vision_client:
+                             return "Error: Vision API client configuration failed.", None
+
+                         describer = FigureDescriberAgent(model=vision_model)
+                         description_tasks = [
+                             describer.run(
+                                 vision_client,
+                                 item['item_path'],
+                                 item['caption_path'],
+                                 disable_qwen_thinking=disable_qwen_thinking
+                             ) for item in all_items
+                         ]
+                         descriptions = await asyncio.gather(*description_tasks)
+
+                     temp_items_with_desc = []
+                     for i, item in enumerate(all_items):
+                         if not descriptions[i].startswith("Error:"):
+                             item['description'] = descriptions[i]
+                             temp_items_with_desc.append(item)
+                     items_with_descriptions = temp_items_with_desc
+
+                 # MODIFIED: Prevent caching for 'stage2' as well
+                 if cache_file_path and ablation_mode not in ['no_visual_analysis', 'stage2']:
+                     tqdm.write(f"[*] Saving all descriptions to cache file: {cache_file_path}")
+                     with cache_file_path.open('w', encoding='utf-8') as f:
+                         json.dump(items_with_descriptions, f, ensure_ascii=False, indent=4)
+                 elif cache_file_path and ablation_mode in ['no_visual_analysis', 'stage2']:
+                     ablation_reason = "no_visual_analysis" if ablation_mode != 'stage2' else 'stage2'
+                     tqdm.write(f"[*] ABLATION ({ablation_reason}): Description caching is disabled for this mode to avoid saving OCR results.")
+
+     items_with_descriptions = items_with_descriptions[:20]
+     if post_format in ['rich', 'description_only'] and not items_with_descriptions:
+         return f"Error: '{post_format}' format requires images, but none were found/described.", None
+
+     async with setup_client(text_api_key, text_api_base) as text_client:
+         if not text_client: return "Error: Text API client configuration failed.", None
+
+         if ablation_mode in ['no_visual_integration', 'stage2'] and post_format in ['rich', 'description_only']:
+             ablation_reason = "no_visual_integration" if ablation_mode != 'stage2' else 'stage2'
+             tqdm.write(f"[*] ABLATION ({ablation_reason}): Generating text first, then appending all figures at the end.")
+
+             integrator = BlogIntegratorAgent(selected_prompt, model=text_model)
+             text_only_post = await integrator.run(
+                 local_client=text_client,
+                 blog_text=blog_draft,
+                 items_with_descriptions=[],
+                 source_text=source_paper_text,
+                 disable_qwen_thinking=disable_qwen_thinking
+             )
+
+             if not text_only_post or text_only_post.startswith("Error:"):
+                 return f"Blog integration failed for text-only part: {text_only_post}", None
+
+             final_blog_content = text_only_post
+             assets_for_packaging = []
+             for i, item_data in enumerate(items_with_descriptions):
+                 if post_format == 'rich':
+                     new_asset_filename = f"img_{i}{Path(item_data['item_path']).suffix}"
+                     alt_text = f"Figure {i}"
+                     new_markdown_tag = f"\n\n![{alt_text}](./img/{new_asset_filename})"
+                     assets_for_packaging.append({'src_path': item_data['item_path'], 'dest_name': new_asset_filename, 'new_index': i})
+                     final_blog_content += new_markdown_tag
+                 elif post_format == 'description_only':
+                     alt_text_description = item_data.get('description', f'Figure {i}').strip().replace('\n', ' ')
+                     new_markdown_tag = f"\n\n![{alt_text_description}]()"
+                     final_blog_content += new_markdown_tag
+
+             return final_blog_content, assets_for_packaging if assets_for_packaging else None
+
+         integrator = BlogIntegratorAgent(selected_prompt, model=text_model)
+         final_post_with_placeholders = await integrator.run(
+             local_client=text_client,
+             blog_text=blog_draft,
+             items_with_descriptions=items_with_descriptions,
+             source_text=source_paper_text,
+             disable_qwen_thinking=disable_qwen_thinking
+         )
+
+         if not final_post_with_placeholders or final_post_with_placeholders.startswith("Error:"):
+             return f"Blog integration failed: {final_post_with_placeholders}", None
+
+         found_indices = re.findall(r'\[FIGURE_PLACEHOLDER_(\d+)\]', final_post_with_placeholders)
+         final_blog_content = final_post_with_placeholders
+         assets_for_packaging = []
+
+         if found_indices:
+             items_map = {i: item for i, item in enumerate(items_with_descriptions)}
+             for new_index, original_index_str in enumerate(found_indices):
+                 original_index = int(original_index_str)
+                 item_data = items_map.get(original_index)
+                 if not item_data: continue
+
+                 placeholder_to_replace = f"[FIGURE_PLACEHOLDER_{original_index}]"
+
+                 if post_format == 'rich':
+                     new_asset_filename = f"img_{new_index}{Path(item_data['item_path']).suffix}"
+                     alt_text = f"Figure {new_index}"
+                     new_markdown_tag = f"![{alt_text}](./img/{new_asset_filename})"
+                     assets_for_packaging.append({'src_path': item_data['item_path'], 'dest_name': new_asset_filename, 'new_index': new_index})
+                 elif post_format == 'description_only':
+                     alt_text_description = item_data.get('description', f'Figure {new_index}').strip().replace('\n', ' ')
+                     new_markdown_tag = f"![{alt_text_description}]()"
+                 else:
+                     new_markdown_tag = ""
+                 final_blog_content = final_blog_content.replace(placeholder_to_replace, new_markdown_tag, 1)
+
+         final_blog_content = re.sub(r'\[FIGURE_PLACEHOLDER_(\d+)\]', '', final_blog_content)
+
+         if post_format == 'rich':
+             return final_blog_content, assets_for_packaging
+         else:
+             return final_blog_content, None
+
+
+ async def generate_baseline_post(
+     paper_text: str,
+     api_key: str,
+     api_base: str,
+     model: str,
+     platform: str,
+     language: str,
+     disable_qwen_thinking: bool = False,
+     mode: str = 'original',
+     assets_dir: Optional[str] = None
+ ) -> Tuple[str, List[Dict], int]:
+     """
+     Generates a post using a simple, single-prompt baseline method.
+     """
+     tqdm.write(f"\n--- Generating baseline post (mode: {mode}) for: Platform='{platform}', Language='{language}' ---")
+
+     async with setup_client(api_key, api_base) as client:
+         if not client:
+             return "Error: API client configuration failed.", [], 0
+
+         if mode == 'fewshot':
+             prompt_template = BASELINE_FEWSHOT_PROMPT_CHINESE if language == 'zh' else BASELINE_FEWSHOT_PROMPT_ENGLISH
+         else:
+             prompt_template = BASELINE_PROMPT_CHINESE if language == 'zh' else BASELINE_PROMPT_ENGLISH
+
+         user_prompt = prompt_template.format(paper_text=paper_text[:20000], platform=platform.capitalize())
+         system_prompt = "You are an assistant that summarizes academic papers for social media."
+
+         text_post, think_token_count = await call_text_llm_api_with_token_count(
+             local_client=client,
+             system_prompt=system_prompt,
+             user_prompt=user_prompt,
+             model=model,
+             disable_qwen_thinking=disable_qwen_thinking
+         )
+
+         if text_post.startswith("Error:"):
+             return text_post, [], think_token_count
+
+         final_post = text_post
+         assets_for_packaging = []
+         if mode == 'with_figure' and assets_dir and Path(assets_dir).is_dir():
+             tqdm.write("[*] Attaching top 3 figures/tables for 'with_figure' baseline...")
+
+             paired_item_dirs = [
+                 d for d in Path(assets_dir).rglob('paired_*')
+                 if d.is_dir() and (d.name.startswith('paired_figure_') or d.name.startswith('paired_table_'))
+             ]
+
+             def get_global_sort_key(dir_path: Path):
+                 page_num = -1
+                 item_type = ''
+                 item_index = -1
+
+                 try:
+                     page_match = re.search(r'page_(\d+)', dir_path.parts[-2])
+                     if page_match:
+                         page_num = int(page_match.group(1))
+                 except (IndexError, ValueError):
+                     pass
+
+                 item_match = re.search(r'paired_(figure|table)_(\d+)', dir_path.name)
+                 if item_match:
+                     item_type = item_match.group(1)
+                     item_index = int(item_match.group(2))
+
+                 return (page_num, item_index)
+
+             sorted_dirs = sorted(paired_item_dirs, key=get_global_sort_key)
+
+             all_items = []
+
+             for item_dir in sorted_dirs:
+                 item_type = 'figure' if 'figure' in item_dir.name else 'table'
+
+                 item_file = next(
+                     (f for f in item_dir.iterdir() if f.is_file() and f.name.startswith(item_type) and 'caption' not in f.name),
+                     None
+                 )
+                 if item_file:
+                     all_items.append(item_file)
+
+             selected_items = all_items[:3]
+
+             if selected_items:
+                 final_post += "\n\n--- Key Figures & Tables ---\n"
+                 for i, item_path in enumerate(selected_items):
+                     new_asset_filename = f"img_{i}{item_path.suffix}"
+                     alt_text = "Table" if "table" in item_path.parent.name else "Figure"
+                     alt_text += f" {i+1}"
+
+                     final_post += f"\n![{alt_text}](./img/{new_asset_filename})"
+                     assets_for_packaging.append({'src_path': str(item_path), 'dest_name': new_asset_filename})
+                 tqdm.write(f"[✓] Appended {len(selected_items)} items (figures/tables) to the post.")
+             else:
+                 tqdm.write("[!] Warning: 'with_figure' mode was selected, but no paired items were found.")
+
+         return final_post, assets_for_packaging, think_token_count
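
The two main entry points above are chained by `app.py`; a condensed sketch of that flow follows (paths, key, endpoint, and model names are placeholders, not part of this commit):

```python
import asyncio

from pragent.backend.blog_pipeline import generate_text_blog, generate_final_post

async def demo() -> None:
    base = "https://api.example.com/v1"  # placeholder endpoint
    # Stage 1: structured, factual draft from extracted plain text
    draft, source_text = await generate_text_blog(
        txt_path="paper.txt", api_key="sk-placeholder",
        text_api_base=base, model="gpt-4o", language="en",
    )
    # Stage 2: platform-adapted post with figures woven in
    post_md, assets = await generate_final_post(
        blog_draft=draft, source_paper_text=source_text,
        assets_dir="paired_results",  # output of run_figure_extraction
        text_api_key="sk-placeholder", vision_api_key="sk-placeholder",
        text_api_base=base, vision_api_base=base,
        text_model="gpt-4o", vision_model="gpt-4o",
        platform="twitter", language="en", post_format="rich",
    )
    print(post_md)

asyncio.run(demo())
```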
pragent/backend/data_loader.py ADDED
@@ -0,0 +1,53 @@
+ # data_loader.py
+ import asyncio
+ import aiofiles
+ from pathlib import Path
+ import re
+ from typing import List, Dict
+ from tqdm.asyncio import tqdm
+
+ async def load_plain_text(txt_path: str) -> str:
+     """Asynchronously load plain-text content from a .txt file."""
+     try:
+         async with aiofiles.open(txt_path, mode='r', encoding='utf-8') as f:
+             return await f.read()
+     except Exception as e:
+         tqdm.write(f"[!] Error reading text file '{txt_path}': {e}")
+         return ""
+
+ def load_paired_image_paths(base_dir: Path) -> List[Dict]:
+     """
+     Recursively scan 'paired_*' folders and load the paths of each main image and its caption image.
+     """
+     items = []
+     if not base_dir.is_dir():
+         tqdm.write(f"[!] Error: base folder of pairing results not found: {base_dir}")
+         return items
+
+     tqdm.write(f"[*] Recursively loading image/caption pairs from {base_dir}...")
+
+     item_dirs = sorted(
+         [d for d in base_dir.rglob('paired_*') if d.is_dir()],
+         key=lambda p: p.name
+     )
+
+     for item_dir in item_dirs:
+         item_files = list(item_dir.glob('*.jpg'))
+         if len(item_files) < 2:
+             continue
+
+         main_item_path, caption_path = None, None
+         for f in item_files:
+             if "caption" in f.name:
+                 caption_path = f
+             else:
+                 main_item_path = f
+
+         if main_item_path and caption_path:
+             items.append({
+                 "type": "figure" if "figure" in item_dir.name else "table",
+                 "item_path": str(main_item_path.resolve()),
+                 "caption_path": str(caption_path.resolve()),
+             })
+
+     tqdm.write(f"[*] Loading complete; found {len(items)} image/caption pairs.")
+     return items
pragent/backend/figure_table_pipeline.py ADDED
@@ -0,0 +1,118 @@
+ # figure_table_pipeline.py
+ import os
+ import shutil
+ import re
+ from pathlib import Path
+ from collections import defaultdict
+ from pragent.backend.loader import ImagePDFLoader
+ from pragent.backend.yolo import extract_and_save_layout_components
+ from tqdm.asyncio import tqdm
+
+ def run_figure_extraction(pdf_path: str, base_work_dir: str) -> str:
+     """
+     A complete pipeline that extracts and pairs figures/tables from a PDF.
+     This is the main function called by app.py.
+
+     Args:
+         pdf_path (str): Path to the PDF uploaded by the user.
+         base_work_dir (str): Temporary working directory for this session.
+
+     Returns:
+         str: Directory path of the final pairing results, or None on failure.
+     """
+     if not all([ImagePDFLoader, extract_and_save_layout_components]):
+         tqdm.write("[!] Error: one or more core dependencies of figure_pipeline failed to load.")
+         return None
+
+     pdf_file = Path(pdf_path)
+     pdf_stem = pdf_file.stem
+     model_path = "pragent/model/doclayout_yolo_docstructbench_imgsz1024.pt"
+
+     tqdm.write(f"\n--- Step 1/3: Converting PDF '{pdf_file.name}' to images ---")
+     page_save_dir = os.path.join(base_work_dir, "page_paper", pdf_stem)
+     os.makedirs(page_save_dir, exist_ok=True)
+     try:
+         loader = ImagePDFLoader(pdf_path)
+         page_image_paths = []
+         for i, img in enumerate(loader.load()):
+             path = os.path.join(page_save_dir, f"page_{i+1}.png")
+             img.save(path)
+             page_image_paths.append(path)
+         tqdm.write(f"[*] All {len(page_image_paths)} pages saved to: {page_save_dir}")
+     except Exception as e:
+         tqdm.write(f"[!] Error: failed to load or convert the PDF: {e}")
+         return None
+
+     tqdm.write("\n--- Step 2/3: Analyzing page layout to crop figures and tables ---")
+     cropped_results_dir = os.path.join(base_work_dir, "cropped_results", pdf_stem)
+     for path in page_image_paths:
+         page_num_str = Path(path).stem
+         page_crop_dir = os.path.join(cropped_results_dir, page_num_str)
+         extract_and_save_layout_components(image_path=path, model_path=model_path, save_base_dir=page_crop_dir)
+     tqdm.write(f"[*] All cropped results saved to: {cropped_results_dir}")
+
+     tqdm.write("\n--- Step 3/3: Pairing the cropped components ---")
+     final_paired_dir = os.path.join(base_work_dir, "paired_results", pdf_stem)
+     run_pairing_process(cropped_results_dir, final_paired_dir, threshold=30)
+
+     if os.path.isdir(final_paired_dir):
+         return final_paired_dir
+     return None
+
+ def run_pairing_process(source_dir_str: str, output_dir_str: str, threshold: int):
+     """Pairing logic, now part of the pipeline."""
+     source_dir = Path(source_dir_str)
+     output_root_dir = Path(output_dir_str)
+     if output_root_dir.exists():
+         shutil.rmtree(output_root_dir)
+     output_root_dir.mkdir(parents=True, exist_ok=True)
+
+     tqdm.write(f"Starting nearest-neighbor pairing process (threshold = {threshold})")
+
+     page_dirs = sorted([d for d in source_dir.iterdir() if d.is_dir() and d.name.startswith('page_')])
+     for page_dir in page_dirs:
+         output_page_dir = output_root_dir / page_dir.name
+         output_page_dir.mkdir(exist_ok=True)
+         pair_items_on_page(str(page_dir), str(output_page_dir), threshold)
+
+ def pair_items_on_page(page_dir: str, output_dir: str, threshold: int):
+     """Process a single page directory with nearest-neighbor pairing."""
+     organized_files = defaultdict(dict)
+     component_types = ["figure", "figure_caption", "table", "table_caption_above", "table_caption_below"]
+
+     def parse_filename(filename: str):
+         match = re.match(r'([a-zA-Z_]+)_(\d+)_score([\d.]+)\.jpg', filename)
+         return (match.group(1), int(match.group(2))) if match else (None, None)
+
+     for comp_type in component_types:
+         comp_dir = os.path.join(page_dir, comp_type)
+         if os.path.isdir(comp_dir):
+             for filename in os.listdir(comp_dir):
+                 _, index = parse_filename(filename)
+                 if index is not None:
+                     organized_files[comp_type][index] = os.path.join(comp_dir, filename)
+
+     paired_files, used_captions = set(), defaultdict(set)
+
+     for item_type, cap_types in [("figure", ["figure_caption"]), ("table", ["table_caption_above", "table_caption_below"])]:
+         for item_index, item_path in organized_files[item_type].items():
+             best_match = {'min_diff': float('inf'), 'cap_path': None, 'cap_index': -1, 'cap_type': ''}
+             for cap_type in cap_types:
+                 for cap_index, cap_path in organized_files[cap_type].items():
+                     if cap_index in used_captions[cap_type]:
+                         continue
+                     diff = abs(item_index - cap_index)
+                     if diff < best_match['min_diff']:
+                         best_match.update({'min_diff': diff, 'cap_path': cap_path, 'cap_index': cap_index, 'cap_type': cap_type})
+
+             if best_match['cap_path'] and best_match['min_diff'] <= threshold:
+                 target_dir = os.path.join(output_dir, f"paired_{item_type}_{item_index}")
+                 os.makedirs(target_dir, exist_ok=True)
+                 shutil.copy(item_path, target_dir)
+                 shutil.copy(best_match['cap_path'], target_dir)
+                 paired_files.add(item_path)
+                 paired_files.add(best_match['cap_path'])
+                 used_captions[best_match['cap_type']].add(best_match['cap_index'])
+
+     for files_dict in organized_files.values():
+         for file_path in files_dict.values():
+             if file_path not in paired_files:
+                 item_type, index = parse_filename(Path(file_path).name)
+                 if item_type:
+                     target_dir = os.path.join(output_dir, f"unpaired_{item_type}_{index}")
+                     os.makedirs(target_dir, exist_ok=True)
+                     shutil.copy(file_path, target_dir)
pragent/backend/html2txt.py ADDED
@@ -0,0 +1,30 @@
+ # html2txt.py
+ from bs4 import BeautifulSoup
+ import sys
+ import aiofiles
+ from tqdm.asyncio import tqdm
+
+ async def convert_html_to_txt(html_file_path: str, output_txt_path: str) -> bool:
+     try:
+         async with aiofiles.open(html_file_path, 'r', encoding='utf-8') as f:
+             html_from_file = await f.read()
+     except FileNotFoundError:
+         tqdm.write(f"[!] Error: Intermediate HTML file not found '{html_file_path}'.", file=sys.stderr)
+         return False
+     except Exception as e:
+         tqdm.write(f"[!] Error reading HTML file: {e}", file=sys.stderr)
+         return False
+
+     soup = BeautifulSoup(html_from_file, "lxml")
+     paragraphs = soup.find_all('p')
+
+     extracted_lines = [p.get_text(separator=" ", strip=True) for p in paragraphs if p.get_text(strip=True)]
+     tqdm.write(f"[*] Text extraction complete, found {len(extracted_lines)} valid lines of text.")
+
+     try:
+         full_text_content = "\n".join(extracted_lines)
+         async with aiofiles.open(output_txt_path, 'w', encoding='utf-8') as f:
+             await f.write(full_text_content)
+         return True
+     except Exception as e:
+         tqdm.write(f"[!] Error writing to TXT file: {e}", file=sys.stderr)
+         return False
pragent/backend/loader.py ADDED
@@ -0,0 +1,25 @@
+ # loader.py
+ import fitz
+ from PIL import Image
+ from typing import List
+ from tqdm.asyncio import tqdm
+
+ class ImagePDFLoader:
+     def __init__(self, file_path: str, dpi: int = 250):
+         self.file_path = file_path
+         self.dpi = dpi
+
+     def load(self) -> List[Image.Image]:
+         images = []
+         try:
+             doc = fitz.open(self.file_path)
+             for page in doc:
+                 zoom_matrix = fitz.Matrix(self.dpi / 72, self.dpi / 72)
+                 pix = page.get_pixmap(matrix=zoom_matrix, alpha=False)
+                 if pix.width > 0 and pix.height > 0:
+                     image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                     images.append(image)
+             doc.close()
+         except Exception as e:
+             tqdm.write(f"Error during PDF processing: {e}")
+             return []
+         return images
pragent/backend/pdf2html.py ADDED
@@ -0,0 +1,38 @@
+ # pdf2html.py
+ import fitz
+ from pathlib import Path
+ import sys
+ from bs4 import BeautifulSoup
+ import asyncio
+ import aiofiles
+ from tqdm.asyncio import tqdm
+ def convert_pdf_sync(pdf_path: str) -> str:
+     try:
+         doc = fitz.open(pdf_path)
+         tqdm.write(f"[*] Successfully opened PDF file: {pdf_path}")
+     except Exception as e:
+         tqdm.write(f"[!] Error: Could not open PDF file. {e}", file=sys.stderr)
+         return ""
+     full_html_content = ""
+     for page in doc:
+         full_html_content += page.get_text("html")
+     doc.close()
+     soup = BeautifulSoup(full_html_content, "lxml")
+     for img_tag in soup.find_all("img"):
+         img_tag.decompose()
+
+     return soup.prettify()
+
+ async def convert_pdf_to_text_only_html(pdf_path: str, output_path: str) -> bool:
+     cleaned_html = await asyncio.to_thread(convert_pdf_sync, pdf_path)
+     if not cleaned_html:
+         return False
+     try:
+         output_file = Path(output_path)
+         output_file.parent.mkdir(parents=True, exist_ok=True)
+         async with aiofiles.open(output_file, "w", encoding="utf-8") as f:
+             await f.write(cleaned_html)
+         return True
+     except Exception as e:
+         tqdm.write(f"[!] Error: Could not write HTML file. {e}", file=sys.stderr)
+         return False
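
Because `convert_pdf_sync` does blocking PyMuPDF work, the async wrapper pushes it onto a worker thread with `asyncio.to_thread` so other coroutines keep running. A standalone invocation sketch (paths are hypothetical):

import asyncio
from pragent.backend.pdf2html import convert_pdf_to_text_only_html

ok = asyncio.run(convert_pdf_to_text_only_html("paper.pdf", "out/paper.html"))
print("wrote text-only HTML" if ok else "conversion failed")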
pragent/backend/prompts.py ADDED
@@ -0,0 +1,518 @@
+ # prompts.py
+ # --- STAGE 1 PROMPT (English) ---
+ # This prompt is the foundation and remains unchanged. It creates a good, factual draft.
+ TEXT_GENERATOR_PROMPT = """
+ # Role: You are a top-tier technology analyst and industry commentator. Your articles are renowned for their depth, insight, and concise language, getting straight to the point and providing genuine value to readers.
+
+ # Task: Strictly adhere to all the requirements below to transform the provided "Original Paper Text" into a high-quality, high-density blog post in Markdown format, filled with expert-level insights.
+
+ # --- High-Quality Blog Post Example (Do Not Change This Format) ---
+
+ **Engaging Social Media Title: A Deep Dive into AI Memory, a New Survey from Huawei Noah's Ark Lab**
+
+ ✍️ **Authors**: Y. Wang, Z. Chen, et al. (from Huawei Noah's Ark Lab)
+ 📚 **Paper Title**: From Human Memory to AI Memory: A Survey on Memory Mechanisms in the Era of LLMs
+ 🌐 **Source**: arXiv:2504.15965 (Apr 23, 2025)
+
+ ---
+ *Body of the post starts here...*
+
+ 🔍 **The Research Question:** Traditional Large Language Models (LLMs) have significant limitations, especially when it comes to processing long texts and maintaining context. These constraints hinder their application in more complex tasks like multi-step reasoning, personalized dialogue, and long-term task management. While existing research offers some solutions, most only analyze memory from a temporal perspective, which is not comprehensive enough.
+
+ 💡 **Core Contributions:** To overcome these limitations, the research team proposes a novel memory taxonomy based on three dimensions—Object (individual vs. system), Form (parametric vs. non-parametric), and Time (short-term vs. long-term)—resulting in eight distinct quadrants. This framework aims to systematically understand memory in LLM-driven AI, drawing inspiration from human memory research to build more efficient systems.
+
+ 🚀 **The Key Method:** The proposed 3D-8Q memory taxonomy covers both individual and system memory, providing a detailed analysis of their form and temporal characteristics. This method allows researchers to systematically organize existing work and provides a guiding framework for future memory mechanism design.
+
+ 📊 **Key Results & Implications:** The team conducted experiments on multiple public datasets to validate the effectiveness of the 3D-8Q taxonomy. The results show that memory systems optimized with this framework demonstrate significant performance improvements in complex tasks such as multi-step reasoning, personalized dialogue, and long-term task management.
+
+ #LLM #RAG #Agent #Multimodal #LargeModels #RetrievalAugmentedGeneration
+
+ # --- Your Creative Task ---
+
+ # Core Requirements (Must Be Strictly Followed):
+
+ ## 1. Title and Authorship (for S1 Score):
+ - **Create a New Title**: Based on the original paper title, create a more engaging and accessible title for social media.
+ - **Extract Author Info**: Accurately identify and list the main authors from the "Original Paper Text". **Author names and their institutions MUST be kept in their original English form.** Use "et al." if there are too many.
+ - **Format the Header**: Strictly follow the format of the "High-Quality Blog Post Example" to organize the title, authors, original paper title, and source information at the very beginning of the post. Use the same emojis (✍️, 📚, 🌐).
+
+ ## 2. Content Structure (for S2 Score):
+ Your article must clearly contain the following core analytical modules. Do not add unnecessary sections.
+ - **The Research Question:** Precisely distill the core problem this paper aims to solve. What is the context and importance of this problem?
+ - **Core Contributions:** Clearly list the 1-2 most significant innovations or contributions of this paper. What's new here for the field?
+ - **The Key Method:** Break down the key method or core idea proposed in the paper. How does it achieve its contributions? What are the technical details?
+ - **Key Results & Implications:** What key results did the paper present to support its claims? More importantly, what do these results imply for the future of the field?
+
+ ## 3. Writing Style (for S2 & S7 Scores):
+ You must completely abandon the writing patterns of an AI assistant and adopt the perspective of a critical, analytical expert.
+ - **【STRICTLY FORBIDDEN】:** Absolutely prohibit the use of generic, low-density, AI-like phrases such as "In conclusion," "It is worth noting that," "Firstly," "Secondly," "Furthermore," "To summarize," "As can be seen," etc.
+ - **【BE CONCISE】:** Eliminate all filler words and conversational fluff. Every sentence must carry information.
+ - **【CONFIDENT & DIRECT】:** As an expert, you must state points directly and confidently. Use "The method validates..." instead of "The method seems to validate...".
+
+ ## 4. Formatting (for S8 Score):
+ - Use relevant emojis as visual guides for each core module, as shown in the example.
+ - Include relevant technical hashtags at the end of the post.
+
+ # Original Paper Text:
+ ---
+ {paper_text}
+ ---
+
+ Begin your creation. Remember, your goal is not to "imitate a human," but to "be an expert."
+ """
+
+ # --- STAGE 1 PROMPT (Chinese) ---
+ TEXT_GENERATOR_PROMPT_CHINESE = """
+ # 角色:你是一位顶尖的科技领域分析师和行业评论员。你的文章以深度、洞察力和精炼的语言著称,能够直击要点,为读者提供真正的价值。
+
+ # 任务:严格遵循以下的所有要求,将我提供的“原始论文文本”改编成一篇高质量、高信息密度、充满专家洞见的中文博客文章(Markdown格式)。
+
+ # --- 优质博客范例 (请严格遵守此格式) ---
+
+ **引人入胜的社交媒体标题:华为诺亚方舟新作,AI记忆机制的全面调查**
+
+ ✍️ **作者**: Y. Wang, Z. Chen, 等 (来自 华为诺亚方舟实验室)
+ 📚 **论文标题**: From Human Memory to AI Memory: A Survey on Memory Mechanisms in the Era of LLMs
+ 🌐 **来源**: arXiv:2504.15965 (2025年4月23日)
+
+ ---
+ *正文由此开始...*
+
+ 🔍 **研究问题:** 传统大型语言模型(LLM)在处理信息时,存在明显的局限性,尤其是在处理长文本和保持上下文连贯性方面。这些局限性限制了LLM在更广泛和复杂的任务中的应用,比如多步骤推理、个性化对话和长周期任务管理。现有的研究虽然提供了一些解决方案,但大多数只从时间维度分析了记忆机制,这显然不够全面。
+
+ 💡 **核心贡献:** 为了克服当前记忆机制的局限,研究团队提出了一种新的记忆分类法,基于对象(个人和系统)、形式(参数和非参数)和时间(短期和长期)三个维度,以及八个象限来进行系统性的分类和分析。这一分类法旨在更好地理解LLM驱动的AI系统中的记忆机制,并借鉴人类记忆的研究成果,构建更高效的记忆系统。
+
+ 🚀 **重点方法:** 本文提出的3D-8Q记忆分类法,不仅涵盖了个人记忆和系统记忆,还详细分析了记忆的形式和时间特性。通过这种方法,研究团队能够更系统地组织现有的研究工作,为未来的记忆机制设计提供指导。
+
+ 📊 **关键结果与意义:** 研究团队在多个公开数据集上进行了实验,验证了3D-8Q记忆分类法的有效性。实验结果显示,通过这种分类法优化的记忆系统在多步骤推理、个性化对话和长周期任务管理等复杂任务中表现出了显著的性能提升。
+
+ #LLM[话题]# #RAG[话题]# #agent[话题]# #multimodal[话题]# #大模型[话题]# #检索增强[话题]#
+
+ # --- 你的创作任务 ---
+
+ # 核心要求 (必须严格遵守):
+
+ ## 1. 标题与作者信息 (for S1 Score):
+ - **创作新标题**: 基于原文标题,创作一个更吸引人、更易于理解的中文社交媒体标题。
+ - **提取作者信息**: 从“原始论文文本”中准确识别并列出主要作者。**作者姓名和所属研究机构必须保留其原始英文格式,不得翻译。** 如果作者过多,可以使用“等” (et al.)。
+ - **格式化头部**: 严格按照“优质博客范例”的格式,在文章最开头组织标题、作者、原始论文标题和来源信息。使用相同的表情符号 (✍️, 📚, 🌐)。
+
+ ## 2. 内容结构 (for S2 Score):
+ 你的文章必须清晰地包含以下几个核心分析模块,不要添加不必要的章节:
+ - **研究问题:** 精准提炼这篇论文到底要解决什么核心问题?这个问题的背景和重要性是什么?
+ - **核心贡献:** 清晰地列出本文最主要的1-2个创新点或贡献。这篇论文的出现,为领域带来了什么新东西?
+ - **重点方法:** 详细拆解论文提出的关键方法或核心思路。它是如何实现其贡献的?技术细节是什么?
+ - **关键结果与意义:** 论文通过实验得到了什么关键结果来支撑其观点?更重要的是,这些结果对未来意味着什么?
+
+ ## 3. 写作风格 (for S2 & S7 Scores):
+ - **【严厉禁止】:** 绝对禁止使用“总而言之”、“值得注意的是”、“首先”、“其次”、“此外”、“综上所述”、“不难发现”这类AI常用、且降低信息密度的八股文词汇。
+ - **【精炼语言】:** 砍掉所有不必要的修饰和口语化闲聊。每一句话都应承载信息。
+ - **【自信与直接】:** 作为一个专家,你需要直接、自信地陈述观点。用“该方法验证了...”代替“该方法似乎验证了...”。
+
+ ## 4. 格式要求 (for S8 Score):
+ - 使用贴切的表情符号作为每个核心模块的视觉引导,如范例所示。
+ - 在文末附上相关的技术话题标签(Hashtags),使用 `[话题]` 格式。
+
+ # 原始论文文本:
+ ---
+ {paper_text}
+ ---
+
+ 开始你的创作。记住,你的目标不是“模仿人类”,而是“成为专家”。
+ """
+
+ # ==============================================================================
+ # --- STAGE 2 PROMPTS (FINISHERS - UNIFIED STRATEGY FOR P2 & P3 METRICS) ---
+ # ==============================================================================
+
+ # ------------------------------------------------------------------------------
+ # --- A. TWITTER (X) PROMPTS ---
+ # ------------------------------------------------------------------------------
+ TWITTER_RICH_TEXT_PROMPT_ENGLISH = """
+ # ROLE: You are an expert communicator—a researcher who can captivate both peers and the public. Your goal is to create a Twitter (X) thread that is both technically credible and excitingly viral.
+
+ # TASK: Rewrite the provided draft into a single, high-impact Twitter thread that satisfies BOTH busy professionals and curious enthusiasts.
+
+ # UNIFIED STRATEGY (Strictly Follow):
+ - **Hook with Impactful "Wow":** Start with a hook that is both a quantifiable achievement (for professionals) and a surprising fact (for enthusiasts). E.g., "Just cut model inference time by 50% with a surprisingly simple geometric trick. Here's the story: 🧵"
+ - **Intuitive Storytelling with Hard Data:** Frame the content as a story (Problem -> Insight -> Solution). Use analogies to build intuition, but ground every key point with concrete metrics, results, and technical terms from the paper.
+ - **Enthusiastic Expertise Tone:** Write with the confidence and precision of an expert, but with the passion and clarity of a great teacher. Avoid dry, academic language AND overly simplistic fluff.
+ - **Visually Informative:** Choose figures that are both information-dense (showing data, architecture) and visually clean/compelling.
+
+ # YOUR INSTRUCTIONS
+ 1. **Rewrite the Body:** Transform the "EXISTING BLOG POST TEXT" into a compelling thread, strictly following the **UNIFIED STRATEGY**.
+ 2. **Integrate Figures:** Weave the figures into the narrative where they best support a key insight or result. Place the figure placeholder on its own new line.
+ 3. **Incorporate Author/Paper Info:** Naturally integrate author and paper details. **Ensure author names and institutions remain in English.**
+ 4. **Add Engagement Elements:** End with a thought-provoking question and 3-5 hashtags that appeal to both audiences (e.g., #AI, #MachineLearning, #Innovation).
+ 5. **Output Format:** Your response must be **only** the final, ready-to-publish thread text.
+
+ # ORIGINAL SOURCE TEXT (for deep context):
+ ---
+ {source_text}
+ ---
+ # EXISTING BLOG POST TEXT (to be rewritten):
+ ---
+ {blog_text}
+ ---
+ # AVAILABLE FIGURES AND DESCRIPTIONS:
+ ---
+ {items_list_str}
+ ---
+ """
+
+ TWITTER_TEXT_ONLY_PROMPT_ENGLISH = """
+ # ROLE: You are an expert communicator—a researcher who can captivate both peers and the public. Your goal is to create a **text-only** Twitter (X) thread that is both technically credible and excitingly viral.
+
+ # TASK: Rewrite the provided draft into a single, high-impact, **text-only** Twitter thread that satisfies BOTH busy professionals and curious enthusiasts.
+
+ # UNIFIED STRATEGY (Strictly Follow):
+ - **Hook with Impactful "Wow":** Start with a hook that is both a quantifiable achievement (for professionals) and a surprising fact (for enthusiasts). E.g., "Just cut model inference time by 50% with a surprisingly simple geometric trick. Here's the story: 🧵"
+ - **Intuitive Storytelling with Hard Data:** Frame the content as a story (Problem -> Insight -> Solution). Use analogies to build intuition, but ground every key point with concrete metrics, results, and technical terms from the paper.
+ - **Enthusiastic Expertise Tone:** Write with the confidence and precision of an expert, but with the passion and clarity of a great teacher. Avoid dry, academic language AND overly simplistic fluff.
+
+ # YOUR INSTRUCTIONS
+ 1. **Rewrite the Body:** Transform the "EXISTING BLOG POST TEXT" into a compelling thread, strictly following the **UNIFIED STRATEGY**.
+ 2. **Incorporate Author/Paper Info:** Naturally integrate author and paper details. **Ensure author names and institutions remain in English.**
+ 3. **Add Engagement Elements:** End with a thought-provoking question and 3-5 hashtags that appeal to both audiences (e.g., #AI, #MachineLearning, #Innovation).
+ 4. **Output Format:** Your response must be **only** the final, ready-to-publish thread text.
+
+ # EXISTING BLOG POST TEXT (to be rewritten):
+ ---
+ {blog_text}
+ ---
+ """
+
+ TWITTER_RICH_TEXT_PROMPT_CHINESE = """
+ # 角色: 你是一位顶级的沟通专家——一个既能吸引同行又能吸引公众的研究者。你的目标是创作一个既有技术可信度又具病毒式传播潜力的推特(X平台)帖子。
+
+ # 任务: 将提供的草稿改写成一个能同时满足忙碌专业人士和好奇爱好者的高影响力推文串。
+
+ # 统一策略 (必须严格遵守):
+ - **用“惊人”的“量化”成果开场:** 开头必须一句话同时包含“可量化的成果”(吸引专业人士)和“惊人/反直觉的事实”(吸引爱好者)。例如:“我们用一个惊人简单的几何技巧,把模型推理时间砍掉一半。这背后是一个有趣的故事:🧵”
+ - **用硬核数据讲述直观故事:** 将内容构建成一个故事(问题 -> 洞察 -> 解决方案)。用类比来建立直觉,但每个关键节点都必须有论文中的具体指标、结果和技术术语作为支撑。
+ - **充满热情的专家口吻:** 以专家的自信和严谨,结合优秀老师的热情和清晰来写作。避免干巴巴的学术腔和过于简化的“废话”。
+ - **图片信息丰富且吸引人:** 选择的图片必须既信息密集(展示数据、架构),又视觉清晰、有吸引力。
+
+ # 你的指令
+ 1. **重写正文:** 严格遵循 **统一策略**,将“现有博客草稿”改写成一个引人注目的推文串。
+ 2. **整合图文:** 将图表融入叙事中,选择最能支撑关键洞察或成果的位置。将图表占位符放置在单独的新行。
+ 3. **融入作者/论文信息:** 自然地整合作者和论文信息。**确保作者姓名和单位保留其原始英文格式。**
+ 4. **添加互动元素:** 以一个引人深思的问题结尾,并附上3-5个能同时吸引两类受众的话题标签 (例如, #人工智能, #机器学习, #科技创新)。
+ 5. **输出格式:** 你的回应**只能**是最终的、可直接发布的帖子内容。
+
+ # 原始论文(供深度参考):
+ ---
+ {source_text}
+ ---
+ # 现有博客草稿(待改写):
+ ---
+ {blog_text}
+ ---
+ # 可用图表及描述:
+ ---
+ {items_list_str}
+ ---
+ """
+
+ TWITTER_TEXT_ONLY_PROMPT_CHINESE = """
+ # 角色: 你是一位顶级的沟通专家——一个既能吸引同行又能吸引公众的研究者。你的目标是创作一个既有技术可信度又具病毒式传播潜力的**纯文本**推特(X平台)帖子。
+
+ # 任务: 将提供的草稿改写成一个能同时满足忙碌专业人士和好奇爱好者的高影响力**纯文本**推文串。
+
+ # 统一策略 (必须严格遵守):
+ - **用“惊人”的“量化”成果开场:** 开头必须一句话同时包含“可量化的成果”(吸引专业人士)和“惊人/反直觉的事实”(吸引爱好者)。例如:“我们用一个惊人简单的几何技巧,把模型推理时间砍掉一半。这背后是一个有趣的故事:🧵”
+ - **用硬核数据讲述直观故事:** 将内容构建成一个故事(问题 -> 洞察 -> 解决方案)。用类比来建立直觉,但每个关键节点都必须有论文中的具体指标、结果和技术术语作为支撑。
+ - **充满热情的专家口吻:** 以专家的自信和严谨,结合优秀老师的热情和清晰来写作。避免干巴巴的学术腔和过于简化的“废话”。
+
+ # 你的指令
+ 1. **重写正文:** 严格遵循 **统一策略**,将“现有博客草稿”改写成一个引人注目的推文串。
+ 2. **融入作者/论文信息:** 自然地整合作者和论文信息。**确保作者姓名和单位保留其原始英文格式。**
+ 3. **添加互动元素:** 以一个引人深思的问题结尾,并附上3-5个能同时吸引两类受众的话题标签 (例如, #人工智能, #机器学习, #科技创新)。
+ 4. **输出格式:** 你的回应**只能**是最终的、可直接发布的帖子内容。
+
+ # 现有博客草稿(待改写):
+ ---
+ {blog_text}
+ ---
+ """
+
+ # ------------------------------------------------------------------------------
+ # --- B. XIAOHONGSHU PROMPTS ---
+ # ------------------------------------------------------------------------------
+ XIAOHONGSHU_PROMPT_ENGLISH = """
+ # ROLE: You are an expert tech content creator on Xiaohongshu. Your style is a perfect blend of a professional's "dry goods" (干货) and a science communicator's engaging storytelling.
+
+ # TASK: Transform the provided draft into a single, high-quality Xiaohongshu post that is highly valuable to BOTH industry professionals and curious tech enthusiasts.
+
+ # UNIFIED STRATEGY (Strictly Follow):
+ - **Title is an "Impactful Hook":** The title must be a compelling hook that also states the core, quantifiable achievement. E.g., "This AI paper is a must-read! 🤯 They boosted performance by 30% with one clever trick."
+ - **Narrative Structure with Clear Signposts:** Start with a story-like intro (the "why"). Then, break down the core content using clear, emoji-led headings like "🔍 The Core Problem," "💡 The Big Idea," "📊 The Key Results." This makes it scannable for professionals and easy to follow for enthusiasts.
+ - **Intuition-Building backed by Data:** Explain complex ideas using simple analogies, but immediately follow up with the key technical terms and performance metrics from the paper.
+ - **Visually Compelling and Informative Images:** Select figures that are clean and easy to understand, but also contain the key data or diagrams that a professional would want to see.
+
+ # YOUR STEP-BY-STEP EXECUTION PLAN
+ ### STEP 1: Rewrite the Post Body
+ * **Create the Title and Body:** Rewrite the entire post following the **UNIFIED STRATEGY**.
+ * **Include Author Info:** After the title, you MUST include the author, paper title, and source details. **Ensure author names and institutions remain in their original English form.**
+ * **Format for Scannability:** Use emojis, short paragraphs, and bold text to make the post visually appealing and easy to digest.
+ ### STEP 2: Select and Append Best Images
+ * **Select the 3-4 most suitable figures** that align with the **UNIFIED STRATEGY**.
+ * **Append ONLY the placeholders for these selected figures to the very end of the post.**
+ ### STEP 3: Drive Engagement
+ * **Topic Tags (#):** Add a mix of broad and specific hashtags (e.g., `#AI`, `#Tech`, `#DataScience`, `#LLM`).
+ * **Call to Action (CTA):** End with a CTA that invites discussion from everyone (e.g., "This could change so much! What do you all think? 👇").
+
+ # --- AVAILABLE ASSETS ---
+ ## 1. Structured Draft:
+ {blog_text}
+ ## 2. Available Figures and Descriptions:
+ {items_list_str}
+ # --- FINAL OUTPUT ---
+ Your final output must be **only the complete, ready-to-publish post text, with the selected image placeholders at the end**.
+ """
+
+ XIAOHONGSHU_TEXT_ONLY_PROMPT_ENGLISH = """
+ # ROLE: You are an expert tech content creator on Xiaohongshu. Your style is a perfect blend of a professional's "dry goods" (干货) and a science communicator's engaging storytelling.
+
+ # TASK: Transform the provided draft into a single, high-quality, **text-only** Xiaohongshu post that is valuable to BOTH industry professionals and curious tech enthusiasts. **DO NOT include image placeholders.**
+
+ # UNIFIED STRATEGY (Strictly Follow):
+ - **Title is an "Impactful Hook":** The title must be a compelling hook that also states the core, quantifiable achievement. E.g., "This AI paper is a must-read! 🤯 They boosted performance by 30% with one clever trick."
+ - **Narrative Structure with Clear Signposts:** Start with a story-like intro (the "why"). Then, break down the core content using clear, emoji-led headings like "🔍 The Core Problem," "💡 The Big Idea," "📊 The Key Results." This makes it scannable for professionals and easy to follow for enthusiasts.
+ - **Intuition-Building backed by Data:** Explain complex ideas using simple analogies, but immediately follow up with the key technical terms and performance metrics from the paper.
+
+ # YOUR STEP-BY-STEP EXECUTION PLAN
+ ### STEP 1: Rewrite the Post Body
+ * **Create the Title and Body:** Rewrite the entire post following the **UNIFIED STRATEGY**.
+ * **Include Author Info:** After the title, you MUST include the author, paper title, and source details. **Ensure author names and institutions remain in their original English form.**
+ * **Format for Scannability:** Use emojis, short paragraphs, and bold text to make the post visually appealing and easy to digest.
+ ### STEP 2: Drive Engagement
+ * **Topic Tags (#):** Add a mix of broad and specific hashtags (e.g., `#AI`, `#Tech`, `#DataScience`, `#LLM`).
+ * **Call to Action (CTA):** End with a CTA that invites discussion from everyone (e.g., "This could change so much! What do you all think? 👇").
+
+ # --- Structured Draft ---
+ {blog_text}
+ # --- FINAL OUTPUT ---
+ Your final output must be **only the complete, ready-to-publish text-only post**.
+ """
+
+ XIAOHONGSHU_PROMPT_CHINESE = """
+ # 角色: 你是一位顶尖的小红书科技博主,完美融合了专业人士的“干货”分享与科普作家的生动叙事。
+
+ # 任务: 将提供的草稿,改编成一篇能同时吸引行业专家和科技爱好者的高质量小红书笔记。
+
+ # 统一策略 (必须严格遵守):
+ - **标题是“有冲击力的钩子”:** 标题必须既能激发好奇心,又包含核心的、可量化的成果。例如:“这篇AI论文必读!🤯一个巧思把性能提升30%”
+ - **带有清晰路标的叙事结构:** 以故事性的“为什么”开场,然后用清晰的、表情符号引导的标题(如 🔍核心问题, 💡天才想法, 📊关键结果)来拆解核心内容。这既方便专家快速浏览,也利于爱好者跟上思路。
+ - **数据支撑下的直觉建立:** 用简单的类比解释复杂概念,但紧接着必须给出论文中的关键技术术语和性能指标。
+ - **图片既要信息量大又要吸引人:** 选择的图片要清晰易懂,同时包含专家想看的关键数据或架构图。
+
+ # 你的执行步骤
+ ### 第一步:重写笔记正文
+ * **创作标题和正文:** 严格遵循 **统一策略** 重写整个帖子。
+ * **包含作者信息:** 在标题后,**必须**包含作者、论文标题和来源等详细信息。**确保作者姓名和单位保留其原始英文格式。**
+ * **为易读性排版:** 大量使用表情符号、短段落和粗体,使笔记视觉上吸引人且易于消化。
+ ### 第二步:挑选并附加最佳图片
+ * **挑选3-4张最符合统一策略的图片。**
+ * **只将这些被选中图片的占位符,附加到笔记的最后面。**
+ ### 第三步:引导互动
+ * **话题标签:** 添加组合标签,既有宽泛的也有具体的 (例如: `#AI[话题]#`, `#黑科技[话题]#`, `#数据科学[话题]#`, `#大语言模型[话题]#`)。
+ * **行动号召:** 用一个能邀请所有人讨论的CTA结尾 (例如: “这个想法太妙了!大家怎么看?👇”)。
+
+ # --- 可用材料 ---
+ ## 1. 结构化草稿:
+ {blog_text}
+ ## 2. 可用图文及描述:
+ {items_list_str}
+ # --- 最终输出 ---
+ 你的全部回应**只能**是最终的、可直接发布的帖子内容,最后附加上被选中的图片占位符。
+ """
+
+ XIAOHONGSHU_TEXT_ONLY_PROMPT_CHINESE = """
+ # 角色: 你是一位顶尖的小红书科技博主,完美融合了专业人士的“干货”分享与科普作家的生动叙事。
+
+ # 任务: 将提供的草稿,改编成一篇能同时吸引行业专家和科技爱好者的高质量**纯文本**小红书笔记。**不要包含图片占位符。**
+
+ # 统一策略 (必须严格遵守):
+ - **标题是“有冲击力的钩子”:** 标题必须既能激发好奇心,又包含核心的、可量化的成果。例如:“这篇AI论文必读!🤯一个巧思把性能提升30%”
+ - **带有清晰路标的叙事结构:** 以故事性的“为什么”开场,然后用清晰的、表情符号引导的标题(如 🔍核心问题, 💡天才想法, 📊关键结果)来拆解核心内容。这既方便专家快速浏览,也利于爱好者跟上思路。
+ - **数据支撑下的直觉建立:** 用简单的类比解释复杂概念,但紧接着必须给出论文中的关键技术术语和性能指标。
+
+ # 你的执行步骤
+ ### 第一步:重写笔记正文
+ * **创作标题和正文:** 严格遵循 **统一策略** 重写整个帖子。
+ * **包含作者信息:** 在标题后,**必须**包含作者、论文标题和来源等详细信息。**确保作者姓名和单位保留其原始英文格式。**
+ * **为易读性排版:** 大量使用表情符号、短段落和粗体,使笔记视觉上吸引人且易于消化。
+ ### 第二步:引导互动
+ * **话题标签:** 添加组合标签,既有宽泛的也有具体的 (例如: `#AI[话题]#`, `#黑科技[话题]#`, `#数据科学[话题]#`, `#大语言模型[话题]#`)。
+ * **行动号召:** 用一个能邀请所有人讨论的CTA结尾 (例如: “这个想法太妙了!大家怎么看?👇”)。
+
+ # --- 结构化草稿 ---
+ {blog_text}
+ # --- 最终输出 ---
+ 你的全部回应**只能**是最终的、可直接发布的**纯文本**帖子内容。
+ """
+
+ # ==============================================================================
+ # --- NEW: BASELINE PROMPTS ---
+ # ==============================================================================
+
+ BASELINE_PROMPT_ENGLISH = """
+ # ROLE: You are a helpful assistant.
+
+ # TASK: Read the provided research paper text and write a brief social media post about it for the platform '{platform}'.
+
+ # RESEARCH PAPER TEXT:
+ ---
+ {paper_text}
+ ---
+
+ # YOUR SOCIAL MEDIA POST:
+ """
+
+ BASELINE_PROMPT_CHINESE = """
+ # 角色: 你是一个乐于助人的助手。
+
+ # 任务: 阅读以下提供的论文文本,并为平台 '{platform}' 撰写一篇简短的社交媒体帖子。
+
+ # 论文文本:
+ ---
+ {paper_text}
+ ---
+
+ # 你的社交媒体帖子:
+ """
+
+
+ GENERIC_RICH_PROMPT_ENGLISH = """
+ # ROLE: You are an AI assistant.
+
+ # TASK: Rewrite the following structured draft into a simple and clear social media post.
+ - The post should be easy for a general audience to understand.
+ - If figures are provided, integrate them into the text where they seem most relevant using the format `[FIGURE_PLACEHOLDER_X]`, where X is the figure number.
+ - Your output must be ONLY the final text for the post.
+
+ # EXISTING BLOG POST TEXT (to be rewritten):
+ ---
+ {blog_text}
+ ---
+ # AVAILABLE FIGURES AND DESCRIPTIONS:
+ ---
+ {items_list_str}
+ ---
+ """
+
+ GENERIC_TEXT_ONLY_PROMPT_ENGLISH = """
+ # ROLE: You are an AI assistant.
+
+ # TASK: Rewrite the following structured draft into a simple, clear, text-only social media post.
+ - The post should be easy for a general audience to understand.
+ - Your output must be ONLY the final text for the post.
+
+ # EXISTING BLOG POST TEXT (to be rewritten):
+ ---
+ {blog_text}
+ ---
+ """
+
+ GENERIC_RICH_PROMPT_CHINESE = """
+ # 角色: 你是一个AI助手。
+
+ # 任务: 将以下结构化草稿,改写成一篇简单、清晰的社交媒体帖子。
+ - 帖子内容应便于普通读者理解。
+ - 如果提供了图表信息,请在文本中最相关的位置使用 `[FIGURE_PLACEHOLDER_X]` 格式来引用它们,X是图表编号。
+ - 你的输出必须只有最终的帖子文本。
+
+ # 现有博客草稿 (待改写):
+ ---
+ {blog_text}
+ ---
+ # 可用图表及描述:
+ ---
+ {items_list_str}
+ ---
+ """
+
+ GENERIC_TEXT_ONLY_PROMPT_CHINESE = """
+ # 角色: 你是一个AI助手。
+
+ # 任务: 将以下结构化草稿,改写成一篇简单、清晰的纯文本社交媒体帖子。
+ - 帖子内容应便于普通读者理解。
+ - 你的输出必须只有最终的帖子文本。
+
+ # 现有博客草稿 (待改写):
+ ---
+ {blog_text}
+ ---
+ """
+
+
+
+ BASELINE_FEWSHOT_PROMPT_ENGLISH = """
+ # ROLE: You are a helpful assistant.
+
+ # TASK: Read the provided example and write an academic promotion social media post for the platform '{platform}'. Follow the example provided.
+
+ # --- EXAMPLE ---
+ ## PLATFORM: Twitter
+ ## Example:
+
+ I’m stoked to share our new paper: “Harnessing the Universal Geometry of Embeddings” with @jxmnop
+ , Collin Zhang, and @shmatikov.
+ We present the first method to translate text embeddings across different spaces without any paired data or encoders.
+ Here's why we're excited: 🧵👇🏾
+ --------------------------------------------------------------------------
+ 🌀 Preserving Geometry
+ Our method, vec2vec, reveals that all encoders—regardless of architecture or training data—learn nearly the same representations!
+ We demonstrate how to translate between these black-box embeddings without any paired data, maintaining high fidelity.
+ --------------------------------------------------------------------------
+ 🔐 Security Implications
+ Using vec2vec, we show that vector databases reveal (almost) as much as their inputs.
+ Given just vectors (e.g., from a compromised vector database), we show that an adversary can extract sensitive information (e.g., PII) about the underlying text.
+ --------------------------------------------------------------------------
+ 🧠 Strong Platonic Representation Hypothesis (S-PRH)
+ We thus strengthen Huh et al.'s PRH to say:
+ The universal latent structure of text representations can be learned and harnessed to translate representations from one space to another without any paired data or encoders.
+ --------------------------------------------------------------------------
+ 📄 Read the Full Paper
+ Dive into the details here: https://arxiv.org/pdf/2505.12540
+ We welcome feedback and discussion!
+
+
+ ---
+ # --- YOUR TASK ---
+
+ # RESEARCH PAPER TEXT:
+ ---
+ {paper_text}
+ ---
+
+ # YOUR SOCIAL MEDIA POST:
+ """
+
+ BASELINE_FEWSHOT_PROMPT_CHINESE = """
+ # 角色: 你是一个乐于助人的助手。
+
+ # 任务: 阅读以下提供的例子,并为平台 '{platform}' 撰写一篇宣传论文的社交媒体帖子。请参考范例。
+
+ # --- 范例 ---
+ ## 平台: 小红书
+ ## 范例:
+ 🌐arXiv ID: arXiv:2504.15965
+ 📚论文标题: From Human Memory to AI Memory: A Survey on Memory Mechanisms in the Era of LLMs
+ 🔍 问题背景:传统大型语言模型(LLM)在处理信息时,存在明显的局限性,尤其是在处理长文本和保持上下文连贯性方面。这些局限性限制了LLM在更广泛和复杂的任务中的应用,比如多步骤推理、个性化对话和长周期任务管理。现有的研究虽然提供了一些解决方案,但大多数只从时间维度分析了记忆机制,这显然不够全面。
+ 💡 研究动机:为了克服当前记忆机制的局限,研究团队提出了一种新的记忆分类法,基于对象(个人和系统)、形式(参数和非参数)和时间(短期和长期)三个维度,以及八个象限来进行系统性的分类和分析。这一分类法旨在更好地理解LLM驱动的AI系统中的记忆机制,并借鉴人类记忆的研究成果,构建更高效的记忆系统。
+ 🚀 方法简介:本文提出的3D-8Q记忆分类法,不仅涵盖了个人记忆和系统记忆,还详细分析了记忆的形式和时间特性。通过这种方法,研究团队能够更系统地组织现有的研究工作,为未来的记忆机制设计提供指导。
+ 📊 实验设计:研究团队在多个公开数据集上进行了实验,验证了3D-8Q记忆分类法的有效性。实验结果显示,通过这种分类法优化的记忆系统在多步骤推理、个性化对话和长周期任务管理等复杂任务中表现出了显著的性能提升。
+
+ #LLM[话题]# #RAG[话题]# #agent[话题]# #multimodal[话题]# #大模型[话题]# #检索增强[话题]# #多模态[话题]#
+ ---
+ # --- 你的任务 ---
+
+ # 论文文本:
+ ---
+ {paper_text}
+ ---
+
+ # 你的社交媒体帖子:
+ """
pragent/backend/text_pipeline.py ADDED
@@ -0,0 +1,42 @@
+ # pragent/backend/text_pipeline.py
+
+ import asyncio
+ import sys
+ import os
+ from pathlib import Path
+ import aiofiles.os
+ from tqdm.asyncio import tqdm
+ from pragent.backend.pdf2html import convert_pdf_to_text_only_html
+ from pragent.backend.html2txt import convert_html_to_txt
+
+ # MODIFIED FOR ABLATION STUDY: Added ablation_mode parameter
+ async def pipeline(pdf_path: str, output_txt_path: str, ablation_mode: str = "none"):
+     """
+     Defines the complete ASYNCHRONOUS conversion flow from PDF to TXT.
+     The ablation_mode parameter is accepted but the primary logic for summarization
+     ablation is handled downstream in blog_pipeline.py.
+     """
+     tqdm.write("--- PDF to TXT Conversion Pipeline Started ---")
+
+     pdf_file = Path(pdf_path)
+     intermediate_html_path = pdf_file.with_suffix(".temp.html")
+
+     tqdm.write("\n--- Step 1/3: Converting PDF to HTML ---")
+     if not await convert_pdf_to_text_only_html(pdf_path, str(intermediate_html_path)):
+         tqdm.write("[!] PDF to HTML conversion failed. Aborting pipeline.", file=sys.stderr)
+         return
+
+     tqdm.write(f"\n--- Step 2/3: Converting HTML to TXT ---")
+     if not await convert_html_to_txt(str(intermediate_html_path), output_txt_path):
+         tqdm.write("[!] HTML to TXT conversion failed. Aborting pipeline.", file=sys.stderr)
+     else:
+         tqdm.write(f"\n[✓] Success! Final text file saved to: {output_txt_path}")
+
+     tqdm.write(f"\n--- Step 3/3: Cleaning up temporary files ---")
+     try:
+         await aiofiles.os.remove(intermediate_html_path)
+         tqdm.write(f"[*] Temporary file '{intermediate_html_path.name}' deleted successfully.")
+     except OSError as e:
+         tqdm.write(f"[!] Error deleting temporary file: {e}", file=sys.stderr)
+
+     tqdm.write("\n--- Pipeline Finished ---")
pragent/backend/text_processor.py ADDED
@@ -0,0 +1,162 @@
+ # pragent/backend/text_processor.py
+ import re
+ from typing import List, Tuple
+ from langchain_openai import ChatOpenAI
+ from langchain.chains.summarize import load_summarize_chain
+ from langchain.docstore.document import Document
+ from langchain.prompts import PromptTemplate
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from openai import AsyncOpenAI, BadRequestError
+ from tqdm.asyncio import tqdm
+
+ SUMMARIZATION_THRESHOLD = 4000
+ FALLBACK_HEADER_SIZE = 3000
+
+ def create_llm(model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False):
+     """Creates a LangChain LLM object from the provided client."""
+     if not client:
+         raise ValueError("API client is not initialized.")
+
+     model_kwargs = {}
+     if "qwen3" in model.lower() and disable_qwen_thinking:
+         tqdm.write("[*] Summarizer: Enabling 'disable_thinking' for Qwen3 model.")
+         model_kwargs["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}}
+
+     return ChatOpenAI(
+         model_name=model,
+         openai_api_key=client.api_key,
+         openai_api_base=str(client.base_url),
+         model_kwargs=model_kwargs  # Pass the extra arguments here
+     )
+
+ def split_text_by_structure(long_text: str) -> Tuple[str, str]:
+     """
+     Intelligently splits the text into a "header" (title, authors, abstract) and "body".
+     It looks for keywords like "Abstract" and "Introduction" to determine the split point.
+     """
+     abstract_match = re.search(r'\bAbstract\b', long_text, re.IGNORECASE)
+     if not abstract_match:
+         tqdm.write("[!] 'Abstract' keyword not found. Falling back to fixed character count for splitting.")
+         return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]
+
+     intro_match = re.search(r'(\n\s*(\d+|I|II|III|IV|V)\.?\s*)?Introduction', long_text[abstract_match.end():], re.IGNORECASE)
+
+     if not intro_match:
+         tqdm.write("[!] 'Introduction' keyword not found after 'Abstract'. Falling back to fixed character count for splitting.")
+         return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]
+
+     split_point = abstract_match.end() + intro_match.start()
+
+     header_text = long_text[:split_point]
+     body_text = long_text[split_point:]
+
+     tqdm.write(f"[*] Successfully separated header via keywords ({len(header_text)} characters).")
+     return header_text, body_text
+
+ # --- MODIFIED: Added disable_qwen_thinking parameter ---
+ async def summarize_long_text(long_text: str, model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False) -> str:
+     """
+     Asynchronously summarizes long text using a structure-aware hybrid strategy.
+     """
+     if not long_text:
+         return ""
+
+     if len(long_text) <= SUMMARIZATION_THRESHOLD:
+         tqdm.write(f"[*] Total text length ({len(long_text)} chars) is below threshold {SUMMARIZATION_THRESHOLD}. Skipping summarization.")
+         return long_text
+
+     header_text, body_text = split_text_by_structure(long_text)
+
+     if not body_text:
+         tqdm.write("[!] Could not separate the body text. Returning the full original text.")
+         return header_text
+
+     tqdm.write(f"[*] Summarizing the identified body text ({len(body_text)} characters)...")
+
+     try:
+         # Pass the flag down to the LLM creator
+         llm = create_llm(model, client, disable_qwen_thinking=disable_qwen_thinking)
+     except ValueError as e:
+         return f"Error: {e}"
+
+     body_summary = ""
+
+     tqdm.write("[*] Attempting high-speed 'stuff' summarization strategy for the body text...")
+     try:
+         stuff_prompt_template = """
+ # INSTRUCTION
+ You are a senior editor. Your task is to read the following body text of a research paper and synthesize it into a single, coherent, and detailed summary.
+ This summary needs to cover all the essential aspects of the provided text.
+
+ # PAPER BODY TEXT:
+ ---
+ {text}
+ ---
+
+ # YOUR DETAILED SYNTHESIZED SUMMARY:
+ """
+         STUFF_PROMPT = PromptTemplate(template=stuff_prompt_template, input_variables=["text"])
+         stuff_chain = load_summarize_chain(llm, chain_type="stuff", prompt=STUFF_PROMPT, verbose=True)
+
+         docs = [Document(page_content=body_text)]
+         body_summary = await stuff_chain.arun(docs)
+         tqdm.write("[✓] 'Stuff' strategy for the body text was successful!")
+
+     except BadRequestError as e:
+         if "context_length_exceeded" not in str(e).lower() and "maximum context length" not in str(e).lower() and "context length" not in str(e).lower():
+             tqdm.write(f"[!] Unexpected API error with 'stuff' strategy: {e}")
+             return f"Error: API call failed - {e}"
+         tqdm.write("[!] Body text is too long for the 'stuff' strategy. Falling back to 'map_reduce'.")
+
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=30000,
+             chunk_overlap=3000
+         )
+         chunks = text_splitter.split_text(body_text)
+         docs = [Document(page_content=t) for t in chunks]
+         tqdm.write(f"[*] Body text has been split into {len(docs)} chunks for summarization.")
+
+         map_prompt_template = """
+ # INSTRUCTION
+ You are a research analyst. Your task is to read the following text segment from a scientific paper and generate a concise summary.
+ Focus only on the most critical information: the research question, the proposed method, key results, and the main conclusion.
+ The language must be refined and to the point.
+
+ # TEXT SEGMENT:
+ ---
+ {text}
+ ---
+
+ # YOUR CONCISE SUMMARY:
+ """
+         MAP_PROMPT = PromptTemplate(template=map_prompt_template, input_variables=["text"])
+
+         combine_prompt_template = """
+ # INSTRUCTION
+ You are a senior editor. You have received several summaries extracted from different parts of the same research paper.
+ Your task is to synthesize these summaries into a single, coherent final summary.
+
+ # LIST OF SUMMARIES:
+ ---
+ {text}
+ ---
+
+ # YOUR SYNTHESIZED FINAL DETAILED SUMMARY:
+ """
+         COMBINE_PROMPT = PromptTemplate(template=combine_prompt_template, input_variables=["text"])
+
+         map_reduce_chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=MAP_PROMPT, combine_prompt=COMBINE_PROMPT, verbose=True)
+
+         try:
+             body_summary = await map_reduce_chain.arun(docs)
+             tqdm.write("[✓] 'Map-Reduce' summarization for the body text is complete.")
+         except Exception as chain_error:
+             tqdm.write(f"[!] 'Map-Reduce' chain execution failed: {chain_error}")
+             return f"Error: 'Map-Reduce' summarization failed - {chain_error}"
+
+     except Exception as e:
+         tqdm.write(f"[!] An unknown error occurred during the summarization process: {e}")
+         return f"Error: Summarization failed - {e}"
+
+     final_text = f"{header_text}\n\n[--- Body Summary ---]\n\n{body_summary}"
+     return final_text
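
The summarizer is length-gated: anything at or under `SUMMARIZATION_THRESHOLD` (4000 characters) passes through untouched, and longer texts keep their header verbatim while only the body is summarized. The header split itself needs no API key, so it can be sanity-checked in isolation (the text below is a made-up fragment):

from pragent.backend.text_processor import split_text_by_structure

text = "A Paper Title\nJane Doe\nAbstract\nWe study X in depth.\n1. Introduction\nRecent work..."
header, body = split_text_by_structure(text)
# header -> title, authors, and abstract; body -> "1. Introduction\nRecent work..." onward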
pragent/backend/yolo.py ADDED
@@ -0,0 +1,51 @@
+ # yolo.py
+ import os
+ from PIL import Image
+ from doclayout_yolo import YOLOv10
+ from tqdm.asyncio import tqdm
+ CLASS_NAMES = {
+     0: "title",
+     1: "plain_text",
+     2: "abandon",
+     3: "figure",
+     4: "figure_caption",
+     5: "table",
+     6: "table_caption_above",
+     7: "table_caption_below",
+     8: "formula",
+     9: "formula_caption",
+ }
+
+ def extract_and_save_layout_components(image_path, model_path, save_base_dir="./cropped_results", imgsz=1024, conf=0.2, device="cpu"):
+     """
+     Extracts document layout components from an image and saves each crop by class.
+
+     Args:
+         image_path (str): Path to the input image
+         model_path (str): Path to the model weights (.pt)
+         save_base_dir (str): Root directory for the saved crops
+         imgsz (int): Input image size (the image is rescaled to this size)
+         conf (float): Confidence threshold for detection boxes
+         device (str): Compute device to use, e.g. 'cuda:0' or 'cpu'
+     """
+     model = YOLOv10(model_path)
+     image = Image.open(image_path)
+     det_results = model.predict(image_path, imgsz=imgsz, conf=conf, device=device)
+
+     result = det_results[0]
+     boxes = result.boxes.xyxy.cpu().tolist()
+     classes = result.boxes.cls.cpu().tolist()
+     scores = result.boxes.conf.cpu().tolist()
+
+     for idx, (box, cls_id, score) in enumerate(zip(boxes, classes, scores)):
+         cls_id = int(cls_id)
+         class_name = CLASS_NAMES.get(cls_id, f"cls{cls_id}")
+         save_dir = os.path.join(save_base_dir, class_name)
+         os.makedirs(save_dir, exist_ok=True)
+         x1, y1, x2, y2 = map(int, box)
+         cropped = image.crop((x1, y1, x2, y2))
+         if cropped.mode == 'RGBA':
+             cropped = cropped.convert('RGB')
+         save_path = os.path.join(save_dir, f"{class_name}_{idx}_score{score:.2f}.jpg")
+         cropped.save(save_path)
+     tqdm.write(f"Saved {len(boxes)} crops, organized by class under {save_base_dir}/")
pragent/logo/logo.png ADDED
pragent/model/doclayout_yolo_docstructbench_imgsz1024.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a2ee0220fe3d9ad31b47e1d9f1282f46959a54e4618fce9cffcc9715b8286e2
+ size 40709302
requirements.txt ADDED
@@ -0,0 +1,29 @@
+ aiofiles==24.1.0
+ arxiv==2.2.0
+ beautifulsoup4==4.13.5
+ bert_score==0.3.13
+ doclayout_yolo==0.0.4
+ PyMuPDF  # provides the "fitz" module the code imports; the PyPI package "fitz==0.0.1.dev2" is an unrelated stub
+ gradio
+ langchain==0.3.27
+ langchain_openai==0.3.33
+ lxml  # parser backend for the BeautifulSoup(..., "lxml") calls in pdf2html.py and html2txt.py
+ matplotlib==3.10.6
+ numpy==2.3.3
+ openai==1.108.1
+ pandas==2.3.2
+ pdfplumber==0.11.7
+ Pillow==11.3.0
+ prettytable==3.16.0
+ pydantic==2.11.9
+ pytesseract==0.3.13
+ python-dotenv==1.1.1
+ PyYAML==6.0.2
+ rouge_score==0.1.2
+ scipy==1.16.2
+ seaborn==0.13.2
+ simpledorff==0.0.2
+ tiktoken==0.11.0
+ tqdm==4.67.1
+ huggingface_hub
+ sentence-transformers