yzweak committed on
Commit ec3d86e · 0 Parent(s)

Initial commit

.gitattributes ADDED
@@ -0,0 +1 @@
+ *.pt filter=lfs diff=lfs merge=lfs -text
.vscode/settings.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "python-envs.defaultEnvManager": "ms-python.python:conda",
+     "python-envs.defaultPackageManager": "ms-python.python:conda",
+     "python-envs.pythonProjects": []
+ }
README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ title: AutoPR
+ emoji: 🚀
+ colorFrom: blue
+ colorTo: green
+ sdk: gradio
+ sdk_version: 4.29.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ # PRAgent: A Modular Agentic Framework for Paper-to-PR Conversion
+
+ As the volume of peer-reviewed research surges, scholars increasingly rely on social platforms for discovery, while authors invest significant effort in promotion to sustain visibility and citations. This project addresses that challenge.
+
+ To that end, we developed **PRAgent**, a modular agentic framework that automatically transforms research papers into promotional posts optimized for specific social media platforms.
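
For reference, here is a minimal local-run sketch. This is hypothetical usage, not part of this commit; it assumes the Space's dependencies (`gradio`, `openai`, `pytesseract`, etc.) are installed and mirrors the first two steps of `process_pdf()` in `app.py`:

```python
# Hypothetical local usage sketch -- not part of this commit.
# The Gradio UI itself starts with:  python app.py
import asyncio

from pragent.backend.text_pipeline import pipeline as run_text_extraction
from pragent.backend.figure_table_pipeline import run_figure_extraction

async def main() -> None:
    # "paper.pdf" is a placeholder input path.
    await run_text_extraction("paper.pdf", "paper.txt")       # Step 1: plain-text extraction
    paired_dir = run_figure_extraction("paper.pdf", ".work")  # Step 2: figure/caption pairing
    print("Paired figure/caption assets in:", paired_dir)

asyncio.run(main())
```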
app.py ADDED
@@ -0,0 +1,506 @@
+ # app.py
+
+ import gradio as gr
+ import asyncio
+ import os
+ import shutil
+ from pathlib import Path
+ import time
+ import json
+ import traceback
+ from typing import List, Dict, Tuple, Optional
+
+ from pragent.backend.text_pipeline import pipeline as run_text_extraction
+ from pragent.backend.figure_table_pipeline import run_figure_extraction
+ from pragent.backend.blog_pipeline import generate_text_blog, generate_final_post
+ from pragent.backend.agents import setup_client, call_text_llm_api
+
+ import base64
+ import mimetypes
+ import re
+
+
+ # --- New module: formats a Markdown post into structured JSON ---
+
+ FORMAT_PROMPT_TEMPLATE = '''
+ You are an expert in structuring social media content. Your task is to convert a post written in Markdown format into a structured JSON format. The JSON structure depends on the target platform.
+
+ **Platform:** {platform}
+ **Markdown Content:**
+ ---
+ {markdown_text}
+ ---
+
+ **Instructions:**
+ {platform_instructions}
+ '''
+
+ TWITTER_INSTRUCTIONS = '''
+ Convert the content into a JSON array representing a Twitter thread. Each element in the array is a tweet object.
+ - Each tweet object must have a "text" key. The text should be plain text, without any Markdown formatting (e.g., no `*`, `#`, `[]()`).
+ - If a tweet is associated with an image, add an "image_index" key with the corresponding zero-based index from the provided asset list. For example, if the first image in the Markdown `![...](img_0.png)` is used, its index is 0.
+ - Ensure the thread flows logically. Split the text into multiple tweets if necessary.
+
+ **Asset List (for reference):**
+ {asset_list}
+
+ **JSON Output Format:**
+ [
+     {{ "text": "Text of the first tweet.", "image_index": 0 }},
+     {{ "text": "Text of the second tweet." }},
+     {{ "text": "Text of the third tweet.", "image_index": 1 }}
+ ]
+ '''
+
+ XIAOHONGSHU_INSTRUCTIONS = '''
+ Convert the content into a single JSON object for a Xiaohongshu post.
+ - The JSON object must have a "title" key. Extract the main title from the Markdown (usually the first H1/H2 heading). The title should be plain text.
+ - The JSON object must have a "body" key containing the main text content, with emojis. The body text should be plain text, without any Markdown formatting (e.g., no `*`, `#`, `[]()`).
+ - The JSON object must have an "image_indices" key, which is an array of all image indexes used in the post, in the order they appear.
+
+ **Asset List (for reference):**
+ {asset_list}
+
+ **JSON Output Format:**
+ {{
+     "title": "Your Catchy Title Here",
+     "body": "The full body text of the post...",
+     "image_indices": [0, 1, 2, 3]
+ }}
+ '''
+
+ def image_to_base64(path: str) -> str:
+     """Read an image file and convert it to a Base64 data-URL string."""
+     try:
+         # Guess the MIME type from the file path
+         mime_type, _ = mimetypes.guess_type(path)
+         if mime_type is None:
+             if path.lower().endswith(".png"):
+                 mime_type = "image/png"
+             else:
+                 mime_type = "image/jpeg"
+
+         with open(path, "rb") as image_file:
+             encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+         return f"data:{mime_type};base64,{encoded_string}"
+     except Exception as e:
+         print(f"[!] Error converting image to base64: {e}")
+         return ""
+
+ LOGO_PATH = "pragent/logo/logo.png"
+ LOGO_BASE64 = ""
+ if os.path.exists(LOGO_PATH):
+     LOGO_BASE64 = image_to_base64(LOGO_PATH)
+ else:
+     print(f"[!] Warning: Logo file not found at {LOGO_PATH}")
+
+ async def format_post_for_display(
+     markdown_text: str,
+     assets: Optional[List[Dict]],
+     platform: str,
+     client,
+     model: str
+ ) -> Optional[Dict]:
+     """
+     Use an LLM to convert a Markdown-formatted post into structured JSON for display in the UI.
+     """
+     if platform == 'twitter':
+         instructions = TWITTER_INSTRUCTIONS
+     elif platform == 'xiaohongshu':
+         instructions = XIAOHONGSHU_INSTRUCTIONS
+     else:
+         return None
+
+     asset_str = "No assets provided."
+     if assets:
+         asset_str = "\n".join([f"- Index {i}: {asset['dest_name']}" for i, asset in enumerate(assets)])
+
+     prompt = FORMAT_PROMPT_TEMPLATE.format(
+         platform=platform.capitalize(),
+         markdown_text=markdown_text,
+         platform_instructions=instructions.format(asset_list=asset_str),
+     )
+
+     system_prompt = "You are a content formatting expert. Output only valid JSON."
+     response_str = ""
+     try:
+         response_str = await call_text_llm_api(client, system_prompt, prompt, model)
+         json_str = None
+
+         match = re.search(r"```(?:json)?\s*([\s\S]+?)\s*```", response_str)
+         if match:
+             json_str = match.group(1)
+         else:
+             json_str = response_str
+         return json.loads(json_str.strip())
+
+     except Exception as e:
+         print(f"[!] Error formatting post for display: {e}")
+         traceback.print_exc()
+         return None
+
+
+ # --- Gradio UI rendering helpers ---
+
+ def render_twitter_thread(thread_data: List[Dict], assets: List[str]) -> str:
+     html_parts = []
+     for i, tweet in enumerate(thread_data):
+         text_html = tweet.get("text", "").replace("\n", "<br>")
+         image_html = ""
+         if "image_index" in tweet and tweet["image_index"] < len(assets):
+             img_idx = tweet["image_index"]
+             img_path = assets[img_idx]
+             base64_string = image_to_base64(img_path)
+             image_html = f'<div class="tweet-image-container"><img src="{base64_string}" class="tweet-image"></div>'
+
+         tweet_html = f'''
+         <div class="tweet-row">
+             <div class="avatar-container">
+                 <img src="{LOGO_BASE64}" class="avatar">
+             </div>
+             <div class="tweet-content">
+                 <div class="user-info">
+                     <strong>PRAgent</strong> <span>@pr_agent</span>
+                 </div>
+                 <div class="tweet-text">{text_html}</div>
+                 {image_html}
+             </div>
+         </div>
+         '''
+         html_parts.append(tweet_html)
+
+     return "".join(html_parts)
+
+ def render_xiaohongshu_post(post_data: Dict, assets: List[str]) -> str:
+     """V6 - Final Version: Returns ONLY pure HTML structure."""
+     title_html = f"<h2 class='xhs-title'>{post_data.get('title', '')}</h2>"
+     body_text = post_data.get('body', '').replace('\n', '<br>')
+     body_html = f"<div class='xhs-body'>{body_text}</div>"
+
+     gallery_html = ""
+     if "image_indices" in post_data and post_data["image_indices"]:
+         image_indices = post_data["image_indices"]
+         # Fix: Remove duplicate indices to prevent the carousel showing duplicate images, while preserving order.
+         unique_indices = list(dict.fromkeys(image_indices))
+         valid_assets = [assets[i] for i in unique_indices if i < len(assets)]
+
+         if valid_assets:
+             # We still need a unique ID for the observer to find it
+             carousel_id = f"carousel_{int(time.time() * 1000)}"
+
+             slides_html = ""
+             for i, img_path in enumerate(valid_assets):
+                 base64_string = image_to_base64(img_path)
+                 slides_html += f'<div class="carousel-slide"><div class="carousel-numbertext">{i + 1} / {len(valid_assets)}</div><img src="{base64_string}"></div>'
+
+             arrows_html = ""
+             if len(valid_assets) > 1:
+                 arrows_html = '<a class="prev">&#10094;</a><a class="next">&#10095;</a>'
+
+             gallery_html = f'<div class="carousel-container" id="{carousel_id}">{slides_html}{arrows_html}</div>'
+
+     return f"{gallery_html}{title_html}{body_html}"
+
+ # --- Main processing flow ---
+
+ async def process_pdf(
+     pdf_file,
+     text_api_key,
+     vision_api_key,
+     base_url,
+     text_model,
+     vision_model,
+     platform,
+     language,
+     progress=gr.Progress(track_tqdm=True)
+ ):
+     # Use text_api_key for vision_api_key if it's not provided
+     vision_api_key = vision_api_key or text_api_key
+
+     if not all([pdf_file, text_api_key, vision_api_key, base_url, text_model, vision_model, platform, language]):
+         raise gr.Error("Please fill in all required fields and upload a PDF.")
+
+     work_dir = None
+     try:
+         # 1. Create a temporary working directory
+         session_id = f"session_{int(time.time())}"
+         work_dir = Path(".temp_output") / session_id
+         work_dir.mkdir(parents=True, exist_ok=True)
+
+         pdf_path = Path(work_dir) / Path(pdf_file.name).name
+         shutil.copy(pdf_file.name, pdf_path)
+         final_assets = []
+
+         # Step 1: extract text
+         yield gr.update(value="🚀 **Processing...** Please wait.", visible=True), gr.update(value="", visible=False), gr.update(visible=False)
+         progress(0.1, desc="Step 1/5: Extracting text from PDF...")
+         txt_output_path = work_dir / "paper.txt"
+         await run_text_extraction(str(pdf_path), str(txt_output_path))
+         if not txt_output_path.exists():
+             raise gr.Error("Failed to extract text from the PDF.")
+
+         # Step 2: extract figures
+         progress(0.3, desc="Step 2/5: Extracting figures from PDF...")
+         extraction_work_dir = work_dir / "figure_extraction"
+         extraction_work_dir.mkdir()
+         paired_dir = run_figure_extraction(str(pdf_path), str(extraction_work_dir))
+         if not paired_dir or not any(Path(paired_dir).iterdir()):
+             raise gr.Error("Failed to extract any figures from the PDF.")
+
+         # Step 3: generate the structured draft
+         progress(0.5, desc="Step 3/5: Generating structured text draft...")
+         blog_draft, source_paper_text = await generate_text_blog(
+             txt_path=str(txt_output_path),
+             api_key=text_api_key,
+             text_api_base=base_url,
+             model=text_model,
+             language=language
+         )
+         if not blog_draft or blog_draft.startswith("Error:"):
+             raise gr.Error(f"Failed to generate blog draft: {blog_draft}")
+
+         # Step 4: generate the final Markdown with figure analysis
+         progress(0.7, desc="Step 4/5: Generating final post with vision analysis...")
+         final_post_md, assets_info = await generate_final_post(
+             blog_draft=blog_draft,
+             source_paper_text=source_paper_text,
+             assets_dir=paired_dir,
+             text_api_key=text_api_key,
+             vision_api_key=vision_api_key,
+             text_api_base=base_url,
+             vision_api_base=base_url,
+             text_model=text_model,
+             vision_model=vision_model,
+             platform=platform,
+             language=language,
+             post_format='rich'
+         )
+         if not final_post_md or final_post_md.startswith("Error:"):
+             raise gr.Error(f"Failed to generate final post: {final_post_md}")
+
+         # Save the final Markdown and images into a separate "post" folder for zipping
+         post_content_dir = work_dir / "post"
+         post_content_dir.mkdir()
+
+         if assets_info:
+             for asset in assets_info:
+                 # Copy the image into post_content_dir
+                 dest_path = post_content_dir / Path(asset['src_path']).name
+                 shutil.copy(asset['src_path'], dest_path)
+                 # The path for rendering needs to be the absolute path to the copied file
+                 absolute_path_str = str(dest_path.resolve()).replace('\\', '/')
+                 final_assets.append(absolute_path_str)
+
+         # Save the Markdown
+         (post_content_dir / "post.md").write_text(final_post_md, encoding='utf-8')
+
+         # Step 5: format as JSON
+         progress(0.9, desc="Step 5/5: Formatting for rich display...")
+         async with setup_client(text_api_key, base_url) as client:
+             structured_data = await format_post_for_display(
+                 final_post_md, assets_info, platform, client, text_model
+             )
+         if not structured_data:
+             raise gr.Error("Failed to format post for display.")
+
+         # Save structured_data
+         (post_content_dir / "post.json").write_text(json.dumps(structured_data, indent=2, ensure_ascii=False), encoding='utf-8')
+
+         # Render the final UI
+         if platform == 'twitter':
+             final_html = render_twitter_thread(structured_data, final_assets)
+         else:  # xiaohongshu
+             final_html = render_xiaohongshu_post(structured_data, final_assets)
+
+         # Create the ZIP archive
+         zip_filename_base = f"PRAgent_post_{platform}_{session_id}"
+         zip_path = shutil.make_archive(
+             base_name=str(work_dir / zip_filename_base),
+             format='zip',
+             root_dir=str(work_dir),
+             base_dir="post"
+         )
+
+         # Use gr.update(value=...) to update the gr.HTML component
+         yield gr.update(value="✅ **Done!**"), gr.update(value=final_html, visible=True), gr.update(value=zip_path, visible=True)
+
+     except Exception as e:
+         traceback.print_exc()
+         error_html = f"<h2>Error:</h2><pre>{traceback.format_exc()}</pre>"
+         yield gr.update(value=f"❌ An error occurred: {e}"), gr.update(value=error_html, visible=True), gr.update(visible=False)
+     finally:
+         # Cleanup is disabled to prevent race conditions with Gradio's reloader
+         # and to allow inspection of generated files.
+         pass
+         # if work_dir and work_dir.exists():
+         #     shutil.rmtree(work_dir)
+
+
+ # --- Gradio app UI definition ---
+
+ # Custom CSS
+ CUSTOM_CSS = '''
+ /* --- Twitter Style --- */
+ .tweet-row {
+     display: flex;
+     align-items: flex-start;
+     padding: 16px;
+     border: 1px solid #e1e8ed;
+     border-radius: 15px;
+     margin-bottom: 12px;
+     background-color: #ffffff;
+ }
+ .avatar-container {
+     flex-shrink: 0;
+     margin-right: 12px;
+ }
+ .avatar {
+     width: 48px;
+     height: 48px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+ .tweet-content {
+     width: 100%;
+ }
+ .user-info {
+     font-size: 15px;
+     font-weight: bold;
+ }
+ .user-info span {
+     color: #536471;
+     font-weight: normal;
+ }
+ .tweet-text {
+     font-size: 15px;
+     line-height: 1.5;
+     color: #0f1419;
+     margin-top: 4px;
+     word-wrap: break-word;
+ }
+ .tweet-image-container {
+     margin-top: 12px;
+ }
+ .tweet-image {
+     width: 100%;
+     border-radius: 15px;
+     border: 1px solid #ddd;
+     display: block;
+ }
+
+ /* --- Xiaohongshu Style --- */
+ .xhs-title { font-size: 20px; font-weight: bold; color: #333; margin-bottom: 10px; }
+ .xhs-body { font-size: 16px; line-height: 1.8; color: #555; word-wrap: break-word; }
+
+ #output_container {
+     border: 2px dashed #ccc;
+     padding: 20px;
+     min-height: 100px;
+     border-radius: 15px;
+ }
+ .carousel-container { position: relative; max-width: 100%; margin: auto; overflow: hidden; border-radius: 10px; }
+ .carousel-slide { display: none; animation: fade 0.5s ease-in-out; }
+ .carousel-slide:first-child { display: block; }
+ .carousel-slide img { width: 100%; display: block; }
+ .prev, .next { cursor: pointer; position: absolute; top: 50%; width: auto; padding: 16px; margin-top: -22px; color: white; font-weight: bold; font-size: 20px; transition: 0.3s ease; border-radius: 0 3px 3px 0; user-select: none; background-color: rgba(0,0,0,0.3); }
+ .next { right: 0; border-radius: 3px 0 0 3px; }
+ .prev:hover, .next:hover { background-color: rgba(0,0,0,0.6); }
+ .carousel-numbertext { color: #f2f2f2; font-size: 12px; padding: 8px 12px; position: absolute; top: 0; background-color: rgba(0,0,0,0.5); border-radius: 0 0 5px 0; }
+ @keyframes fade { from {opacity: .4} to {opacity: 1} }
+ '''
+
+ ACTIVATE_CAROUSEL_JS = '''
+ () => {
+     // Use a small 100ms delay to ensure Gradio has finished updating the HTML DOM
+     setTimeout(() => {
+         const container = document.getElementById('output_container');
+         if (container) {
+             const carousel = container.querySelector('.carousel-container');
+             // Check if a carousel exists and hasn't been initialized yet
+             if (carousel && !carousel.dataset.initialized) {
+                 console.log("PRAgent Carousel Script: JS listener has found and is activating the carousel ->", carousel.id);
+
+                 let slideIndex = 1;
+                 const slides = carousel.getElementsByClassName("carousel-slide");
+                 const prevButton = carousel.querySelector(".prev");
+                 const nextButton = carousel.querySelector(".next");
+                 if (slides.length === 0) return;
+
+                 const showSlides = () => {
+                     if (slideIndex > slides.length) { slideIndex = 1; }
+                     if (slideIndex < 1) { slideIndex = slides.length; }
+                     for (let i = 0; i < slides.length; i++) {
+                         slides[i].style.display = "none";
+                     }
+                     slides[slideIndex - 1].style.display = "block";
+                 };
+
+                 if (prevButton) {
+                     prevButton.addEventListener('click', () => { slideIndex--; showSlides(); });
+                 }
+                 if (nextButton) {
+                     nextButton.addEventListener('click', () => { slideIndex++; showSlides(); });
+                 }
+
+                 showSlides(); // Show the first slide
+                 carousel.dataset.initialized = 'true'; // Mark as initialized to prevent re-activation
+             }
+         }
+     }, 100);
+ }
+ '''
+
+ with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
+     gr.Markdown("# 🚀 PRAgent: Paper to Social Media Post")
+     gr.Markdown("Upload a research paper PDF, and I will generate a social media post for Twitter or Xiaohongshu, complete with images and platform-specific styling.")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             pdf_upload = gr.File(label="Upload PDF Paper", file_types=[".pdf"])
+
+             with gr.Accordion("Advanced Settings", open=True):
+                 text_api_key_input = gr.Textbox(label="Text API Key", type="password", placeholder="Required: sk-...")
+                 vision_api_key_input = gr.Textbox(label="Vision API Key (Optional)", type="password", placeholder="Optional: If not provided, Text API Key will be used")
+                 base_url_input = gr.Textbox(label="API Base URL")
+                 text_model_input = gr.Textbox(label="Text Model")
+                 vision_model_input = gr.Textbox(label="Vision Model")
+
+             platform_select = gr.Radio(["twitter", "xiaohongshu"], label="Target Platform", value="twitter")
+             language_select = gr.Radio([("English", "en"), ("Chinese", "zh")], label="Language", value="en")
+
+             generate_btn = gr.Button("✨ Generate Post", variant="primary")
+
+         with gr.Column(scale=2):
+             status_text = gr.Markdown("Idle. Please upload a file and click generate.", visible=True)
+             output_container = gr.HTML(elem_id="output_container")
+             download_button = gr.File(label="Download Post & Images", visible=False)
+
+     # Bind the button click event
+     click_event = generate_btn.click(
+         fn=process_pdf,
+         inputs=[
+             pdf_upload,
+             text_api_key_input,
+             vision_api_key_input,
+             base_url_input,
+             text_model_input,
+             vision_model_input,
+             platform_select,
+             language_select
+         ],
+         outputs=[status_text, output_container, download_button]
+     )
+
+     # Chain a .then() event that runs the JavaScript once the previous event succeeds
+     click_event.then(
+         fn=None,  # no Python function needs to run here
+         inputs=None,
+         outputs=None,
+         js=ACTIVATE_CAROUSEL_JS  # keep the JS in its own event
+     )
+
+ if __name__ == "__main__":
+     # Create the hidden temp directory
+     Path(".temp_output").mkdir(exist_ok=True)
+     demo.launch()
pragent/backend/__init__.py ADDED
File without changes
pragent/backend/agents.py ADDED
@@ -0,0 +1,176 @@
+ # agents.py
+
+ import base64
+ from openai import AsyncOpenAI
+ from contextlib import asynccontextmanager
+ from typing import List, Dict, AsyncIterator, Optional, Any, Tuple
+ from tqdm.asyncio import tqdm
+ import tiktoken
+
+
+ def _prepare_extra_body(model_name: str, disable_qwen_thinking: bool) -> Optional[Dict[str, Any]]:
+     if "qwen3" in model_name.lower() and disable_qwen_thinking:
+         tqdm.write("[*] 'disable_thinking' mode enabled for the Qwen3 model.")
+         return {"chat_template_kwargs": {"enable_thinking": False}}
+     return None
+
+ @asynccontextmanager
+ async def setup_client(api_key: str, base_url: str) -> AsyncIterator[AsyncOpenAI]:
+     """Create and properly dispose of the API client via an async context manager."""
+     client = None
+     if not api_key or "sk-" not in api_key:
+         tqdm.write("[!] Error: API key is invalid or not set.")
+         yield None
+         return
+
+     try:
+         tqdm.write("[*] Initializing API client...")
+         client = AsyncOpenAI(api_key=api_key, base_url=base_url, timeout=300.0)
+         yield client
+     except Exception as e:
+         tqdm.write(f"[!] Error initializing the AsyncOpenAI client: {e}")
+         yield None
+     finally:
+         if client:
+             tqdm.write("[*] Closing API client connection...")
+             await client.close()
+             tqdm.write("[*] API client closed.")
+
+ def encode_image_to_base64(image_path: str) -> str:
+     """Read an image file and return its Base64-encoded contents."""
+     try:
+         with open(image_path, "rb") as image_file:
+             return base64.b64encode(image_file.read()).decode('utf-8')
+     except Exception as e:
+         tqdm.write(f"[!] Failed to encode image {image_path}: {e}")
+         return ""
+
+
+ async def call_text_llm_api(local_client: AsyncOpenAI, system_prompt: str, user_prompt: str, model: str, disable_qwen_thinking: bool = False) -> str:
+     """Asynchronously call a text-only LLM API."""
+     if not local_client: return "Error: API client is not configured."
+     try:
+         extra_body = _prepare_extra_body(model, disable_qwen_thinking)
+         completion = await local_client.chat.completions.create(
+             model=model,
+             messages=[
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt}
+             ],
+             extra_body=extra_body  # apply extra_body
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         return f"Error: text API call failed - {e}"
+
+
+ async def call_multimodal_llm_api(local_client: AsyncOpenAI, system_prompt: str, user_prompt_parts: list, model: str, disable_qwen_thinking: bool = False) -> str:
+     """Asynchronously call a multimodal LLM API."""
+     if not local_client: return "Error: API client is not configured."
+     try:
+         extra_body = _prepare_extra_body(model, disable_qwen_thinking)
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_prompt_parts}
+         ]
+         completion = await local_client.chat.completions.create(
+             model=model,
+             messages=messages,
+             max_tokens=2048,
+             extra_body=extra_body  # apply extra_body
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         return f"Error: multimodal API call failed - {e}"
+
+ class BlogGeneratorAgent:
+
+     def __init__(self, prompt_template: str, model: str):
+         self.prompt_template = prompt_template
+         self.model = model
+         self.system_prompt = "You are a top-tier science and technology blogger and popular science writer."
+
+     async def run(self, local_client: AsyncOpenAI, paper_text: str, disable_qwen_thinking: bool = False) -> str:
+         user_prompt = self.prompt_template.format(paper_text=paper_text)
+         return await call_text_llm_api(local_client, self.system_prompt, user_prompt, self.model, disable_qwen_thinking)
+
+ class FigureDescriberAgent:
+     def __init__(self, model: str):
+         self.model = model
+         self.system_prompt = "You are an expert academic analyst. Your task is to provide a detailed explanation of the provided image, using its original caption as context. Describe what the figure shows, what its main takeaway is, and how it supports the paper's argument. Be clear, comprehensive, and ready for a blog post."
+
+     async def run(self, local_client: AsyncOpenAI, figure_path: str, caption_path: str, disable_qwen_thinking: bool = False) -> str:
+         base64_figure = encode_image_to_base64(figure_path)
+         base64_caption_img = encode_image_to_base64(caption_path)
+         if not all([base64_figure, base64_caption_img]):
+             return "Error: could not encode one or more images."
+
+         user_prompt = [
+             {"type": "text", "text": "Please analyze this figure and its accompanying caption. Provide a detailed, blog-ready description."},
+             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_figure}", "detail": "high"}},
+             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_caption_img}", "detail": "low"}}
+         ]
+         return await call_multimodal_llm_api(local_client, self.system_prompt, user_prompt, self.model, disable_qwen_thinking)
+
+ class BlogIntegratorAgent:
+     def __init__(self, prompt_template: str, model: str):
+         self.prompt_template = prompt_template
+         self.model = model
+         self.system_prompt = "You are a master science communicator and blogger. Your task is to transform a dry academic text into an engaging blog post, weaving in figures and tables to tell a compelling story."
+
+     async def run(self, local_client: AsyncOpenAI, blog_text: str, items_with_descriptions: List[Dict], source_text: str, disable_qwen_thinking: bool = False) -> str:
+         items_list_str = []
+         for i, item in enumerate(items_with_descriptions):
+             placeholder = f"[FIGURE_PLACEHOLDER_{i}]"
+             description = item['description']
+             items_list_str.append(f"### Figure {i} (Placeholder: {placeholder})\n**Type**: {item['type']}\n**Description**: {description}\n---")
+
+         user_prompt = self.prompt_template.format(
+             source_text=source_text,
+             blog_text=blog_text,
+             items_list_str="\n".join(items_list_str)
+         )
+         return await call_text_llm_api(local_client, self.system_prompt, user_prompt, self.model, disable_qwen_thinking)
+
+
+ async def call_text_llm_api_with_token_count(
+     local_client: AsyncOpenAI,
+     system_prompt: str,
+     user_prompt: str,
+     model: str,
+     disable_qwen_thinking: bool = False
+ ) -> Tuple[str, int]:
+     """
+     Calls the text LLM API and returns the content and the 'think' token count.
+     """
+     if not local_client:
+         return "Error: API client is not configured.", 0
+     try:
+         params = {
+             "model": model,
+             "messages": [
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt}
+             ]
+         }
+         extra_body = _prepare_extra_body(model, disable_qwen_thinking)
+         if extra_body:
+             params["extra_body"] = extra_body
+
+         completion = await local_client.chat.completions.create(**params)
+
+         content = completion.choices[0].message.content or ""
+         reasoning_content = getattr(completion.choices[0].message, 'reasoning_content', None)
+
+         think_token_count = 0
+         if reasoning_content and isinstance(reasoning_content, str):
+             try:
+                 encoding = tiktoken.encoding_for_model(model)
+             except KeyError:
+                 encoding = tiktoken.get_encoding("cl100k_base")
+             think_token_count = len(encoding.encode(reasoning_content))
+
+         return content, think_token_count
+
+     except Exception as e:
+         return f"Error: text API call failed - {e}", 0
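
A brief usage note: `setup_client` yields `None` on a bad key or failed initialization, so callers should guard before use. A minimal sketch of composing these helpers (the endpoint, key, and model name below are placeholders, not part of this commit):

```python
import asyncio

from pragent.backend.agents import setup_client, call_text_llm_api

async def demo() -> None:
    # Placeholder key/endpoint/model; setup_client rejects keys without "sk-".
    async with setup_client("sk-placeholder", "https://api.example.com/v1") as client:
        if client is None:
            return  # invalid key or client initialization failure
        reply = await call_text_llm_api(
            client,
            "You are a concise assistant.",           # system prompt
            "Summarize self-attention in one line.",  # user prompt
            "gpt-4o",                                 # placeholder model name
        )
        print(reply)

asyncio.run(demo())
```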
pragent/backend/blog_pipeline.py ADDED
@@ -0,0 +1,401 @@
+ # pragent/backend/blog_pipeline.py
+
+ from tqdm.asyncio import tqdm
+ import asyncio
+ from pathlib import Path
+ from typing import Tuple, List, Dict, Optional
+ from openai import AsyncOpenAI
+ import re
+ import os
+ import json
+ # ADDED FOR OCR & CACHE SAFETY: New imports for OCR
+ import pytesseract
+ from PIL import Image
+
+ from pragent.backend.agents import setup_client, BlogGeneratorAgent, FigureDescriberAgent, BlogIntegratorAgent, call_text_llm_api, call_text_llm_api_with_token_count
+ from pragent.backend.data_loader import load_plain_text, load_paired_image_paths
+ from pragent.backend.text_processor import summarize_long_text
+ from .prompts import (
+     TEXT_GENERATOR_PROMPT, TEXT_GENERATOR_PROMPT_CHINESE,
+     TWITTER_RICH_TEXT_PROMPT_ENGLISH, TWITTER_TEXT_ONLY_PROMPT_ENGLISH,
+     TWITTER_RICH_TEXT_PROMPT_CHINESE, TWITTER_TEXT_ONLY_PROMPT_CHINESE,
+     XIAOHONGSHU_PROMPT_ENGLISH, XIAOHONGSHU_PROMPT_CHINESE,
+     XIAOHONGSHU_TEXT_ONLY_PROMPT_ENGLISH, XIAOHONGSHU_TEXT_ONLY_PROMPT_CHINESE,
+     BASELINE_PROMPT_ENGLISH, BASELINE_PROMPT_CHINESE,
+     GENERIC_RICH_PROMPT_CHINESE, GENERIC_RICH_PROMPT_ENGLISH,
+     GENERIC_TEXT_ONLY_PROMPT_CHINESE, GENERIC_TEXT_ONLY_PROMPT_ENGLISH,
+     BASELINE_FEWSHOT_PROMPT_ENGLISH, BASELINE_FEWSHOT_PROMPT_CHINESE
+ )
+
+ TOKEN_THRESHOLD = 8000
+
+ PROMPT_MAPPING = {
+     ('twitter', 'rich', 'en'): TWITTER_RICH_TEXT_PROMPT_ENGLISH,
+     ('twitter', 'text_only', 'en'): TWITTER_TEXT_ONLY_PROMPT_ENGLISH,
+     ('twitter', 'rich', 'zh'): TWITTER_RICH_TEXT_PROMPT_CHINESE,
+     ('twitter', 'text_only', 'zh'): TWITTER_TEXT_ONLY_PROMPT_CHINESE,
+     ('xiaohongshu', 'rich', 'en'): XIAOHONGSHU_PROMPT_ENGLISH,
+     ('xiaohongshu', 'rich', 'zh'): XIAOHONGSHU_PROMPT_CHINESE,
+     ('xiaohongshu', 'text_only', 'en'): XIAOHONGSHU_TEXT_ONLY_PROMPT_ENGLISH,
+     ('xiaohongshu', 'text_only', 'zh'): XIAOHONGSHU_TEXT_ONLY_PROMPT_CHINESE,
+     ('generic', 'rich', 'en'): GENERIC_RICH_PROMPT_ENGLISH,
+     ('generic', 'text_only', 'en'): GENERIC_TEXT_ONLY_PROMPT_ENGLISH,
+     ('generic', 'rich', 'zh'): GENERIC_RICH_PROMPT_CHINESE,
+     ('generic', 'text_only', 'zh'): GENERIC_TEXT_ONLY_PROMPT_CHINESE,
+ }
+
+
+ # ADDED FOR OCR & CACHE SAFETY: Asynchronous OCR helper function
+ async def ocr_image_to_text(image_path: str) -> str:
+     """
+     Performs OCR on an image file to extract text asynchronously.
+     """
+     if not Path(image_path).exists():
+         return ""
+     try:
+         # pytesseract is a blocking library, so we run it in a thread pool
+         loop = asyncio.get_running_loop()
+         text = await loop.run_in_executor(
+             None,
+             lambda: pytesseract.image_to_string(Image.open(image_path))
+         )
+         return text.strip()
+     except Exception as e:
+         tqdm.write(f"[!] OCR failed for {image_path}: {e}")
+         return ""
+
+
+ async def generate_text_blog(
+     txt_path: str, api_key: str, text_api_base: str, model: str, language: str,
+     disable_qwen_thinking: bool = False, ablation_mode: str = "none"
+ ) -> Tuple[str, str]:
+     """
+     Generates a structured, factual blog DRAFT in the specified language. (Stage 1)
+     """
+     async with setup_client(api_key, text_api_base) as client:
+         if not client:
+             return "Error: API client configuration failed.", None
+
+         paper_text = await load_plain_text(txt_path)
+         if not paper_text:
+             return "Error: Could not load text file.", None
+
+         text_for_generation = ""
+         if len(paper_text) > TOKEN_THRESHOLD:
+             if ablation_mode == 'no_hierarchical_summary':
+                 tqdm.write(f"[*] ABLATION (no_hierarchical_summary): Truncating text to {TOKEN_THRESHOLD} characters.")
+                 text_for_generation = paper_text[:TOKEN_THRESHOLD]
+             else:
+                 summarized_text = await summarize_long_text(
+                     paper_text,
+                     model,
+                     client,
+                     disable_qwen_thinking=disable_qwen_thinking
+                 )
+                 if summarized_text.startswith("Error:"):
+                     summarized_text = paper_text[:TOKEN_THRESHOLD]
+                 text_for_generation = summarized_text
+         else:
+             text_for_generation = paper_text
+
+         if ablation_mode in ['no_logical_draft', 'stage2']:
+             ablation_reason = "no_logical_draft" if ablation_mode != 'stage2' else 'stage2'
+             tqdm.write(f"[*] ABLATION ({ablation_reason}): Skipping structured draft generation.")
+             return text_for_generation, text_for_generation
+
+         draft_prompt = TEXT_GENERATOR_PROMPT_CHINESE if language == 'zh' else TEXT_GENERATOR_PROMPT
+         generator = BlogGeneratorAgent(draft_prompt, model)
+         generated_blog_draft = await generator.run(
+             client,
+             text_for_generation,
+             disable_qwen_thinking=disable_qwen_thinking
+         )
+         return generated_blog_draft, text_for_generation
+
+
+ async def generate_final_post(
+     blog_draft: str,
+     source_paper_text: str,
+     assets_dir: Optional[str],
+     text_api_key: str,
+     vision_api_key: str,
+     text_api_base: str,
+     vision_api_base: str,
+     vision_model: str,
+     text_model: str,
+     platform: str,
+     language: str,
+     post_format: str,
+     description_cache_dir: Optional[str] = None,
+     pdf_hash: Optional[str] = None,
+     disable_qwen_thinking: bool = False,
+     ablation_mode: str = "none"
+ ) -> Optional[Tuple[str, Optional[List[Dict]]]]:
+     effective_platform = platform
+     if ablation_mode == 'no_platform_adaptation':
+         tqdm.write(f"[*] ABLATION (no_platform_adaptation): Using generic prompts instead of '{platform}' specific prompts.")
+         effective_platform = 'generic'
+
+     prompt_format = 'rich' if post_format == 'description_only' else post_format
+     prompt_key = (effective_platform, prompt_format, language)
+     selected_prompt = PROMPT_MAPPING.get(prompt_key)
+
+     if not selected_prompt:
+         tqdm.write(f"[!] Warning: No prompt found for configuration: {prompt_key}. Falling back to generic prompt.")
+         generic_fallback_key = ('generic', prompt_format, language)
+         selected_prompt = PROMPT_MAPPING.get(generic_fallback_key)
+         if not selected_prompt:
+             return f"Error: No prompt found for configuration: {prompt_key} or generic fallback.", None
+
+     tqdm.write(f"\n--- Generating final post for: Platform='{effective_platform}', Format='{post_format}', Language='{language}' ---")
+
+     items_with_descriptions = []
+     if post_format in ['rich', 'description_only'] and assets_dir and Path(assets_dir).is_dir():
+         all_items = load_paired_image_paths(Path(assets_dir))
+         all_items = all_items[:50]  # Limit to first 50 items to avoid overloading the model
+         if all_items:
+             cache_file_path = None
+             if description_cache_dir and pdf_hash:
+                 sanitized_model_name = re.sub(r'[\\/:"*?<>|]', '_', vision_model)
+                 cache_dir = Path(description_cache_dir) / pdf_hash
+                 cache_dir.mkdir(parents=True, exist_ok=True)
+                 cache_file_path = cache_dir / f"{sanitized_model_name}.json"
+
+             if cache_file_path and cache_file_path.exists() and ablation_mode not in ['no_visual_analysis', 'stage2']:
+                 tqdm.write(f"[✓] Cache hit! Loading all descriptions from {cache_file_path}")
+                 with cache_file_path.open('r', encoding='utf-8') as f:
+                     items_with_descriptions = json.load(f)
+
+             else:
+                 # MODIFIED: Trigger this ablation also for 'stage2'
+                 if ablation_mode in ['no_visual_analysis', 'stage2']:
+                     ablation_reason = "no_visual_analysis" if ablation_mode != 'stage2' else 'stage2'
+                     tqdm.write(f"[*] ABLATION ({ablation_reason}): Using OCR on caption images instead of vision model.")
+                     temp_items_with_desc = []
+
+                     ocr_tasks = [ocr_image_to_text(item['caption_path']) for item in all_items]
+                     ocr_results = await asyncio.gather(*ocr_tasks)
+
+                     for i, item in enumerate(all_items):
+                         caption_content = ocr_results[i]
+                         if caption_content:
+                             item['description'] = caption_content
+                             temp_items_with_desc.append(item)
+                     items_with_descriptions = temp_items_with_desc
+                 else:
+                     # Full pipeline: use vision model
+                     tqdm.write(f"--- Cache miss. Describing {len(all_items)} new figures using model '{vision_model}'... ---")
+                     async with setup_client(vision_api_key, vision_api_base) as vision_client:
+                         if not vision_client:
+                             return "Error: Vision API client configuration failed.", None
+
+                         describer = FigureDescriberAgent(model=vision_model)
+                         description_tasks = [
+                             describer.run(
+                                 vision_client,
+                                 item['item_path'],
+                                 item['caption_path'],
+                                 disable_qwen_thinking=disable_qwen_thinking
+                             ) for item in all_items
+                         ]
+                         descriptions = await asyncio.gather(*description_tasks)
+
+                     temp_items_with_desc = []
+                     for i, item in enumerate(all_items):
+                         if not descriptions[i].startswith("Error:"):
+                             item['description'] = descriptions[i]
+                             temp_items_with_desc.append(item)
+                     items_with_descriptions = temp_items_with_desc
+
+                 # MODIFIED: Prevent caching for 'stage2' as well
+                 if cache_file_path and ablation_mode not in ['no_visual_analysis', 'stage2']:
+                     tqdm.write(f"[*] Saving all descriptions to cache file: {cache_file_path}")
+                     with cache_file_path.open('w', encoding='utf-8') as f:
+                         json.dump(items_with_descriptions, f, ensure_ascii=False, indent=4)
+                 elif cache_file_path and ablation_mode in ['no_visual_analysis', 'stage2']:
+                     ablation_reason = "no_visual_analysis" if ablation_mode != 'stage2' else 'stage2'
+                     tqdm.write(f"[*] ABLATION ({ablation_reason}): Description caching is disabled for this mode to avoid saving OCR results.")
+
+     items_with_descriptions = items_with_descriptions[:20]
+     if post_format in ['rich', 'description_only'] and not items_with_descriptions:
+         return f"Error: '{post_format}' format requires images, but none were found/described.", None
+
+     async with setup_client(text_api_key, text_api_base) as text_client:
+         if not text_client: return "Error: Text API client configuration failed.", None
+
+         if ablation_mode in ['no_visual_integration', 'stage2'] and post_format in ['rich', 'description_only']:
+             ablation_reason = "no_visual_integration" if ablation_mode != 'stage2' else 'stage2'
+             tqdm.write(f"[*] ABLATION ({ablation_reason}): Generating text first, then appending all figures at the end.")
+
+             integrator = BlogIntegratorAgent(selected_prompt, model=text_model)
+             text_only_post = await integrator.run(
+                 local_client=text_client,
+                 blog_text=blog_draft,
+                 items_with_descriptions=[],
+                 source_text=source_paper_text,
+                 disable_qwen_thinking=disable_qwen_thinking
+             )
+
+             if not text_only_post or text_only_post.startswith("Error:"):
+                 return f"Blog integration failed for text-only part: {text_only_post}", None
+
+             final_blog_content = text_only_post
+             assets_for_packaging = []
+             for i, item_data in enumerate(items_with_descriptions):
+                 if post_format == 'rich':
+                     new_asset_filename = f"img_{i}{Path(item_data['item_path']).suffix}"
+                     alt_text = f"Figure {i}"
+                     new_markdown_tag = f"\n\n![{alt_text}](./img/{new_asset_filename})"
+                     assets_for_packaging.append({'src_path': item_data['item_path'], 'dest_name': new_asset_filename, 'new_index': i})
+                     final_blog_content += new_markdown_tag
+                 elif post_format == 'description_only':
+                     alt_text_description = item_data.get('description', f'Figure {i}').strip().replace('\n', ' ')
+                     new_markdown_tag = f"\n\n![{alt_text_description}]()"
+                     final_blog_content += new_markdown_tag
+
+             return final_blog_content, assets_for_packaging if assets_for_packaging else None
+
+         integrator = BlogIntegratorAgent(selected_prompt, model=text_model)
+         final_post_with_placeholders = await integrator.run(
+             local_client=text_client,
+             blog_text=blog_draft,
+             items_with_descriptions=items_with_descriptions,
+             source_text=source_paper_text,
+             disable_qwen_thinking=disable_qwen_thinking
+         )
+
+         if not final_post_with_placeholders or final_post_with_placeholders.startswith("Error:"):
+             return f"Blog integration failed: {final_post_with_placeholders}", None
+
+         found_indices = re.findall(r'\[FIGURE_PLACEHOLDER_(\d+)\]', final_post_with_placeholders)
+         final_blog_content = final_post_with_placeholders
+         assets_for_packaging = []
+
+         if found_indices:
+             items_map = {i: item for i, item in enumerate(items_with_descriptions)}
+             for new_index, original_index_str in enumerate(found_indices):
+                 original_index = int(original_index_str)
+                 item_data = items_map.get(original_index)
+                 if not item_data: continue
+
+                 placeholder_to_replace = f"[FIGURE_PLACEHOLDER_{original_index}]"
+
+                 if post_format == 'rich':
+                     new_asset_filename = f"img_{new_index}{Path(item_data['item_path']).suffix}"
+                     alt_text = f"Figure {new_index}"
+                     new_markdown_tag = f"![{alt_text}](./img/{new_asset_filename})"
+                     assets_for_packaging.append({'src_path': item_data['item_path'], 'dest_name': new_asset_filename, 'new_index': new_index})
+                 elif post_format == 'description_only':
+                     alt_text_description = item_data.get('description', f'Figure {new_index}').strip().replace('\n', ' ')
+                     new_markdown_tag = f"![{alt_text_description}]()"
+                 else:
+                     new_markdown_tag = ""
+                 final_blog_content = final_blog_content.replace(placeholder_to_replace, new_markdown_tag, 1)
+
+         final_blog_content = re.sub(r'\[FIGURE_PLACEHOLDER_(\d+)\]', '', final_blog_content)
+
+         if post_format == 'rich':
+             return final_blog_content, assets_for_packaging
+         else:
+             return final_blog_content, None
+
+
+ async def generate_baseline_post(
+     paper_text: str,
+     api_key: str,
+     api_base: str,
+     model: str,
+     platform: str,
+     language: str,
+     disable_qwen_thinking: bool = False,
+     mode: str = 'original',
+     assets_dir: Optional[str] = None
+ ) -> Tuple[str, List[Dict], int]:
+     """
+     Generates a post using a simple, single-prompt baseline method.
+     """
+     tqdm.write(f"\n--- Generating baseline post (mode: {mode}) for: Platform='{platform}', Language='{language}' ---")
+
+     async with setup_client(api_key, api_base) as client:
+         if not client:
+             return "Error: API client configuration failed.", [], 0
+
+         if mode == 'fewshot':
+             prompt_template = BASELINE_FEWSHOT_PROMPT_CHINESE if language == 'zh' else BASELINE_FEWSHOT_PROMPT_ENGLISH
+         else:
+             prompt_template = BASELINE_PROMPT_CHINESE if language == 'zh' else BASELINE_PROMPT_ENGLISH
+
+         user_prompt = prompt_template.format(paper_text=paper_text[:20000], platform=platform.capitalize())
+         system_prompt = "You are an assistant that summarizes academic papers for social media."
+
+         text_post, think_token_count = await call_text_llm_api_with_token_count(
+             local_client=client,
+             system_prompt=system_prompt,
+             user_prompt=user_prompt,
+             model=model,
+             disable_qwen_thinking=disable_qwen_thinking
+         )
+
+         if text_post.startswith("Error:"):
+             return text_post, [], think_token_count
+
+         final_post = text_post
+         assets_for_packaging = []
+         if mode == 'with_figure' and assets_dir and Path(assets_dir).is_dir():
+             tqdm.write("[*] Attaching top 3 figures/tables for 'with_figure' baseline...")
+
+             paired_item_dirs = [
+                 d for d in Path(assets_dir).rglob('paired_*')
+                 if d.is_dir() and (d.name.startswith('paired_figure_') or d.name.startswith('paired_table_'))
+             ]
+
+             def get_global_sort_key(dir_path: Path):
+                 page_num = -1
+                 item_type = ''
+                 item_index = -1
+
+                 try:
+                     page_match = re.search(r'page_(\d+)', dir_path.parts[-2])
+                     if page_match:
+                         page_num = int(page_match.group(1))
+                 except (IndexError, ValueError):
+                     pass
+
+                 item_match = re.search(r'paired_(figure|table)_(\d+)', dir_path.name)
+                 if item_match:
+                     item_type = item_match.group(1)
+                     item_index = int(item_match.group(2))
+
+                 return (page_num, item_index)
+
+             sorted_dirs = sorted(paired_item_dirs, key=get_global_sort_key)
+
+             all_items = []
+
+             for item_dir in sorted_dirs:
+                 item_type = 'figure' if 'figure' in item_dir.name else 'table'
+
+                 item_file = next(
+                     (f for f in item_dir.iterdir() if f.is_file() and f.name.startswith(item_type) and 'caption' not in f.name),
+                     None
+                 )
+                 if item_file:
+                     all_items.append(item_file)
+
+             selected_items = all_items[:3]
+
+             if selected_items:
+                 final_post += "\n\n--- Key Figures & Tables ---\n"
+                 for i, item_path in enumerate(selected_items):
+                     new_asset_filename = f"img_{i}{item_path.suffix}"
+                     alt_text = "Table" if "table" in item_path.parent.name else "Figure"
+                     alt_text += f" {i+1}"
+
+                     final_post += f"\n![{alt_text}](./img/{new_asset_filename})"
+                     assets_for_packaging.append({'src_path': str(item_path), 'dest_name': new_asset_filename})
+                 tqdm.write(f"[✓] Appended {len(selected_items)} items (figures/tables) to the post.")
+             else:
+                 tqdm.write("[!] Warning: 'with_figure' mode was selected, but no paired items were found.")
+
+         return final_post, assets_for_packaging, think_token_count
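
The two main entry points above are chained by `app.py`; a condensed sketch of that flow follows (paths, key, endpoint, and model names are placeholders, not part of this commit):

```python
import asyncio

from pragent.backend.blog_pipeline import generate_text_blog, generate_final_post

async def demo() -> None:
    base = "https://api.example.com/v1"  # placeholder endpoint
    # Stage 1: structured, factual draft from extracted plain text
    draft, source_text = await generate_text_blog(
        txt_path="paper.txt", api_key="sk-placeholder",
        text_api_base=base, model="gpt-4o", language="en",
    )
    # Stage 2: platform-adapted post with figures woven in
    post_md, assets = await generate_final_post(
        blog_draft=draft, source_paper_text=source_text,
        assets_dir="paired_results",  # output of run_figure_extraction
        text_api_key="sk-placeholder", vision_api_key="sk-placeholder",
        text_api_base=base, vision_api_base=base,
        text_model="gpt-4o", vision_model="gpt-4o",
        platform="twitter", language="en", post_format="rich",
    )
    print(post_md)

asyncio.run(demo())
```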
pragent/backend/data_loader.py ADDED
@@ -0,0 +1,53 @@
+ # data_loader.py
+ import asyncio
+ import aiofiles
+ from pathlib import Path
+ import re
+ from typing import List, Dict
+ from tqdm.asyncio import tqdm
+
+ async def load_plain_text(txt_path: str) -> str:
+     """Asynchronously load plain-text content from a .txt file."""
+     try:
+         async with aiofiles.open(txt_path, mode='r', encoding='utf-8') as f:
+             return await f.read()
+     except Exception as e:
+         tqdm.write(f"[!] Error reading text file '{txt_path}': {e}")
+         return ""
+
+ def load_paired_image_paths(base_dir: Path) -> List[Dict]:
+     """
+     Recursively scan 'paired_*' folders and load the paths of each main image and its caption image.
+     """
+     items = []
+     if not base_dir.is_dir():
+         tqdm.write(f"[!] Error: base folder of pairing results not found: {base_dir}")
+         return items
+
+     tqdm.write(f"[*] Recursively loading image/caption pairs from {base_dir}...")
+
+     item_dirs = sorted(
+         [d for d in base_dir.rglob('paired_*') if d.is_dir()],
+         key=lambda p: p.name
+     )
+
+     for item_dir in item_dirs:
+         item_files = list(item_dir.glob('*.jpg'))
+         if len(item_files) < 2:
+             continue
+
+         main_item_path, caption_path = None, None
+         for f in item_files:
+             if "caption" in f.name:
+                 caption_path = f
+             else:
+                 main_item_path = f
+
+         if main_item_path and caption_path:
+             items.append({
+                 "type": "figure" if "figure" in item_dir.name else "table",
+                 "item_path": str(main_item_path.resolve()),
+                 "caption_path": str(caption_path.resolve()),
+             })
+
+     tqdm.write(f"[*] Loading complete; found {len(items)} image/caption pairs.")
+     return items
pragent/backend/figure_table_pipeline.py ADDED
@@ -0,0 +1,118 @@
+ # figure_table_pipeline.py
+ import os
+ import shutil
+ import re
+ from pathlib import Path
+ from collections import defaultdict
+ from pragent.backend.loader import ImagePDFLoader
+ from pragent.backend.yolo import extract_and_save_layout_components
+ from tqdm.asyncio import tqdm
+
+ def run_figure_extraction(pdf_path: str, base_work_dir: str) -> str:
+     """
+     A complete pipeline that extracts and pairs figures/tables from a PDF.
+     This is the main function called by app.py.
+
+     Args:
+         pdf_path (str): Path to the PDF uploaded by the user.
+         base_work_dir (str): Temporary working directory for this session.
+
+     Returns:
+         str: Directory path of the final pairing results, or None on failure.
+     """
+     if not all([ImagePDFLoader, extract_and_save_layout_components]):
+         tqdm.write("[!] Error: one or more core dependencies of figure_pipeline failed to load.")
+         return None
+
+     pdf_file = Path(pdf_path)
+     pdf_stem = pdf_file.stem
+     model_path = "pragent/model/doclayout_yolo_docstructbench_imgsz1024.pt"
+
+     tqdm.write(f"\n--- Step 1/3: Converting PDF '{pdf_file.name}' to images ---")
+     page_save_dir = os.path.join(base_work_dir, "page_paper", pdf_stem)
+     os.makedirs(page_save_dir, exist_ok=True)
+     try:
+         loader = ImagePDFLoader(pdf_path)
+         page_image_paths = []
+         for i, img in enumerate(loader.load()):
+             path = os.path.join(page_save_dir, f"page_{i+1}.png")
+             img.save(path)
+             page_image_paths.append(path)
+         tqdm.write(f"[*] All {len(page_image_paths)} pages saved to: {page_save_dir}")
+     except Exception as e:
+         tqdm.write(f"[!] Error: failed to load or convert the PDF: {e}")
+         return None
+
+     tqdm.write("\n--- Step 2/3: Analyzing page layout to crop figures and tables ---")
+     cropped_results_dir = os.path.join(base_work_dir, "cropped_results", pdf_stem)
+     for path in page_image_paths:
+         page_num_str = Path(path).stem
+         page_crop_dir = os.path.join(cropped_results_dir, page_num_str)
+         extract_and_save_layout_components(image_path=path, model_path=model_path, save_base_dir=page_crop_dir)
+     tqdm.write(f"[*] All cropped results saved to: {cropped_results_dir}")
+
+     tqdm.write("\n--- Step 3/3: Pairing the cropped components ---")
+     final_paired_dir = os.path.join(base_work_dir, "paired_results", pdf_stem)
+     run_pairing_process(cropped_results_dir, final_paired_dir, threshold=30)
+
+     if os.path.isdir(final_paired_dir):
+         return final_paired_dir
+     return None
+
+ def run_pairing_process(source_dir_str: str, output_dir_str: str, threshold: int):
+     """Pairing logic, now part of the pipeline."""
+     source_dir = Path(source_dir_str)
+     output_root_dir = Path(output_dir_str)
+     if output_root_dir.exists():
+         shutil.rmtree(output_root_dir)
+     output_root_dir.mkdir(parents=True, exist_ok=True)
+
+     tqdm.write(f"Starting nearest-neighbor pairing process (threshold = {threshold})")
+
+     page_dirs = sorted([d for d in source_dir.iterdir() if d.is_dir() and d.name.startswith('page_')])
+     for page_dir in page_dirs:
+         output_page_dir = output_root_dir / page_dir.name
+         output_page_dir.mkdir(exist_ok=True)
+         pair_items_on_page(str(page_dir), str(output_page_dir), threshold)
+
+ def pair_items_on_page(page_dir: str, output_dir: str, threshold: int):
+     """Process a single page directory with nearest-neighbor pairing."""
+     organized_files = defaultdict(dict)
+     component_types = ["figure", "figure_caption", "table", "table_caption_above", "table_caption_below"]
+
+     def parse_filename(filename: str):
+         match = re.match(r'([a-zA-Z_]+)_(\d+)_score([\d.]+)\.jpg', filename)
+         return (match.group(1), int(match.group(2))) if match else (None, None)
+
+     for comp_type in component_types:
+         comp_dir = os.path.join(page_dir, comp_type)
+         if os.path.isdir(comp_dir):
+             for filename in os.listdir(comp_dir):
+                 _, index = parse_filename(filename)
+                 if index is not None:
+                     organized_files[comp_type][index] = os.path.join(comp_dir, filename)
+
+     paired_files, used_captions = set(), defaultdict(set)
+
+     for item_type, cap_types in [("figure", ["figure_caption"]), ("table", ["table_caption_above", "table_caption_below"])]:
+         for item_index, item_path in organized_files[item_type].items():
+             best_match = {'min_diff': float('inf'), 'cap_path': None, 'cap_index': -1, 'cap_type': ''}
+             for cap_type in cap_types:
+                 for cap_index, cap_path in organized_files[cap_type].items():
+                     if cap_index in used_captions[cap_type]:
+                         continue
+                     diff = abs(item_index - cap_index)
+                     if diff < best_match['min_diff']:
+                         best_match.update({'min_diff': diff, 'cap_path': cap_path, 'cap_index': cap_index, 'cap_type': cap_type})
+
+             if best_match['cap_path'] and best_match['min_diff'] <= threshold:
+                 target_dir = os.path.join(output_dir, f"paired_{item_type}_{item_index}")
+                 os.makedirs(target_dir, exist_ok=True)
+                 shutil.copy(item_path, target_dir)
+                 shutil.copy(best_match['cap_path'], target_dir)
+                 paired_files.add(item_path)
+                 paired_files.add(best_match['cap_path'])
+                 used_captions[best_match['cap_type']].add(best_match['cap_index'])
+
+     for files_dict in organized_files.values():
+         for file_path in files_dict.values():
+             if file_path not in paired_files:
+                 item_type, index = parse_filename(Path(file_path).name)
+                 if item_type:
+                     target_dir = os.path.join(output_dir, f"unpaired_{item_type}_{index}")
+                     os.makedirs(target_dir, exist_ok=True)
+                     shutil.copy(file_path, target_dir)
pragent/backend/html2txt.py ADDED
@@ -0,0 +1,30 @@
+ # html2txt.py
+ from bs4 import BeautifulSoup
+ import sys
+ import aiofiles
+ from tqdm.asyncio import tqdm
+
+ async def convert_html_to_txt(html_file_path: str, output_txt_path: str) -> bool:
+     try:
+         async with aiofiles.open(html_file_path, 'r', encoding='utf-8') as f:
+             html_from_file = await f.read()
+     except FileNotFoundError:
+         tqdm.write(f"[!] Error: Intermediate HTML file not found '{html_file_path}'.", file=sys.stderr)
+         return False
+     except Exception as e:
+         tqdm.write(f"[!] Error reading HTML file: {e}", file=sys.stderr)
+         return False
+
+     soup = BeautifulSoup(html_from_file, "lxml")
+     paragraphs = soup.find_all('p')
+
+     extracted_lines = [p.get_text(separator=" ", strip=True) for p in paragraphs if p.get_text(strip=True)]
+     tqdm.write(f"[*] Text extraction complete, found {len(extracted_lines)} valid lines of text.")
+
+     try:
+         full_text_content = "\n".join(extracted_lines)
+         async with aiofiles.open(output_txt_path, 'w', encoding='utf-8') as f:
+             await f.write(full_text_content)
+         return True
+     except Exception as e:
+         tqdm.write(f"[!] Error writing to TXT file: {e}", file=sys.stderr)
+         return False
pragent/backend/loader.py ADDED
@@ -0,0 +1,25 @@
+ # loader.py
+ import fitz
+ from PIL import Image
+ from typing import List
+ from tqdm.asyncio import tqdm
+
+ class ImagePDFLoader:
+     def __init__(self, file_path: str, dpi: int = 250):
+         self.file_path = file_path
+         self.dpi = dpi
+
+     def load(self) -> List[Image.Image]:
+         images = []
+         try:
+             doc = fitz.open(self.file_path)
+             for page in doc:
+                 zoom_matrix = fitz.Matrix(self.dpi / 72, self.dpi / 72)
+                 pix = page.get_pixmap(matrix=zoom_matrix, alpha=False)
+                 if pix.width > 0 and pix.height > 0:
+                     image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                     images.append(image)
+             doc.close()
+         except Exception as e:
+             tqdm.write(f"Error during PDF processing: {e}")
+             return []
+         return images
pragent/backend/pdf2html.py ADDED
@@ -0,0 +1,38 @@
+ # pdf2html.py
+ import fitz
+ from pathlib import Path
+ import sys
+ from bs4 import BeautifulSoup
+ import asyncio
+ import aiofiles
+ from tqdm.asyncio import tqdm
+ def convert_pdf_sync(pdf_path: str) -> str:
+     try:
+         doc = fitz.open(pdf_path)
+         tqdm.write(f"[*] Successfully opened PDF file: {pdf_path}")
+     except Exception as e:
+         tqdm.write(f"[!] Error: Could not open PDF file. {e}", file=sys.stderr)
+         return ""
+     full_html_content = ""
+     for page in doc:
+         full_html_content += page.get_text("html")
+     doc.close()
+     soup = BeautifulSoup(full_html_content, "lxml")
+     for img_tag in soup.find_all("img"):
+         img_tag.decompose()
+
+     return soup.prettify()
+
+ async def convert_pdf_to_text_only_html(pdf_path: str, output_path: str) -> bool:
+     cleaned_html = await asyncio.to_thread(convert_pdf_sync, pdf_path)
+     if not cleaned_html:
+         return False
+     try:
+         output_file = Path(output_path)
+         output_file.parent.mkdir(parents=True, exist_ok=True)
+         async with aiofiles.open(output_file, "w", encoding="utf-8") as f:
+             await f.write(cleaned_html)
+         return True
+     except Exception as e:
+         tqdm.write(f"[!] Error: Could not write HTML file. {e}", file=sys.stderr)
+         return False
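
Because `convert_pdf_sync` does blocking PyMuPDF work, the async wrapper pushes it onto a worker thread with `asyncio.to_thread` so other coroutines keep running. A standalone invocation sketch (paths are hypothetical):

import asyncio
from pragent.backend.pdf2html import convert_pdf_to_text_only_html

ok = asyncio.run(convert_pdf_to_text_only_html("paper.pdf", "out/paper.html"))
print("wrote text-only HTML" if ok else "conversion failed")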
pragent/backend/prompts.py ADDED
@@ -0,0 +1,518 @@
+ # prompts.py
+ # --- STAGE 1 PROMPT (English) ---
+ # This prompt is the foundation and remains unchanged. It creates a good, factual draft.
+ TEXT_GENERATOR_PROMPT = """
+ # Role: You are a top-tier technology analyst and industry commentator. Your articles are renowned for their depth, insight, and concise language, getting straight to the point and providing genuine value to readers.
+
+ # Task: Strictly adhere to all the requirements below to transform the provided "Original Paper Text" into a high-quality, high-density blog post in Markdown format, filled with expert-level insights.
+
+ # --- High-Quality Blog Post Example (Do Not Change This Format) ---
+
+ **Engaging Social Media Title: A Deep Dive into AI Memory, a New Survey from Huawei Noah's Ark Lab**
+
+ ✍️ **Authors**: Y. Wang, Z. Chen, et al. (from Huawei Noah's Ark Lab)
+ 📚 **Paper Title**: From Human Memory to AI Memory: A Survey on Memory Mechanisms in the Era of LLMs
+ 🌐 **Source**: arXiv:2504.15965 (Apr 23, 2025)
+
+ ---
+ *Body of the post starts here...*
+
+ 🔍 **The Research Question:** Traditional Large Language Models (LLMs) have significant limitations, especially when it comes to processing long texts and maintaining context. These constraints hinder their application in more complex tasks like multi-step reasoning, personalized dialogue, and long-term task management. While existing research offers some solutions, most only analyze memory from a temporal perspective, which is not comprehensive enough.
+
+ 💡 **Core Contributions:** To overcome these limitations, the research team proposes a novel memory taxonomy based on three dimensions—Object (individual vs. system), Form (parametric vs. non-parametric), and Time (short-term vs. long-term)—resulting in eight distinct quadrants. This framework aims to systematically understand memory in LLM-driven AI, drawing inspiration from human memory research to build more efficient systems.
+
+ 🚀 **The Key Method:** The proposed 3D-8Q memory taxonomy covers both individual and system memory, providing a detailed analysis of their form and temporal characteristics. This method allows researchers to systematically organize existing work and provides a guiding framework for future memory mechanism design.
+
+ 📊 **Key Results & Implications:** The team conducted experiments on multiple public datasets to validate the effectiveness of the 3D-8Q taxonomy. The results show that memory systems optimized with this framework demonstrate significant performance improvements in complex tasks such as multi-step reasoning, personalized dialogue, and long-term task management.
+
+ #LLM #RAG #Agent #Multimodal #LargeModels #RetrievalAugmentedGeneration
+
+ # --- Your Creative Task ---
+
+ # Core Requirements (Must Be Strictly Followed):
+
+ ## 1. Title and Authorship (for S1 Score):
+ - **Create a New Title**: Based on the original paper title, create a more engaging and accessible title for social media.
+ - **Extract Author Info**: Accurately identify and list the main authors from the "Original Paper Text". **Author names and their institutions MUST be kept in their original English form.** Use "et al." if there are too many.
+ - **Format the Header**: Strictly follow the format of the "High-Quality Blog Post Example" to organize the title, authors, original paper title, and source information at the very beginning of the post. Use the same emojis (✍️, 📚, 🌐).
+
+ ## 2. Content Structure (for S2 Score):
+ Your article must clearly contain the following core analytical modules. Do not add unnecessary sections.
+ - **The Research Question:** Precisely distill the core problem this paper aims to solve. What is the context and importance of this problem?
+ - **Core Contributions:** Clearly list the 1-2 most significant innovations or contributions of this paper. What's new here for the field?
+ - **The Key Method:** Break down the key method or core idea proposed in the paper. How does it achieve its contributions? What are the technical details?
+ - **Key Results & Implications:** What key results did the paper present to support its claims? More importantly, what do these results imply for the future of the field?
+
+ ## 3. Writing Style (for S2 & S7 Scores):
+ You must completely abandon the writing patterns of an AI assistant and adopt the perspective of a critical, analytical expert.
+ - **【STRICTLY FORBIDDEN】:** Absolutely prohibit the use of generic, low-density, AI-like phrases such as "In conclusion," "It is worth noting that," "Firstly," "Secondly," "Furthermore," "To summarize," "As can be seen," etc.
+ - **【BE CONCISE】:** Eliminate all filler words and conversational fluff. Every sentence must carry information.
+ - **【CONFIDENT & DIRECT】:** As an expert, you must state points directly and confidently. Use "The method validates..." instead of "The method seems to validate...".
+
+ ## 4. Formatting (for S8 Score):
+ - Use relevant emojis as visual guides for each core module, as shown in the example.
+ - Include relevant technical hashtags at the end of the post.
+
+ # Original Paper Text:
+ ---
+ {paper_text}
+ ---
+
+ Begin your creation. Remember, your goal is not to "imitate a human," but to "be an expert."
+ """
+
+ # --- STAGE 1 PROMPT (Chinese) ---
+ TEXT_GENERATOR_PROMPT_CHINESE = """
+ # 角色:你是一位顶尖的科技领域分析师和行业评论员。你的文章以深度、洞察力和精炼的语言著称,能够直击要点,为读者提供真正的价值。
+
+ # 任务:严格遵循以下的所有要求,将我提供的“原始论文文本”改编成一篇高质量、高信息密度、充满专家洞见的中文博客文章(Markdown格式)。
+
+ # --- 优质博客范例 (请严格遵守此格式) ---
+
+ **引人入胜的社交媒体标题:华为诺亚方舟新作,AI记忆机制的全面调查**
+
+ ✍️ **作者**: Y. Wang, Z. Chen, 等 (来自 华为诺亚方舟实验室)
+ 📚 **论文标题**: From Human Memory to AI Memory: A Survey on Memory Mechanisms in the Era of LLMs
+ 🌐 **来源**: arXiv:2504.15965 (2025年4月23日)
+
+ ---
+ *正文由此开始...*
+
+ 🔍 **研究问题:** 传统大型语言模型(LLM)在处理信息时,存在明显的局限性,尤其是在处理长文本和保持上下文连贯性方面。这些局限性限制了LLM在更广泛和复杂的任务中的应用,比如多步骤推理、个性化对话和长周期任务管理。现有的研究虽然提供了一些解决方案,但大多数只从时间维度分析了记忆机制,这显然不够全面。
+
+ 💡 **核心贡献:** 为了克服当前记忆机制的局限,研究团队提出了一种新的记忆分类法,基于对象(个人和系统)、形式(参数和非参数)和时间(短期和长期)三个维度,以及八个象限来进行系统性的分类和分析。这一分类法旨在更好地理解LLM驱动的AI系统中的记忆机制,并借鉴人类记忆的研究成果,构建更高效的记忆系统。
+
+ 🚀 **重点方法:** 本文提出的3D-8Q记忆分类法,不仅涵盖了个人记忆和系统记忆,还详细分析了记忆的形式和时间特性。通过这种方法,研究团队能够更系统地组织现有的研究工作,为未来的记忆机制设计提供指导。
+
+ 📊 **关键结果与意义:** 研究团队在多个公开数据集上进行了实验,验证了3D-8Q记忆分类法的有效性。实验结果显示,通过这种分类法优化的记忆系统在多步骤推理、个性化对话和长周期任务管理等复杂任务中表现出了显著的性能提升。
+
+ #LLM[话题]# #RAG[话题]# #agent[话题]# #multimodal[话题]# #大模型[话题]# #检索增强[话题]#
+
+ # --- 你的创作任务 ---
+
+ # 核心要求 (必须严格遵守):
+
+ ## 1. 标题与作者信息 (for S1 Score):
+ - **创作新标题**: 基于原文标题,创作一个更吸引人、更易于理解的中文社交媒体标题。
+ - **提取作者信息**: 从“原始论文文本”中准确识别并列出主要作者。**作者姓名和所属研究机构必须保留其原始英文格式,不得翻译。** 如果作者过多,可以使用“等” (et al.)。
+ - **格式化头部**: 严格按照“优质博客范例”的格式,在文章最开头组织标题、作者、原始论文标题和来源信息。使用相同的表情符号 (✍️, 📚, 🌐)。
+
+ ## 2. 内容结构 (for S2 Score):
+ 你的文章必须清晰地包含以下几个核心分析模块,不要添加不必要的章节:
+ - **研究问题:** 精准提炼这篇论文到底要解决什么核心问题?这个问题的背景和重要性是什么?
+ - **核心贡献:** 清晰地列出本文最主要的1-2个创新点或贡献。这篇论文的出现,为领域带来了什么新东西?
+ - **重点方法:** 详细拆解论文提出的关键方法或核心思路。它是如何实现其贡献的?技术细节是什么?
+ - **关键结果与意义:** 论文通过实验得到了什么关键结果来支撑其观点?更重要的是,这些结果对未来意味着什么?
+
+ ## 3. 写作风格 (for S2 & S7 Scores):
+ - **【严厉禁止】:** 绝对禁止使用“总而言之”、“值得注意的是”、“首先”、“其次”、“此外”、“综上所述”、“不难发现”这类AI常用、且降低信息密度的八股文词汇。
+ - **【精炼语言】:** 砍掉所有不必要的修饰和口语化闲聊。每一句话都应承载信息。
+ - **【自信与直接】:** 作为一个专家,你需要直接、自信地陈述观点。用“该方法验证了...”代替“该方法似乎验证了...”。
+
+ ## 4. 格式要求 (for S8 Score):
+ - 使用贴切的表情符号作为每个核心模块的视觉引导,如范例所示。
+ - 在文末附上相关的技术话题标签(Hashtags),使用 `[话题]` 格式。
+
+ # 原始论文文本:
+ ---
+ {paper_text}
+ ---
+
+ 开始你的创作。记住,你的目标不是“模仿人类”,而是“成为专家”。
+ """
+
+ # ==============================================================================
+ # --- STAGE 2 PROMPTS (FINISHERS - UNIFIED STRATEGY FOR P2 & P3 METRICS) ---
+ # ==============================================================================
+
+ # ------------------------------------------------------------------------------
+ # --- A. TWITTER (X) PROMPTS ---
+ # ------------------------------------------------------------------------------
+ TWITTER_RICH_TEXT_PROMPT_ENGLISH = """
+ # ROLE: You are an expert communicator—a researcher who can captivate both peers and the public. Your goal is to create a Twitter (X) thread that is both technically credible and excitingly viral.
+
+ # TASK: Rewrite the provided draft into a single, high-impact Twitter thread that satisfies BOTH busy professionals and curious enthusiasts.
+
+ # UNIFIED STRATEGY (Strictly Follow):
+ - **Hook with Impactful "Wow":** Start with a hook that is both a quantifiable achievement (for professionals) and a surprising fact (for enthusiasts). E.g., "Just cut model inference time by 50% with a surprisingly simple geometric trick. Here's the story: 🧵"
+ - **Intuitive Storytelling with Hard Data:** Frame the content as a story (Problem -> Insight -> Solution). Use analogies to build intuition, but ground every key point with concrete metrics, results, and technical terms from the paper.
+ - **Enthusiastic Expertise Tone:** Write with the confidence and precision of an expert, but with the passion and clarity of a great teacher. Avoid dry, academic language AND overly simplistic fluff.
+ - **Visually Informative:** Choose figures that are both information-dense (showing data, architecture) and visually clean/compelling.
+
+ # YOUR INSTRUCTIONS
+ 1. **Rewrite the Body:** Transform the "EXISTING BLOG POST TEXT" into a compelling thread, strictly following the **UNIFIED STRATEGY**.
+ 2. **Integrate Figures:** Weave the figures into the narrative where they best support a key insight or result. Place the figure placeholder on its own new line.
+ 3. **Incorporate Author/Paper Info:** Naturally integrate author and paper details. **Ensure author names and institutions remain in English.**
+ 4. **Add Engagement Elements:** End with a thought-provoking question and 3-5 hashtags that appeal to both audiences (e.g., #AI, #MachineLearning, #Innovation).
+ 5. **Output Format:** Your response must be **only** the final, ready-to-publish thread text.
+
+ # ORIGINAL SOURCE TEXT (for deep context):
+ ---
+ {source_text}
+ ---
+ # EXISTING BLOG POST TEXT (to be rewritten):
+ ---
+ {blog_text}
+ ---
+ # AVAILABLE FIGURES AND DESCRIPTIONS:
+ ---
+ {items_list_str}
+ ---
+ """
+
+ TWITTER_TEXT_ONLY_PROMPT_ENGLISH = """
+ # ROLE: You are an expert communicator—a researcher who can captivate both peers and the public. Your goal is to create a **text-only** Twitter (X) thread that is both technically credible and excitingly viral.
+
+ # TASK: Rewrite the provided draft into a single, high-impact, **text-only** Twitter thread that satisfies BOTH busy professionals and curious enthusiasts.
+
+ # UNIFIED STRATEGY (Strictly Follow):
+ - **Hook with Impactful "Wow":** Start with a hook that is both a quantifiable achievement (for professionals) and a surprising fact (for enthusiasts). E.g., "Just cut model inference time by 50% with a surprisingly simple geometric trick. Here's the story: 🧵"
+ - **Intuitive Storytelling with Hard Data:** Frame the content as a story (Problem -> Insight -> Solution). Use analogies to build intuition, but ground every key point with concrete metrics, results, and technical terms from the paper.
+ - **Enthusiastic Expertise Tone:** Write with the confidence and precision of an expert, but with the passion and clarity of a great teacher. Avoid dry, academic language AND overly simplistic fluff.
+
+ # YOUR INSTRUCTIONS
+ 1. **Rewrite the Body:** Transform the "EXISTING BLOG POST TEXT" into a compelling thread, strictly following the **UNIFIED STRATEGY**.
+ 2. **Incorporate Author/Paper Info:** Naturally integrate author and paper details. **Ensure author names and institutions remain in English.**
+ 3. **Add Engagement Elements:** End with a thought-provoking question and 3-5 hashtags that appeal to both audiences (e.g., #AI, #MachineLearning, #Innovation).
+ 4. **Output Format:** Your response must be **only** the final, ready-to-publish thread text.
+
+ # EXISTING BLOG POST TEXT (to be rewritten):
+ ---
+ {blog_text}
+ ---
+ """
+
+ TWITTER_RICH_TEXT_PROMPT_CHINESE = """
+ # 角色: 你是一位顶级的沟通专家——一个既能吸引同行又能吸引公众的研究者。你的目标是创作一个既有技术可信度又具病毒式传播潜力的推特(X平台)帖子。
+
+ # 任务: 将提供的草稿改写成一个能同时满足忙碌专业人士和好奇爱好者的高影响力推文串。
+
+ # 统一策略 (必须严格遵守):
+ - **用“惊人”的“量化”成果开场:** 开头必须一句话同时包含“可量化的成果”(吸引专业人士)和“惊人/反直觉的事实”(吸引爱好者)。例如:“我们用一个惊人简单的几何技巧,把模型推理时间砍掉一半。这背后是一个有趣的故事:🧵”
+ - **用硬核数据讲述直观故事:** 将内容构建成一个故事(问题 -> 洞察 -> 解决方案)。用类比来建立直觉,但每个关键节点都必须有论文中的具体指标、结果和技术术语作为支撑。
+ - **充满热情的专家口吻:** 以专家的自信和严谨,结合优秀老师的热情和清晰来写作。避免干巴巴的学术腔和过于简化的“废话”。
+ - **图片信息丰富且吸引人:** 选择的图片必须既信息密集(展示数据、架构),又视觉清晰、有吸引力。
+
+ # 你的指令
+ 1. **重写正文:** 严格遵循 **统一策略**,将“现有博客草稿”改写成一个引人注目的推文串。
+ 2. **整合图文:** 将图表融入叙事中,选择最能支撑关键洞察或成果的位置。将图表占位符放置在单独的新行。
+ 3. **融入作者/论文信息:** 自然地整合作者和论文信息。**确保作者姓名和单位保留其原始英文格式。**
+ 4. **添加互动元素:** 以一个引人深思的问题结尾,并附上3-5个能同时吸引两类受众的话题标签 (例如, #人工智能, #机器学习, #科技创新)。
+ 5. **输出格式:** 你的回应**只能**是最终的、可直接发布的帖子内容。
+
+ # 原始论文(供深度参考):
+ ---
+ {source_text}
+ ---
+ # 现有博客草稿(待改写):
+ ---
+ {blog_text}
+ ---
+ # 可用图表及描述:
+ ---
+ {items_list_str}
+ ---
+ """
+
+ TWITTER_TEXT_ONLY_PROMPT_CHINESE = """
+ # 角色: 你是一位顶级的沟通专家——一个既能吸引同行又能吸引公众的研究者。你的目标是创作一个既有技术可信度又具病毒式传播潜力的**纯文本**推特(X平台)帖子。
+
+ # 任务: 将提供的草稿改写成一个能同时满足忙碌专业人士和好奇爱好者的高影响力**纯文本**推文串。
+
+ # 统一策略 (必须严格遵守):
+ - **用“惊人”的“量化”成果开场:** 开头必须一句话同时包含“可量化的成果”(吸引专业人士)和“惊人/反直觉的事实”(吸引爱好者)。例如:“我们用一个惊人简单的几何技巧,把模型推理时间砍掉一半。这背后是一个有趣的故事:🧵”
+ - **用硬核数据讲述直观故事:** 将内容构建成一个故事(问题 -> 洞察 -> 解决方案)。用类比来建立直觉,但每个关键节点都必须有论文中的具体指标、结果和技术术语作为支撑。
+ - **充满热情的专家口吻:** 以专家的自信和严谨,结合优秀老师的热情和清晰来写作。避免干巴巴的学术腔和过于简化的“废话”。
+
+ # 你的指令
+ 1. **重写正文:** 严格遵循 **统一策略**,将“现有博客草稿”改写成一个引人注目的推文串。
+ 2. **融入作者/论文信息:** 自然地整合作者和论文信息。**确保作者姓名和单位保留其原始英文格式。**
+ 3. **添加互动元素:** 以一个引人深思的问题结尾,并附上3-5个能同时吸引两类受众的话题标签 (例如, #人工智能, #机器学习, #科技创新)。
+ 4. **输出格式:** 你的回应**只能**是最终的、可直接发布的帖子内容。
+
+ # 现有博客草稿(待改写):
+ ---
+ {blog_text}
+ ---
+ """
+
+ # ------------------------------------------------------------------------------
+ # --- B. XIAOHONGSHU PROMPTS ---
+ # ------------------------------------------------------------------------------
+ XIAOHONGSHU_PROMPT_ENGLISH = """
+ # ROLE: You are an expert tech content creator on Xiaohongshu. Your style is a perfect blend of a professional's "dry goods" (干货) and a science communicator's engaging storytelling.
+
+ # TASK: Transform the provided draft into a single, high-quality Xiaohongshu post that is highly valuable to BOTH industry professionals and curious tech enthusiasts.
+
+ # UNIFIED STRATEGY (Strictly Follow):
+ - **Title is an "Impactful Hook":** The title must be a compelling hook that also states the core, quantifiable achievement. E.g., "This AI paper is a must-read! 🤯 They boosted performance by 30% with one clever trick."
+ - **Narrative Structure with Clear Signposts:** Start with a story-like intro (the "why"). Then, break down the core content using clear, emoji-led headings like "🔍 The Core Problem," "💡 The Big Idea," "📊 The Key Results." This makes it scannable for professionals and easy to follow for enthusiasts.
+ - **Intuition-Building backed by Data:** Explain complex ideas using simple analogies, but immediately follow up with the key technical terms and performance metrics from the paper.
+ - **Visually Compelling and Informative Images:** Select figures that are clean and easy to understand, but also contain the key data or diagrams that a professional would want to see.
+
+ # YOUR STEP-BY-STEP EXECUTION PLAN
+ ### STEP 1: Rewrite the Post Body
+ * **Create the Title and Body:** Rewrite the entire post following the **UNIFIED STRATEGY**.
+ * **Include Author Info:** After the title, you MUST include the author, paper title, and source details. **Ensure author names and institutions remain in their original English form.**
+ * **Format for Scannability:** Use emojis, short paragraphs, and bold text to make the post visually appealing and easy to digest.
+ ### STEP 2: Select and Append Best Images
+ * **Select the 3-4 most suitable figures** that align with the **UNIFIED STRATEGY**.
+ * **Append ONLY the placeholders for these selected figures to the very end of the post.**
+ ### STEP 3: Drive Engagement
+ * **Topic Tags (#):** Add a mix of broad and specific hashtags (e.g., `#AI`, `#Tech`, `#DataScience`, `#LLM`).
+ * **Call to Action (CTA):** End with a CTA that invites discussion from everyone (e.g., "This could change so much! What do you all think? 👇").
+
+ # --- AVAILABLE ASSETS ---
+ ## 1. Structured Draft:
+ {blog_text}
+ ## 2. Available Figures and Descriptions:
+ {items_list_str}
+ # --- FINAL OUTPUT ---
+ Your final output must be **only the complete, ready-to-publish post text, with the selected image placeholders at the end**.
+ """
+
+ XIAOHONGSHU_TEXT_ONLY_PROMPT_ENGLISH = """
+ # ROLE: You are an expert tech content creator on Xiaohongshu. Your style is a perfect blend of a professional's "dry goods" (干货) and a science communicator's engaging storytelling.
+
+ # TASK: Transform the provided draft into a single, high-quality, **text-only** Xiaohongshu post that is valuable to BOTH industry professionals and curious tech enthusiasts. **DO NOT include image placeholders.**
+
+ # UNIFIED STRATEGY (Strictly Follow):
+ - **Title is an "Impactful Hook":** The title must be a compelling hook that also states the core, quantifiable achievement. E.g., "This AI paper is a must-read! 🤯 They boosted performance by 30% with one clever trick."
+ - **Narrative Structure with Clear Signposts:** Start with a story-like intro (the "why"). Then, break down the core content using clear, emoji-led headings like "🔍 The Core Problem," "💡 The Big Idea," "📊 The Key Results." This makes it scannable for professionals and easy to follow for enthusiasts.
+ - **Intuition-Building backed by Data:** Explain complex ideas using simple analogies, but immediately follow up with the key technical terms and performance metrics from the paper.
+
+ # YOUR STEP-BY-STEP EXECUTION PLAN
+ ### STEP 1: Rewrite the Post Body
+ * **Create the Title and Body:** Rewrite the entire post following the **UNIFIED STRATEGY**.
+ * **Include Author Info:** After the title, you MUST include the author, paper title, and source details. **Ensure author names and institutions remain in their original English form.**
+ * **Format for Scannability:** Use emojis, short paragraphs, and bold text to make the post visually appealing and easy to digest.
+ ### STEP 2: Drive Engagement
+ * **Topic Tags (#):** Add a mix of broad and specific hashtags (e.g., `#AI`, `#Tech`, `#DataScience`, `#LLM`).
+ * **Call to Action (CTA):** End with a CTA that invites discussion from everyone (e.g., "This could change so much! What do you all think? 👇").
+
+ # --- Structured Draft ---
+ {blog_text}
+ # --- FINAL OUTPUT ---
+ Your final output must be **only the complete, ready-to-publish text-only post**.
+ """
+
+ XIAOHONGSHU_PROMPT_CHINESE = """
+ # 角色: 你是一位顶尖的小红书科技博主,完美融合了专业人士的“干货”分享与科普作家的生动叙事。
+
+ # 任务: 将提供的草稿,改编成一篇能同时吸引行业专家和科技爱好者的高质量小红书笔记。
+
+ # 统一策略 (必须严格遵守):
+ - **标题是“有冲击力的钩子”:** 标题必须既能激发好奇心,又包含核心的、可量化的成果。例如:“这篇AI论文必读!🤯一个巧思把性能提升30%”
+ - **带有清晰路标的叙事结构:** 以故事性的“为什么”开场,然后用清晰的、表情符号引导的标题(如 🔍核心问题, 💡天才想法, 📊关键结果)来拆解核心内容。这既方便专家快速浏览,也利于爱好者跟上思路。
+ - **数据支撑下的直觉建立:** 用简单的类比解释复杂概念,但紧接着必须给出论文中的关键技术术语和性能指标。
+ - **图片既要信息量大又要吸引人:** 选择的图片要清晰易懂,同时包含专家想看的关键数据或架构图。
+
+ # 你的执行步骤
+ ### 第一步:重写笔记正文
+ * **创作标题和正文:** 严格遵循 **统一策略** 重写整个帖子。
+ * **包含作者信息:** 在标题后,**必须**包含作者、论文标题和来源等详细信息。**确保作者姓名和单位保留其原始英文格式。**
+ * **为易读性排版:** 大量使用表情符号、短段落和粗体,使笔记视觉上吸引人且易于消化。
+ ### 第二步:挑选并附加最佳图片
+ * **挑选3-4张最符合统一策略的图片。**
+ * **只将这些被选中图片的占位符,附加到笔记的最后面。**
+ ### 第三步:引导互动
+ * **话题标签:** 添加组合标签,既有宽泛的也有具体的 (例如: `#AI[话题]#`, `#黑科技[话题]#`, `#数据科学[话题]#`, `#大语言模型[话题]#`)。
+ * **行动号召:** 用一个能邀请所有人讨论的CTA结尾 (例如: “这个想法太妙了!大家怎么看?👇”)。
+
+ # --- 可用材料 ---
+ ## 1. 结构化草稿:
+ {blog_text}
+ ## 2. 可用图文及描述:
+ {items_list_str}
+ # --- 最终输出 ---
+ 你的全部回应**只能**是最终的、可直接发布的帖子内容,最后附加上被选中的图片占位符。
+ """
+
+ XIAOHONGSHU_TEXT_ONLY_PROMPT_CHINESE = """
+ # 角色: 你是一位顶尖的小红书科技博主,完美融合了专业人士的“干货”分享与科普作家的生动叙事。
+
+ # 任务: 将提供的草稿,改编成一篇能同时吸引行业专家和科技爱好者的高质量**纯文本**小红书笔记。**不要包含图片占位符。**
+
+ # 统一策略 (必须严格遵守):
+ - **标题是“有冲击力的钩子”:** 标题必须既能激发好奇心,又包含核心的、可量化的成果。例如:“这篇AI论文必读!🤯一个巧思把性能提升30%”
+ - **带有清晰路标的叙事结构:** 以故事性的“为什么”开场,然后用清晰的、表情符号引导的标题(如 🔍核心问题, 💡天才想法, 📊关键结果)来拆解核心内容。这既方便专家快速浏览,也利于爱好者跟上思路。
+ - **数据支撑下的直觉建立:** 用简单的类比解释复杂概念,但紧接着必须给出论文中的关键技术术语和性能指标。
+
+ # 你的执行步骤
+ ### 第一步:重写笔记正文
+ * **创作标题和正文:** 严格遵循 **统一策略** 重写整个帖子。
+ * **包含作者信息:** 在标题后,**必须**包含作者、论文标题和来源等详细信息。**确保作者姓名和单位保留其原始英文格式。**
+ * **为易读性排版:** 大量使用表情符号、短段落和粗体,使笔记视觉上吸引人且易于消化。
+ ### 第二步:引导互动
+ * **话题标签:** 添加组合标签,既有宽泛的也有具体的 (例如: `#AI[话题]#`, `#黑科技[话题]#`, `#数据科学[话题]#`, `#大语言模型[话题]#`)。
+ * **行动号召:** 用一个能邀请所有人讨论的CTA结尾 (例如: “这个想法太妙了!大家怎么看?👇”)。
+
+ # --- 结构化草稿 ---
+ {blog_text}
+ # --- 最终输出 ---
+ 你的全部回应**只能**是最终的、可直接发布的**纯文本**帖子内容。
+ """
+
+ # ==============================================================================
+ # --- NEW: BASELINE PROMPTS ---
+ # ==============================================================================
+
+ BASELINE_PROMPT_ENGLISH = """
+ # ROLE: You are a helpful assistant.
+
+ # TASK: Read the provided research paper text and write a brief social media post about it for the platform '{platform}'.
+
+ # RESEARCH PAPER TEXT:
+ ---
+ {paper_text}
+ ---
+
+ # YOUR SOCIAL MEDIA POST:
+ """
+
+ BASELINE_PROMPT_CHINESE = """
+ # 角色: 你是一个乐于助人的助手。
+
+ # 任务: 阅读以下提供的论文文本,并为平台 '{platform}' 撰写一篇简短的社交媒体帖子。
+
+ # 论文文本:
+ ---
+ {paper_text}
+ ---
+
+ # 你的社交媒体帖子:
+ """
+
+
+ GENERIC_RICH_PROMPT_ENGLISH = """
+ # ROLE: You are an AI assistant.
+
+ # TASK: Rewrite the following structured draft into a simple and clear social media post.
+ - The post should be easy for a general audience to understand.
+ - If figures are provided, integrate them into the text where they seem most relevant using the format `[FIGURE_PLACEHOLDER_X]`, where X is the figure number.
+ - Your output must be ONLY the final text for the post.
+
+ # EXISTING BLOG POST TEXT (to be rewritten):
+ ---
+ {blog_text}
+ ---
+ # AVAILABLE FIGURES AND DESCRIPTIONS:
+ ---
+ {items_list_str}
+ ---
+ """
+
+ GENERIC_TEXT_ONLY_PROMPT_ENGLISH = """
+ # ROLE: You are an AI assistant.
+
+ # TASK: Rewrite the following structured draft into a simple, clear, text-only social media post.
+ - The post should be easy for a general audience to understand.
+ - Your output must be ONLY the final text for the post.
+
+ # EXISTING BLOG POST TEXT (to be rewritten):
+ ---
+ {blog_text}
+ ---
+ """
+
+ GENERIC_RICH_PROMPT_CHINESE = """
+ # 角色: 你是一个AI助手。
+
+ # 任务: 将以下结构化草稿,改写成一篇简单、清晰的社交媒体帖子。
+ - 帖子内容应便于普通读者理解。
+ - 如果提供了图表信息,请在文本中最相关的位置使用 `[FIGURE_PLACEHOLDER_X]` 格式来引用它们,X是图表编号。
+ - 你的输出必须只有最终的帖子文本。
+
+ # 现有博客草稿 (待改写):
+ ---
+ {blog_text}
+ ---
+ # 可用图表及描述:
+ ---
+ {items_list_str}
+ ---
+ """
+
+ GENERIC_TEXT_ONLY_PROMPT_CHINESE = """
+ # 角色: 你是一个AI助手。
+
+ # 任务: 将以下结构化草稿,改写成一篇简单、清晰的纯文本社交媒体帖子。
+ - 帖子内容应便于普通读者理解。
+ - 你的输出必须只有最终的帖子文本。
+
+ # 现有博客草稿 (待改写):
+ ---
+ {blog_text}
+ ---
+ """
+
+
+
+ BASELINE_FEWSHOT_PROMPT_ENGLISH = """
+ # ROLE: You are a helpful assistant.
+
+ # TASK: Read the provided example and write an academic promotion social media post for the platform '{platform}'. Follow the example provided.
+
+ # --- EXAMPLE ---
+ ## PLATFORM: Twitter
+ ## Example:
+
+ I’m stoked to share our new paper: “Harnessing the Universal Geometry of Embeddings” with @jxmnop
+ , Collin Zhang, and @shmatikov.
+ We present the first method to translate text embeddings across different spaces without any paired data or encoders.
+ Here's why we're excited: 🧵👇🏾
+ --------------------------------------------------------------------------
+ 🌀 Preserving Geometry
+ Our method, vec2vec, reveals that all encoders—regardless of architecture or training data—learn nearly the same representations!
+ We demonstrate how to translate between these black-box embeddings without any paired data, maintaining high fidelity.
+ --------------------------------------------------------------------------
+ 🔐 Security Implications
+ Using vec2vec, we show that vector databases reveal (almost) as much as their inputs.
+ Given just vectors (e.g., from a compromised vector database), we show that an adversary can extract sensitive information (e.g., PII) about the underlying text.
+ --------------------------------------------------------------------------
+ 🧠 Strong Platonic Representation Hypothesis (S-PRH)
+ We thus strengthen Huh et al.'s PRH to say:
+ The universal latent structure of text representations can be learned and harnessed to translate representations from one space to another without any paired data or encoders.
+ --------------------------------------------------------------------------
+ 📄 Read the Full Paper
+ Dive into the details here: https://arxiv.org/pdf/2505.12540
+ We welcome feedback and discussion!
+
+
+ ---
+ # --- YOUR TASK ---
+
+ # RESEARCH PAPER TEXT:
+ ---
+ {paper_text}
+ ---
+
+ # YOUR SOCIAL MEDIA POST:
+ """
+
+ BASELINE_FEWSHOT_PROMPT_CHINESE = """
+ # 角色: 你是一个乐于助人的助手。
+
+ # 任务: 阅读以下提供的例子,并为平台 '{platform}' 撰写一篇宣传论文的社交媒体帖子。请参考范例。
+
+ # --- 范例 ---
+ ## 平台: 小红书
+ ## 范例:
+ 🌐arXiv ID: arXiv:2504.15965
+ 📚论文标题: From Human Memory to AI Memory: A Survey on Memory Mechanisms in the Era of LLMs
+ 🔍 问题背景:传统大型语言模型(LLM)在处理信息时,存在明显的局限性,尤其是在处理长文本和保持上下文连贯性方面。这些局限性限制了LLM在更广泛和复杂的任务中的应用,比如多步骤推理、个性化对话和长周期任务管理。现有的研究虽然提供了一些解决方案,但大多数只从时间维度分析了记忆机制,这显然不够全面。
+ 💡 研究动机:为了克服当前记忆机制的局限,研究团队提出了一种新的记忆分类法,基于对象(个人和系统)、形式(参数和非参数)和时间(短期和长期)三个维度,以及八个象限来进行系统性的分类和分析。这一分类法旨在更好地理解LLM驱动的AI系统中的记忆机制,并借鉴人类记忆的研究成果,构建更高效的记忆系统。
+ 🚀 方法简介:本文提出的3D-8Q记忆分类法,不仅涵盖了个人记忆和系统记忆,还详细分析了记忆的形式和时间特性。通过这种方法,研究团队能够更系统地组织现有的研究工作,为未来的记忆机制设计提供指导。
+ 📊 实验设计:研究团队在多个公开数据集上进行了实验,验证了3D-8Q记忆分类法的有效性。实验结果显示,通过这种分类法优化的记忆系统在多步骤推理、个性化对话和长周期任务管理等复杂任务中表现出了显著的性能提升。
+
+ #LLM[话题]# #RAG[话题]# #agent[话题]# #multimodal[话题]# #大模型[话题]# #检索增强[话题]# #多模态[话题]#
+ ---
+ # --- 你的任务 ---
+
+ # 论文文本:
+ ---
+ {paper_text}
+ ---
+
+ # 你的社交媒体帖子:
+ """
pragent/backend/text_pipeline.py ADDED
@@ -0,0 +1,42 @@
+ # pragent/backend/text_pipeline.py
+
+ import asyncio
+ import sys
+ import os
+ from pathlib import Path
+ import aiofiles.os
+ from tqdm.asyncio import tqdm
+ from pragent.backend.pdf2html import convert_pdf_to_text_only_html
+ from pragent.backend.html2txt import convert_html_to_txt
+
+ # MODIFIED FOR ABLATION STUDY: Added ablation_mode parameter
+ async def pipeline(pdf_path: str, output_txt_path: str, ablation_mode: str = "none"):
+     """
+     Defines the complete ASYNCHRONOUS conversion flow from PDF to TXT.
+     The ablation_mode parameter is accepted but the primary logic for summarization
+     ablation is handled downstream in blog_pipeline.py.
+     """
+     tqdm.write("--- PDF to TXT Conversion Pipeline Started ---")
+
+     pdf_file = Path(pdf_path)
+     intermediate_html_path = pdf_file.with_suffix(".temp.html")
+
+     tqdm.write("\n--- Step 1/3: Converting PDF to HTML ---")
+     if not await convert_pdf_to_text_only_html(pdf_path, str(intermediate_html_path)):
+         tqdm.write("[!] PDF to HTML conversion failed. Aborting pipeline.", file=sys.stderr)
+         return
+
+     tqdm.write(f"\n--- Step 2/3: Converting HTML to TXT ---")
+     if not await convert_html_to_txt(str(intermediate_html_path), output_txt_path):
+         tqdm.write("[!] HTML to TXT conversion failed. Aborting pipeline.", file=sys.stderr)
+     else:
+         tqdm.write(f"\n[✓] Success! Final text file saved to: {output_txt_path}")
+
+     tqdm.write(f"\n--- Step 3/3: Cleaning up temporary files ---")
+     try:
+         await aiofiles.os.remove(intermediate_html_path)
+         tqdm.write(f"[*] Temporary file '{intermediate_html_path.name}' deleted successfully.")
+     except OSError as e:
+         tqdm.write(f"[!] Error deleting temporary file: {e}", file=sys.stderr)
+
+     tqdm.write("\n--- Pipeline Finished ---")
pragent/backend/text_processor.py ADDED
@@ -0,0 +1,162 @@
+ # pragent/backend/text_processor.py
+ import re
+ from typing import List, Tuple
+ from langchain_openai import ChatOpenAI
+ from langchain.chains.summarize import load_summarize_chain
+ from langchain.docstore.document import Document
+ from langchain.prompts import PromptTemplate
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from openai import AsyncOpenAI, BadRequestError
+ from tqdm.asyncio import tqdm
+
+ SUMMARIZATION_THRESHOLD = 4000
+ FALLBACK_HEADER_SIZE = 3000
+
+ def create_llm(model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False):
+     """Creates a LangChain LLM object from the provided client."""
+     if not client:
+         raise ValueError("API client is not initialized.")
+
+     model_kwargs = {}
+     if "qwen3" in model.lower() and disable_qwen_thinking:
+         tqdm.write("[*] Summarizer: Enabling 'disable_thinking' for Qwen3 model.")
+         model_kwargs["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}}
+
+     return ChatOpenAI(
+         model_name=model,
+         openai_api_key=client.api_key,
+         openai_api_base=str(client.base_url),
+         model_kwargs=model_kwargs  # Pass the extra arguments here
+     )
+
+ def split_text_by_structure(long_text: str) -> Tuple[str, str]:
+     """
+     Intelligently splits the text into a "header" (title, authors, abstract) and "body".
+     It looks for keywords like "Abstract" and "Introduction" to determine the split point.
+     """
+     abstract_match = re.search(r'\bAbstract\b', long_text, re.IGNORECASE)
+     if not abstract_match:
+         tqdm.write("[!] 'Abstract' keyword not found. Falling back to fixed character count for splitting.")
+         return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]
+
+     intro_match = re.search(r'(\n\s*(\d+|I|II|III|IV|V)\.?\s*)?Introduction', long_text[abstract_match.end():], re.IGNORECASE)
+
+     if not intro_match:
+         tqdm.write("[!] 'Introduction' keyword not found after 'Abstract'. Falling back to fixed character count for splitting.")
+         return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]
+
+     split_point = abstract_match.end() + intro_match.start()
+
+     header_text = long_text[:split_point]
+     body_text = long_text[split_point:]
+
+     tqdm.write(f"[*] Successfully separated header via keywords ({len(header_text)} characters).")
+     return header_text, body_text
+
+ # --- MODIFIED: Added disable_qwen_thinking parameter ---
+ async def summarize_long_text(long_text: str, model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False) -> str:
+     """
+     Asynchronously summarizes long text using a structure-aware hybrid strategy.
+     """
+     if not long_text:
+         return ""
+
+     if len(long_text) <= SUMMARIZATION_THRESHOLD:
+         tqdm.write(f"[*] Total text length ({len(long_text)} chars) is below threshold {SUMMARIZATION_THRESHOLD}. Skipping summarization.")
+         return long_text
+
+     header_text, body_text = split_text_by_structure(long_text)
+
+     if not body_text:
+         tqdm.write("[!] Could not separate the body text. Returning the full original text.")
+         return header_text
+
+     tqdm.write(f"[*] Summarizing the identified body text ({len(body_text)} characters)...")
+
+     try:
+         # Pass the flag down to the LLM creator
+         llm = create_llm(model, client, disable_qwen_thinking=disable_qwen_thinking)
+     except ValueError as e:
+         return f"Error: {e}"
+
+     body_summary = ""
+
+     tqdm.write("[*] Attempting high-speed 'stuff' summarization strategy for the body text...")
+     try:
+         stuff_prompt_template = """
+ # INSTRUCTION
+ You are a senior editor. Your task is to read the following body text of a research paper and synthesize it into a single, coherent, and detailed summary.
+ This summary needs to cover all the essential aspects of the provided text.
+
+ # PAPER BODY TEXT:
+ ---
+ {text}
+ ---
+
+ # YOUR DETAILED SYNTHESIZED SUMMARY:
+ """
+         STUFF_PROMPT = PromptTemplate(template=stuff_prompt_template, input_variables=["text"])
+         stuff_chain = load_summarize_chain(llm, chain_type="stuff", prompt=STUFF_PROMPT, verbose=True)
+
+         docs = [Document(page_content=body_text)]
+         body_summary = await stuff_chain.arun(docs)
+         tqdm.write("[✓] 'Stuff' strategy for the body text was successful!")
+
+     except BadRequestError as e:
+         if "context_length_exceeded" not in str(e).lower() and "maximum context length" not in str(e).lower() and "context length" not in str(e).lower():
+             tqdm.write(f"[!] Unexpected API error with 'stuff' strategy: {e}")
+             return f"Error: API call failed - {e}"
+         tqdm.write("[!] Body text is too long for the 'stuff' strategy. Falling back to 'map_reduce'.")
+
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=30000,
+             chunk_overlap=3000
+         )
+         chunks = text_splitter.split_text(body_text)
+         docs = [Document(page_content=t) for t in chunks]
+         tqdm.write(f"[*] Body text has been split into {len(docs)} chunks for summarization.")
+
+         map_prompt_template = """
+ # INSTRUCTION
+ You are a research analyst. Your task is to read the following text segment from a scientific paper and generate a concise summary.
+ Focus only on the most critical information: the research question, the proposed method, key results, and the main conclusion.
+ The language must be refined and to the point.
+
+ # TEXT SEGMENT:
+ ---
+ {text}
+ ---
+
+ # YOUR CONCISE SUMMARY:
+ """
+         MAP_PROMPT = PromptTemplate(template=map_prompt_template, input_variables=["text"])
+
+         combine_prompt_template = """
+ # INSTRUCTION
+ You are a senior editor. You have received several summaries extracted from different parts of the same research paper.
+ Your task is to synthesize these summaries into a single, coherent final summary.
+
+ # LIST OF SUMMARIES:
+ ---
+ {text}
+ ---
+
+ # YOUR SYNTHESIZED FINAL DETAILED SUMMARY:
+ """
+         COMBINE_PROMPT = PromptTemplate(template=combine_prompt_template, input_variables=["text"])
+
+         map_reduce_chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=MAP_PROMPT, combine_prompt=COMBINE_PROMPT, verbose=True)
+
+         try:
+             body_summary = await map_reduce_chain.arun(docs)
+             tqdm.write("[✓] 'Map-Reduce' summarization for the body text is complete.")
+         except Exception as chain_error:
+             tqdm.write(f"[!] 'Map-Reduce' chain execution failed: {chain_error}")
+             return f"Error: 'Map-Reduce' summarization failed - {chain_error}"
+
+     except Exception as e:
+         tqdm.write(f"[!] An unknown error occurred during the summarization process: {e}")
+         return f"Error: Summarization failed - {e}"
+
+     final_text = f"{header_text}\n\n[--- Body Summary ---]\n\n{body_summary}"
+     return final_text
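
The summarizer is length-gated: anything at or under `SUMMARIZATION_THRESHOLD` (4000 characters) passes through untouched, and longer texts keep their header verbatim while only the body is summarized. The header split itself needs no API key, so it can be sanity-checked in isolation (the text below is a made-up fragment):

from pragent.backend.text_processor import split_text_by_structure

text = "A Paper Title\nJane Doe\nAbstract\nWe study X in depth.\n1. Introduction\nRecent work..."
header, body = split_text_by_structure(text)
# header -> title, authors, and abstract; body -> "1. Introduction\nRecent work..." onward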
pragent/backend/yolo.py ADDED
@@ -0,0 +1,51 @@
+ # yolo.py
+ import os
+ from PIL import Image
+ from doclayout_yolo import YOLOv10
+ from tqdm.asyncio import tqdm
+ CLASS_NAMES = {
+     0: "title",
+     1: "plain_text",
+     2: "abandon",
+     3: "figure",
+     4: "figure_caption",
+     5: "table",
+     6: "table_caption_above",
+     7: "table_caption_below",
+     8: "formula",
+     9: "formula_caption",
+ }
+
+ def extract_and_save_layout_components(image_path, model_path, save_base_dir="./cropped_results", imgsz=1024, conf=0.2, device="cpu"):
+     """
+     Extracts document layout components from an image and saves each crop by class.
+
+     Args:
+         image_path (str): Path to the input image
+         model_path (str): Path to the model weights (.pt)
+         save_base_dir (str): Root directory for the saved crops
+         imgsz (int): Input image size (the image is rescaled to this size)
+         conf (float): Confidence threshold for detection boxes
+         device (str): Compute device to use, e.g. 'cuda:0' or 'cpu'
+     """
+     model = YOLOv10(model_path)
+     image = Image.open(image_path)
+     det_results = model.predict(image_path, imgsz=imgsz, conf=conf, device=device)
+
+     result = det_results[0]
+     boxes = result.boxes.xyxy.cpu().tolist()
+     classes = result.boxes.cls.cpu().tolist()
+     scores = result.boxes.conf.cpu().tolist()
+
+     for idx, (box, cls_id, score) in enumerate(zip(boxes, classes, scores)):
+         cls_id = int(cls_id)
+         class_name = CLASS_NAMES.get(cls_id, f"cls{cls_id}")
+         save_dir = os.path.join(save_base_dir, class_name)
+         os.makedirs(save_dir, exist_ok=True)
+         x1, y1, x2, y2 = map(int, box)
+         cropped = image.crop((x1, y1, x2, y2))
+         if cropped.mode == 'RGBA':
+             cropped = cropped.convert('RGB')
+         save_path = os.path.join(save_dir, f"{class_name}_{idx}_score{score:.2f}.jpg")
+         cropped.save(save_path)
+     tqdm.write(f"Saved {len(boxes)} crops, organized by class under {save_base_dir}/")
pragent/logo/logo.png ADDED
pragent/model/doclayout_yolo_docstructbench_imgsz1024.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a2ee0220fe3d9ad31b47e1d9f1282f46959a54e4618fce9cffcc9715b8286e2
+ size 40709302
requirements.txt ADDED
@@ -0,0 +1,29 @@
+ aiofiles==24.1.0
+ arxiv==2.2.0
+ beautifulsoup4==4.13.5
+ bert_score==0.3.13
+ doclayout_yolo==0.0.4
+ PyMuPDF  # provides the "fitz" module the code imports; the PyPI package "fitz==0.0.1.dev2" is an unrelated stub
+ gradio
+ langchain==0.3.27
+ langchain_openai==0.3.33
+ lxml  # parser backend for the BeautifulSoup(..., "lxml") calls in pdf2html.py and html2txt.py
+ matplotlib==3.10.6
+ numpy==2.3.3
+ openai==1.108.1
+ pandas==2.3.2
+ pdfplumber==0.11.7
+ Pillow==11.3.0
+ prettytable==3.16.0
+ pydantic==2.11.9
+ pytesseract==0.3.13
+ python-dotenv==1.1.1
+ PyYAML==6.0.2
+ rouge_score==0.1.2
+ scipy==1.16.2
+ seaborn==0.13.2
+ simpledorff==0.0.2
+ tiktoken==0.11.0
+ tqdm==4.67.1
+ huggingface_hub
+ sentence-transformers