092_agent_api / pdf_processing.py
anhkhoiphan's picture
Thêm các hàm xử lý pdf
20a314b
import re
import pdfplumber
def _clean_text(text: str) -> str:
if not text:
return ""
text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
lines = [line.rstrip() for line in text.split('\n')]
return '\n'.join(lines).strip()
def _table_to_markdown(table: list) -> str:
if not table:
return ""
cleaned = [[str(c).strip() if c else "" for c in row] for row in table]
num_cols = len(cleaned[0])
col_widths = [0] * num_cols
for row in cleaned:
for i, cell in enumerate(row[:num_cols]):
col_widths[i] = max(col_widths[i], len(cell))
lines = []
header = cleaned[0]
lines.append("| " + " | ".join(c.ljust(col_widths[i]) for i, c in enumerate(header[:num_cols])) + " |")
lines.append("| " + " | ".join("-" * w for w in col_widths) + " |")
for row in cleaned[1:]:
lines.append("| " + " | ".join(c.ljust(col_widths[i]) for i, c in enumerate(row[:num_cols])) + " |")
return "\n".join(lines)
def pdf_to_markdown(pdf_path: str) -> str:
    """Convert a PDF file into a single Markdown string.

    Every page contributes a horizontal rule and a ``## Trang n/total``
    heading ("Trang" is Vietnamese for "page"), followed by the page's
    cleaned text if any, followed by each non-empty table rendered via
    ``_table_to_markdown`` under a ``**Bảng i:**`` label ("Bảng" is
    Vietnamese for "table"). Table numbers follow the position in the list
    returned by ``page.extract_tables()``, so skipped empty tables still
    consume a number.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        The assembled Markdown, passed through ``_clean_text`` once more.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        page_count = len(document.pages)
        for page_no, page in enumerate(document.pages, start=1):
            chunks.append(f"\n---\n## Trang {page_no}/{page_count}\n")

            # Tables are extracted before the text, but the text is emitted
            # first in the output.
            page_tables = page.extract_tables()
            page_text = page.extract_text()

            if page_text:
                chunks.extend((_clean_text(page_text), "\n"))

            for table_no, table in enumerate(page_tables or (), start=1):
                if not table:
                    continue
                chunks.extend((
                    f"\n**Bảng {table_no}:**\n",
                    _table_to_markdown(table),
                    "\n",
                ))

    return _clean_text("\n".join(chunks))
def format_chat_history(messages: list[dict]) -> str:
    """Format chat messages as one ``sender: content`` line per message.

    For each message the first truthy value is taken from each key pair:
    ``senderName``/``sender_id`` (fallback ``"?"``), ``content``/``message``
    (fallback ``""``) and ``timestamp``/``created_at``. When a timestamp is
    present the line is prefixed with ``[timestamp] ``.

    Args:
        messages: Message dicts; a falsy value (``None``, ``[]``) is accepted.

    Returns:
        The lines joined with newlines, or a fixed Vietnamese placeholder
        string when ``messages`` is falsy.
    """
    if not messages:
        return "(Không có lịch sử trò chuyện)"

    def _render(entry: dict) -> str:
        # Build the single output line for one message.
        who = entry.get("senderName") or entry.get("sender_id") or "?"
        body = entry.get("content") or entry.get("message") or ""
        stamp = entry.get("timestamp") or entry.get("created_at") or ""
        if stamp:
            return f"[{stamp}] {who}: {body}"
        return f"{who}: {body}"

    return "\n".join(map(_render, messages))