Spaces:
Running
Running
| import re | |
| import pdfplumber | |
| def _clean_text(text: str) -> str: | |
| if not text: | |
| return "" | |
| text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text) | |
| lines = [line.rstrip() for line in text.split('\n')] | |
| return '\n'.join(lines).strip() | |
| def _table_to_markdown(table: list) -> str: | |
| if not table: | |
| return "" | |
| cleaned = [[str(c).strip() if c else "" for c in row] for row in table] | |
| num_cols = len(cleaned[0]) | |
| col_widths = [0] * num_cols | |
| for row in cleaned: | |
| for i, cell in enumerate(row[:num_cols]): | |
| col_widths[i] = max(col_widths[i], len(cell)) | |
| lines = [] | |
| header = cleaned[0] | |
| lines.append("| " + " | ".join(c.ljust(col_widths[i]) for i, c in enumerate(header[:num_cols])) + " |") | |
| lines.append("| " + " | ".join("-" * w for w in col_widths) + " |") | |
| for row in cleaned[1:]: | |
| lines.append("| " + " | ".join(c.ljust(col_widths[i]) for i, c in enumerate(row[:num_cols])) + " |") | |
| return "\n".join(lines) | |
def pdf_to_markdown(pdf_path: str) -> str:
    """Convert a PDF file into a Markdown document, page by page.

    For every page a ``## Trang n/total`` heading (preceded by a ``---``
    rule) is emitted, followed by the page's cleaned text, if any, and then
    each non-empty extracted table rendered as a Markdown table under a
    ``**Bảng i:**`` label. Table numbers are the 1-based positions in the
    list returned by ``extract_tables()``, so an empty table still consumes
    a number.

    Args:
        pdf_path: Path passed straight to ``pdfplumber.open``.

    Returns:
        The assembled Markdown, passed through ``_clean_text`` once more.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        page_count = len(document.pages)
        for page_no, page in enumerate(document.pages, 1):
            chunks.append(f"\n---\n## Trang {page_no}/{page_count}\n")
            page_tables = page.extract_tables()
            page_text = page.extract_text()
            if page_text:
                chunks.extend((_clean_text(page_text), "\n"))
            for table_no, rows in enumerate(page_tables or (), 1):
                if not rows:
                    continue
                chunks.extend(
                    (
                        f"\n**Bảng {table_no}:**\n",
                        _table_to_markdown(rows),
                        "\n",
                    )
                )
    return _clean_text("\n".join(chunks))
def format_chat_history(messages: list[dict]) -> str:
    """Format chat messages as one ``[timestamp] sender: content`` line each.

    For each message the first usable value is taken from these key pairs:

    * sender: ``senderName``, then ``sender_id`` (default ``"?"``)
    * content: ``content``, then ``message`` (default ``""``)
    * timestamp: ``timestamp``, then ``created_at`` (prefix omitted when
      neither is present)

    Only ``None`` and the empty string count as missing, so falsy but valid
    values such as a numeric ``sender_id`` of ``0`` or a timestamp of ``0``
    are kept instead of being replaced by the fallback.

    Args:
        messages: The message dicts; may be empty or ``None``.

    Returns:
        The formatted lines joined with newlines, or a fixed placeholder
        string when *messages* is falsy.
    """
    if not messages:
        return "(Không có lịch sử trò chuyện)"

    def _first_present(message: dict, *keys: str, default=""):
        # Return the first value that is neither None nor "".
        for key in keys:
            value = message.get(key)
            if value is not None and value != "":
                return value
        return default

    lines = []
    for m in messages:
        sender = _first_present(m, "senderName", "sender_id", default="?")
        content = _first_present(m, "content", "message")
        ts = _first_present(m, "timestamp", "created_at")
        # Compare against "" rather than truthiness so a timestamp of 0 shows.
        prefix = f"[{ts}] " if ts != "" else ""
        lines.append(f"{prefix}{sender}: {content}")
    return "\n".join(lines)