| | import os |
| | import sys |
| | from pathlib import Path |
| | import markdown |
| | from sqlalchemy.orm import Session |
| |
|
| | |
| | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) |
| |
|
| | from backend.database import get_db, engine |
| | from backend.models.book_content import BookContent, Base |
| | from backend.ai.embeddings import qwen_embeddings |
| | from backend.database.vector_db import vector_db |
| |
|
| |
|
def create_tables():
    """Idempotently create all ORM-mapped tables on the configured engine.

    ``Base.metadata.create_all`` skips tables that already exist, so this
    is safe to call at the start of every ingestion run.
    """
    Base.metadata.create_all(bind=engine)
| |
|
| |
|
def read_markdown_files(docs_dir: str) -> list:
    """Recursively collect Markdown source files under *docs_dir*.

    Args:
        docs_dir: Root directory to walk. A nonexistent directory simply
            yields an empty list (``os.walk`` produces nothing).

    Returns:
        List of file paths (joined via ``os.path.join``) for every ``.md``
        and ``.mdx`` file found, in ``os.walk`` order.
    """
    md_files = []
    for root, _dirs, files in os.walk(docs_dir):
        # str.endswith accepts a tuple, so one call covers both extensions.
        md_files.extend(
            os.path.join(root, name)
            for name in files
            if name.endswith((".md", ".mdx"))
        )
    return md_files
| |
|
| |
|
def extract_content_from_file(file_path: str) -> str:
    """Return the plain-text content of one Markdown file.

    Drops a leading ``---``-delimited frontmatter block when present,
    renders the remaining Markdown to HTML, then strips the HTML tags.

    Args:
        file_path: Path to a ``.md``/``.mdx`` file, read as UTF-8.

    Returns:
        The tag-free text, stripped of surrounding whitespace. HTML
        entities emitted by the renderer (e.g. ``&amp;``) are left as-is.
    """
    import re

    content = Path(file_path).read_text(encoding='utf-8')

    # Frontmatter: a file starting with '---' splits into
    # ['', frontmatter, body]; fewer than 3 parts means no closing fence,
    # in which case the content is kept untouched.
    if content.startswith('---'):
        pieces = content.split('---', 2)
        if len(pieces) >= 3:
            content = pieces[2]

    html = markdown.markdown(content)

    # Remove HTML tags, leaving only the rendered text.
    return re.sub('<[^<]+?>', '', html).strip()
| |
|
| |
|
def process_book_content(docs_dir: str = "docs") -> None:
    """Ingest every Markdown file under *docs_dir* into the database.

    For each file: extract plain text, embed the first 2000 characters,
    persist a ``BookContent`` row, and mirror the embedding (plus a short
    metadata snippet) into the vector store. Individual file failures are
    reported and skipped; a database-level failure rolls back the batch.

    Args:
        docs_dir: Root directory containing the Markdown sources
            (defaults to ``"docs"``, preserving the original behavior).
    """
    print("Starting book content processing...")

    create_tables()

    # get_db() is a generator-style dependency; pull one Session from it.
    db_gen = get_db()
    db: Session = next(db_gen)

    try:
        md_files = read_markdown_files(docs_dir)
        print(f"Found {len(md_files)} markdown files to process")

        for file_path in md_files:
            print(f"Processing: {file_path}")
            try:
                content_text = extract_content_from_file(file_path)

                # Skip stubs / effectively empty pages.
                if len(content_text.strip()) < 10:
                    print(f"Skipping {file_path}, content too short")
                    continue

                # Truncate so the embedding request stays bounded.
                embedding = qwen_embeddings.get_embedding(content_text[:2000])

                book_content = BookContent(
                    title=Path(file_path).stem,
                    content=content_text,
                    source_path=file_path,
                    embedding_vector=str(embedding),
                )

                db.add(book_content)
                # BUG FIX: flush so the database assigns the primary key.
                # Previously ``book_content.id`` was read before any
                # flush/commit and was therefore None when handed to the
                # vector store.
                db.flush()

                vector_db.store_embedding(
                    content_id=book_content.id,
                    embedding=embedding,
                    content_metadata={
                        "title": book_content.title,
                        "content": book_content.content[:500],
                        "source_path": book_content.source_path,
                    },
                )

                print(f"Processed: {file_path}")

            except Exception as e:
                # Best-effort per file: report and move on to the next one.
                print(f"Error processing {file_path}: {str(e)}")
                continue

        db.commit()
        print("Book content processing completed successfully!")

    except Exception as e:
        print(f"Error during book content processing: {str(e)}")
        db.rollback()
    finally:
        db.close()
| |
|
| |
|
def main() -> None:
    """Script entry point: run the full book-content ingestion."""
    process_book_content()


if __name__ == "__main__":
    main()