"""Document processor for parsing and chunking HPMOR HTML."""

import json
import re
from pathlib import Path
from typing import Dict, List, Optional

from bs4 import BeautifulSoup
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

from src.config import config


class HPMORProcessor:
    """Process HPMOR HTML document into chunks for RAG."""

    def __init__(self):
        self.chunk_size = config.chunk_size
        self.chunk_overlap = config.chunk_overlap
        self.processed_dir = config.processed_data_dir

    def parse_html(self, file_path: Path) -> List[Dict]:
        """Parse HTML file and extract chapters with metadata."""
        print(f"Parsing HTML file: {file_path}")

        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'lxml')
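
        # Remove style and script tags so CSS/JS text doesn't leak into the extracted chapters.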
        for tag in soup(['style', 'script']):
            tag.decompose()

        chapters = []
        chapter_pattern = re.compile(r'Chapter\s+(\d+)', re.IGNORECASE)
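
        # Chapter headings are expected to be h1/h2/h3 elements whose text contains "Chapter N".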
        headers = soup.find_all(['h1', 'h2', 'h3'])

        current_chapter = None
        current_content = []
        chapter_num = 0

        for header in headers:
            header_text = header.get_text(strip=True)
            match = chapter_pattern.search(header_text)

            if match:
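                # Save the previous chapter before starting a new one.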
                if current_chapter and current_content:
                    chapters.append({
                        'chapter_number': current_chapter['number'],
                        'chapter_title': current_chapter['title'],
                        'content': '\n'.join(current_content)
                    })

                chapter_num = int(match.group(1))
                current_chapter = {
                    'number': chapter_num,
                    'title': header_text
                }
                current_content = []
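
                # Collect text from every sibling element until the next chapter heading.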
                for sibling in header.find_next_siblings():
                    if sibling.name in ['h1', 'h2', 'h3']:
                        if chapter_pattern.search(sibling.get_text()):
                            break
                    text = sibling.get_text(strip=True)
                    if text:
                        current_content.append(text)
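
        # Append the final chapter once all headings have been processed.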
        if current_chapter and current_content:
            chapters.append({
                'chapter_number': current_chapter['number'],
                'chapter_title': current_chapter['title'],
                'content': '\n'.join(current_content)
            })
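
        # Fall back to a single pseudo-chapter if no chapter headings were found.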
        if not chapters:
            print("No chapter structure found, processing as single document")
            text_content = soup.get_text(separator='\n', strip=True)
            chapters = [{
                'chapter_number': 0,
                'chapter_title': 'Harry Potter and the Methods of Rationality',
                'content': text_content
            }]

        print(f"Extracted {len(chapters)} chapters")
        return chapters

    def create_chunks(self, chapters: List[Dict]) -> List[Document]:
        """Create overlapping chunks from chapters."""
        print(f"Creating chunks with size={self.chunk_size}, overlap={self.chunk_overlap}")

        documents = []
        splitter = SentenceSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
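
        # Split each chapter into overlapping chunks, carrying chapter metadata onto every chunk.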
        for chapter in chapters:
            chapter_doc = Document(
                text=chapter['content'],
                metadata={
                    'chapter_number': chapter['chapter_number'],
                    'chapter_title': chapter['chapter_title'],
                    'source': 'hpmor.html'
                }
            )

            nodes = splitter.get_nodes_from_documents([chapter_doc])
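
            # Wrap each node back into a Document with chunk-level metadata.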
            for i, node in enumerate(nodes):
                doc = Document(
                    text=node.text,
                    metadata={
                        **chapter_doc.metadata,
                        'chunk_id': f"ch{chapter['chapter_number']}_chunk{i}",
                        'chunk_index': i,
                        'total_chunks_in_chapter': len(nodes)
                    }
                )
                documents.append(doc)

        print(f"Created {len(documents)} chunks total")
        return documents

    def save_processed_data(self, documents: List[Document], chapters: List[Dict]) -> None:
        """Save processed documents and metadata to disk."""

        docs_data = []
        for doc in documents:
            docs_data.append({
                'text': doc.text,
                'metadata': doc.metadata
            })

        docs_file = self.processed_dir / 'documents.json'
        with open(docs_file, 'w', encoding='utf-8') as f:
            json.dump(docs_data, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(docs_data)} documents to {docs_file}")

        chapters_file = self.processed_dir / 'chapters.json'
        with open(chapters_file, 'w', encoding='utf-8') as f:
            json.dump(chapters, f, indent=2, ensure_ascii=False)
        print(f"Saved chapter metadata to {chapters_file}")

    def load_processed_data(self) -> Optional[List[Document]]:
        """Load previously processed documents."""
        docs_file = self.processed_dir / 'documents.json'

        if not docs_file.exists():
            return None

        with open(docs_file, 'r', encoding='utf-8') as f:
            docs_data = json.load(f)

        documents = []
        for doc_data in docs_data:
            doc = Document(
                text=doc_data['text'],
                metadata=doc_data['metadata']
            )
            documents.append(doc)

        print(f"Loaded {len(documents)} documents from cache")
        return documents

    def process(self, force_reprocess: bool = False) -> List[Document]:
        """Main processing pipeline."""
        if not force_reprocess:
            documents = self.load_processed_data()
            if documents:
                return documents

        print("Processing HPMOR document from scratch...")

        if not config.hpmor_file.exists():
            raise FileNotFoundError(f"HPMOR file not found: {config.hpmor_file}")

        chapters = self.parse_html(config.hpmor_file)
        documents = self.create_chunks(chapters)
        self.save_processed_data(documents, chapters)

        return documents


def main():
    """Process HPMOR document."""
    processor = HPMORProcessor()
    documents = processor.process(force_reprocess=True)
    print(f"\nProcessing complete! Created {len(documents)} document chunks.")

    if documents:
        print("\nSample chunk:")
        print(f"Text: {documents[0].text[:200]}...")
        print(f"Metadata: {documents[0].metadata}")


if __name__ == "__main__":
    main()