# NOTE(review): the capture began with Hugging Face Spaces UI artifacts
# ("Spaces:" / "Sleeping") that are not part of this module; commented out
# here so the file parses. Safe to delete once confirmed.
import os
import tempfile

import chromadb
from chromadb.config import Settings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.base import VectorStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_openai import OpenAIEmbeddings
def process_file(file_data, file_type: str = None) -> list:
    """Load a PDF and split it into chunked documents for indexing.

    Args:
        file_data: Either a filesystem path (str) or the raw PDF bytes.
        file_type: Optional MIME type; when provided it must be
            "application/pdf".

    Returns:
        List of split documents, each tagged with a unique ``source``
        metadata key (``source_0``, ``source_1``, ...).

    Raises:
        TypeError: If ``file_type`` is provided and is not a PDF MIME type.
        ValueError: If parsing produced no documents.
    """
    if file_type and file_type != "application/pdf":
        raise TypeError("Only PDF files are supported")

    if isinstance(file_data, bytes):
        # PDFPlumberLoader needs a real path, so spill the bytes into a
        # temporary file (delete=False so it can be reopened by the loader,
        # which also works on Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(file_data)
            tmp_file_path = tmp_file.name
        try:
            documents = PDFPlumberLoader(tmp_file_path).load()
        finally:
            # Always remove the spill file, even if loading raises.
            os.unlink(tmp_file_path)
    else:
        # Assume it's a filesystem path.
        documents = PDFPlumberLoader(file_data).load()

    # Fail fast if the PDF yielded nothing — the original check fired only
    # after splitting, same exception type but later.
    if not documents:
        raise ValueError("PDF file parsing failed.")

    # Normalize whitespace: PDF extraction commonly injects spurious line
    # breaks and runs of spaces inside sentences. str.split() with no
    # argument splits on ALL whitespace (including newlines), so a separate
    # replace('\n', ' ') pass is redundant.
    for doc in documents:
        doc.page_content = " ".join(doc.page_content.split())

    # NOTE: newlines were collapsed above, so the "\n\n"/"\n" separators can
    # never match and splitting effectively uses the " " fallback; they are
    # kept for forward compatibility with un-normalized documents.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
    docs = text_splitter.split_documents(documents)

    # Tag each chunk with a unique, citable source id.
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"

    if not docs:
        raise ValueError("PDF file parsing failed.")
    return docs
def create_search_engine(file_data, file_type: str = None, api_key: str = None) -> tuple[VectorStore, list]:
    """Create a Chroma vector-store search engine from a PDF file.

    Args:
        file_data: Either a filesystem path (str) or the raw PDF bytes.
        file_type: Optional MIME type forwarded to :func:`process_file`
            for validation.
        api_key: OpenAI API key used by the embedding model.

    Returns:
        Tuple of ``(search_engine, docs)``:
            - search_engine: Chroma vector store built over the chunks.
            - docs: The processed document chunks.

    Raises:
        TypeError: If ``file_type`` is provided and is not a PDF MIME type.
        ValueError: If PDF parsing produced no documents.
    """
    # Process the file into chunked documents.
    docs = process_file(file_data, file_type)
    encoder = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)

    # allow_reset must be set on the client itself: the original code
    # constructed EphemeralClient() with default settings and only passed
    # these Settings to the Chroma wrapper, so reset() on that client could
    # be refused ("Resetting is not allowed...") unless the ALLOW_RESET env
    # var happened to be set.
    client_settings = Settings(
        allow_reset=True,
        anonymized_telemetry=False,
    )
    client = chromadb.EphemeralClient(settings=client_settings)

    # Reset through the public client API to guarantee a clean store —
    # no need for a throwaway Chroma instance or private `_client` access.
    client.reset()

    search_engine = Chroma.from_documents(
        client=client,
        documents=docs,
        embedding=encoder,
        client_settings=client_settings,
    )
    return search_engine, docs