# NOTE(review): the capture began with Hugging Face Spaces UI artifacts
# ("Spaces:" / "Sleeping") that are not part of this module; commented out
# here so the file parses. Safe to delete once confirmed.
import os
import tempfile

import chromadb
from chromadb.config import Settings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.base import VectorStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_openai import OpenAIEmbeddings
def process_file(file_data, file_type: str = None) -> list:
    """Load a PDF and split it into chunked documents for indexing.

    Args:
        file_data: Either a filesystem path (str) or the raw PDF bytes.
        file_type: Optional MIME type; when provided it must be
            "application/pdf".

    Returns:
        List of split documents, each tagged with a unique ``source``
        metadata key (``source_0``, ``source_1``, ...).

    Raises:
        TypeError: If ``file_type`` is provided and is not a PDF MIME type.
        ValueError: If parsing produced no documents.
    """
    if file_type and file_type != "application/pdf":
        raise TypeError("Only PDF files are supported")

    if isinstance(file_data, bytes):
        # PDFPlumberLoader needs a real path, so spill the bytes into a
        # temporary file (delete=False so it can be reopened by the loader,
        # which also works on Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(file_data)
            tmp_file_path = tmp_file.name
        try:
            documents = PDFPlumberLoader(tmp_file_path).load()
        finally:
            # Always remove the spill file, even if loading raises.
            os.unlink(tmp_file_path)
    else:
        # Assume it's a filesystem path.
        documents = PDFPlumberLoader(file_data).load()

    # Fail fast if the PDF yielded nothing — the original check fired only
    # after splitting, same exception type but later.
    if not documents:
        raise ValueError("PDF file parsing failed.")

    # Normalize whitespace: PDF extraction commonly injects spurious line
    # breaks and runs of spaces inside sentences. str.split() with no
    # argument splits on ALL whitespace (including newlines), so a separate
    # replace('\n', ' ') pass is redundant.
    for doc in documents:
        doc.page_content = " ".join(doc.page_content.split())

    # NOTE: newlines were collapsed above, so the "\n\n"/"\n" separators can
    # never match and splitting effectively uses the " " fallback; they are
    # kept for forward compatibility with un-normalized documents.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
    docs = text_splitter.split_documents(documents)

    # Tag each chunk with a unique, citable source id.
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"

    if not docs:
        raise ValueError("PDF file parsing failed.")
    return docs
def create_search_engine(file_data, file_type: str = None, api_key: str = None) -> tuple[VectorStore, list]:
    """Create a Chroma vector-store search engine from a PDF file.

    Args:
        file_data: Either a filesystem path (str) or the raw PDF bytes.
        file_type: Optional MIME type forwarded to :func:`process_file`
            for validation.
        api_key: OpenAI API key used by the embedding model.

    Returns:
        Tuple of ``(search_engine, docs)``:
            - search_engine: Chroma vector store built over the chunks.
            - docs: The processed document chunks.

    Raises:
        TypeError: If ``file_type`` is provided and is not a PDF MIME type.
        ValueError: If PDF parsing produced no documents.
    """
    # Process the file into chunked documents.
    docs = process_file(file_data, file_type)
    encoder = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)

    # allow_reset must be set on the client itself: the original code
    # constructed EphemeralClient() with default settings and only passed
    # these Settings to the Chroma wrapper, so reset() on that client could
    # be refused ("Resetting is not allowed...") unless the ALLOW_RESET env
    # var happened to be set.
    client_settings = Settings(
        allow_reset=True,
        anonymized_telemetry=False,
    )
    client = chromadb.EphemeralClient(settings=client_settings)

    # Reset through the public client API to guarantee a clean store —
    # no need for a throwaway Chroma instance or private `_client` access.
    client.reset()

    search_engine = Chroma.from_documents(
        client=client,
        documents=docs,
        embedding=encoder,
        client_settings=client_settings,
    )
    return search_engine, docs