Spaces:

taaha3244
/

Lex

Runtime error

Lex / preprocess.py

Update preprocess.py

9c98b1f verified about 2 months ago

No virus

1.75 kB

	from langchain_community.document_loaders import PyPDFLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders import UnstructuredAPIFileLoader


	def load_documents_OCR(file_path, unstructured_api,
	                       url='https://paf-stkjy1b5.api.unstructuredapp.io/'):
	    """Load documents that require OCR via the Unstructured hosted API.

	    Args:
	        file_path: Path of the file handed to ``UnstructuredAPIFileLoader``.
	        unstructured_api: API key forwarded to the loader as ``api_key``.
	        url: Endpoint of the Unstructured API deployment. Defaults to the
	            deployment this module was originally hard-wired to, so
	            existing two-argument calls behave exactly as before.

	    Returns:
	        Whatever ``loader.load()`` returns for the file.
	    """
	    # mode='paged' is passed straight through to the loader; per the
	    # LangChain docs it groups extracted elements by page -- confirm
	    # against the installed langchain_community version.
	    loader = UnstructuredAPIFileLoader(
	        file_path=file_path,
	        api_key=unstructured_api,
	        url=url,
	        mode='paged',
	    )
	    return loader.load()


	def load_documents(file_path):
	    """Read the file at ``file_path`` with ``PyPDFLoader`` and return what it loads."""
	    return PyPDFLoader(file_path).load()

	def split_documents(documents, chunk_size=2000, chunk_overlap=500):
	    """Split documents into chunks with LangChain's recursive splitter.

	    Args:
	        documents: Documents to split; passed unchanged to
	            ``RecursiveCharacterTextSplitter.split_documents``.
	        chunk_size: Forwarded to the splitter as ``chunk_size``.
	            Defaults to 2000, the value previously hard-coded here.
	        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
	            Defaults to 500, the value previously hard-coded here.

	    Returns:
	        The result of the splitter's ``split_documents`` call.
	    """
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )
	    return text_splitter.split_documents(documents)


	def load_and_split_documents(file_path):
	    """Load the PDF at ``file_path`` and split it into chunks.

	    Args:
	        file_path: Path of the PDF to load.

	    Returns:
	        An empty list when nothing could be loaded; otherwise the result
	        of ``split_documents`` (which may itself be empty). A diagnostic
	        line is printed in either empty case.
	    """
	    # Reuse the module's own loader helper instead of duplicating the
	    # PyPDFLoader construction here, so both entry points stay in sync.
	    documents = load_documents(file_path)
	    if not documents:
	        print("No documents loaded from file:", file_path)
	        return []

	    split_docs = split_documents(documents)
	    if not split_docs:
	        # Report only; the (empty) split result is still returned below.
	        print("Document splitting resulted in no output for file:", file_path)
	    return split_docs

	def update_metadata(documents, original_name):
	    """Stamp ``original_name`` into every document's ``metadata['source']``.

	    Each document is modified in place; the same objects are returned,
	    in their original order, inside a newly built list.
	    """
	    def _retag(item):
	        # Overwrite the 'source' entry, then hand the same object back.
	        item.metadata['source'] = original_name
	        return item

	    return [_retag(item) for item in documents]