Spaces:

taaha3244
/

Lex

Runtime error

App Files Files Community

taaha3244 committed on May 17, 2024

Commit

5a47e6d

verified ·

1 Parent(s): 0457256

Create preprocess.py

Browse files

Files changed (1) hide show

preprocess.py +44 -0

preprocess.py ADDED Viewed

	@@ -0,0 +1,44 @@

+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import UnstructuredAPIFileLoader
+def load_documents_OCR(file_path, unstructured_api):
+    """Load documents that require OCR via unstructured."""
+    loader = UnstructuredAPIFileLoader(file_path=file_path, api_key=unstructured_api)
+    documents = loader.load()
+    return documents
+def load_documents(file_path):
+    """Load documents using LangChain."""
+    loader = PyPDFLoader(file_path)
+    documents = loader.load()
+    return documents
+def split_documents(documents):
+    """Split documents using LangChain splitter."""
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
+    split_docs = text_splitter.split_documents(documents)
+    return split_docs
+def load_and_split_documents(file_path):
+    """Load and split documents from the specified file path."""
+    loader = PyPDFLoader(file_path)
+    documents = loader.load()
+    if not documents:
+        print("No documents loaded from file:", file_path)
+        return []
+    split_docs = split_documents(documents)
+    if not split_docs:
+        print("Document splitting resulted in no output for file:", file_path)
+    return split_docs
+def update_metadata(documents, original_name):
+    """Update metadata for each document."""
+    updated_documents = []
+    for doc in documents:
+        doc.metadata['source'] = original_name
+        updated_documents.append(doc)
+    return updated_documents