taaha3244 committed on
Commit
5a47e6d
1 Parent(s): 0457256

Create preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +44 -0
preprocess.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.document_loaders import UnstructuredAPIFileLoader
4
+
5
+
6
def load_documents_OCR(file_path, unstructured_api):
    """Load a file through the Unstructured API loader (used for OCR input).

    Args:
        file_path: Path of the file to load; forwarded to the loader as
            ``file_path``.
        unstructured_api: API key for the Unstructured service; forwarded to
            the loader as ``api_key``.

    Returns:
        Whatever ``UnstructuredAPIFileLoader.load()`` returns for the file.
    """
    return UnstructuredAPIFileLoader(
        file_path=file_path,
        api_key=unstructured_api,
    ).load()
11
+
12
+
13
def load_documents(file_path):
    """Load a PDF with LangChain's ``PyPDFLoader``.

    Args:
        file_path: Path of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader.load()`` returns for the file.
    """
    return PyPDFLoader(file_path).load()
18
+
19
def split_documents(documents, *, chunk_size=2000, chunk_overlap=500):
    """Split documents into overlapping chunks with LangChain's recursive splitter.

    Args:
        documents: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            2000, the value that used to be hard-coded here.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 500, the value that used to be hard-coded here.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Keyword-only sizing keeps existing one-argument callers working while
    # letting new callers tune chunking without editing this module.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
24
+
25
+
26
def load_and_split_documents(file_path):
    """Load a PDF with ``PyPDFLoader`` and split it via ``split_documents``.

    Args:
        file_path: Path of the PDF to load.

    Returns:
        The output of ``split_documents`` for the loaded documents, or an
        empty list when the loader returned nothing. A diagnostic line is
        printed when either the load or the split comes back empty.
    """
    loaded = PyPDFLoader(file_path).load()
    if loaded:
        chunks = split_documents(loaded)
        if not chunks:
            # Report the empty split but still hand back whatever was returned.
            print("Document splitting resulted in no output for file:", file_path)
        return chunks

    print("No documents loaded from file:", file_path)
    return []
37
+
38
def update_metadata(documents, original_name):
    """Set every document's ``metadata['source']`` to ``original_name``.

    Args:
        documents: Iterable of objects exposing a mutable ``metadata`` mapping.
        original_name: Value to store under the ``'source'`` key.

    Returns:
        A new list holding the same document objects, in iteration order.
        The documents themselves are modified in place.
    """

    def _with_source(document):
        # Overwrite in place; the same object is returned, not a copy.
        document.metadata['source'] = original_name
        return document

    return [_with_source(document) for document in documents]