aakash0563 committed on
Commit
b9ca8c8
1 Parent(s): c176e10

Create upload_pdf.py

Browse files
Files changed (1) hide show
  1. upload_pdf.py +41 -0
upload_pdf.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from langchain.document_loaders import PyPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from uuid import uuid4
5
+
6
+
7
# Splitter shared by every upload: chunks of at most 800 characters, with a
# 50-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=50,
)

# On-disk Chroma store located in the "test" directory.
client = chromadb.PersistentClient("test")

# The store persists between runs, so "test_data" already exists on every
# start after the first one. create_collection() raises in that case;
# get_or_create_collection() reuses the existing collection instead.
collection = client.get_or_create_collection("test_data")
13
+
14
def upload_pdf(file_path):
    """Split a PDF into text chunks and store them in the ChromaDB collection.

    Every page is loaded with ``PyPDFLoader``, split with the module-level
    ``text_splitter``, and each resulting chunk is added to the module-level
    ``collection`` under a fresh UUID together with its page's metadata.

    Args:
        file_path: Location of the PDF. Either a ``str`` / ``os.PathLike``
            path, or an uploaded-file object that exposes the path through a
            ``name`` attribute (presumably what older Gradio ``File`` inputs
            pass in -- confirm against the installed Gradio version).

    Returns:
        A human-readable status string that includes the total number of
        chunks currently held by the collection.
    """
    import os  # local import keeps this block independent of the import header

    # The UI can invoke the callback without any file selected.
    if file_path is None:
        return "No PDF file provided."

    # Paths are used as-is. For anything else, fall back to its ``name``
    # attribute. ``getattr`` is not used on paths because ``Path.name`` is
    # only the final path component, not the full location.
    if isinstance(file_path, (str, os.PathLike)):
        pdf_path = os.fspath(file_path)
    else:
        pdf_path = getattr(file_path, "name", file_path)

    pages = PyPDFLoader(pdf_path).load()

    texts = []
    metadatas = []
    for page in pages:
        for chunk in text_splitter.split_text(page.page_content):
            # Store the individual chunk. Keeping the whole per-page list and
            # indexing [0] later would duplicate the first chunk of the page
            # and drop the rest of its text.
            texts.append(chunk)
            metadatas.append(page.metadata)

    # Nothing to add (e.g. a scanned PDF without a text layer); skip the
    # insert rather than handing empty lists to the collection.
    if not texts:
        return f"No extractable text found in PDF. {collection.count()} chunks stored in ChromaDB"

    collection.add(
        ids=[str(uuid4()) for _ in texts],
        documents=texts,
        metadatas=metadatas,
    )
    return f"PDF Uploaded Successfully. {collection.count()} chunks stored in ChromaDB"
30
+
31
# `gr` is referenced below but is not imported by the header of this file,
# which would raise NameError on start-up; import it here so the interface
# definition works on its own.
import gradio as gr

# Define the Gradio interface
iface = gr.Interface(
    fn=upload_pdf,
    inputs=["file"],  # Specify a file input component
    outputs="textbox",  # Display the output text in a textbox
    title="Upload PDF to ChromaDB",
    description="Upload a PDF file and store its text chunks in ChromaDB.",
)

# Launch the Gradio app
iface.launch(debug=True, share=True)