# Source: scraped file view (file size: 1,336 bytes; revisions b9ca8c8, 9063322).
import chromadb
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from uuid import uuid4
import gradio as gr


# Splitter applied to each PDF page's text: chunks of at most 800 units
# (characters, by the splitter's default length function) with a 50-unit
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=50,
)

# Persistent Chroma client storing its data under the "test" path.
client = chromadb.PersistentClient("test")

# get_or_create_collection (rather than create_collection) so that restarting
# the app reuses the collection already persisted on disk instead of raising
# because "test_data" exists from a previous run.
collection = client.get_or_create_collection("test_data")

def upload_pdf(file_path):
    """Split a PDF into text chunks and store them in the ChromaDB collection.

    Every page is loaded with ``PyPDFLoader`` and split with the module-level
    ``text_splitter``. Each resulting chunk is added to ``collection`` under a
    fresh UUID, together with the metadata of the page it came from.

    Args:
        file_path: Path of the PDF, or a file-like object exposing the path
            as ``.name`` (Gradio's file input passes one or the other
            depending on its version -- TODO confirm against the pinned
            Gradio release).

    Returns:
        A status message that includes the collection's total chunk count.
    """
    if file_path is None:
        # Nothing was uploaded; avoid handing None to the PDF loader.
        return "No PDF file provided."

    # File-like uploads carry the on-disk path in ``.name``; plain paths are
    # used as given.
    pdf_path = file_path.name if hasattr(file_path, "read") else file_path
    pages = PyPDFLoader(pdf_path).load()

    texts = []
    metadatas = []
    for page in pages:
        # Store each individual chunk (not the page's whole chunk list) so
        # that every chunk of the page ends up in the collection.
        for chunk in text_splitter.split_text(page.page_content):
            texts.append(chunk)
            metadatas.append(page.metadata)

    if not texts:
        # collection.add rejects empty id/document lists, so report instead.
        return (
            "No extractable text found in PDF. "
            f"{collection.count()} chunks stored in ChromaDB"
        )

    collection.add(
        ids=[str(uuid4()) for _ in texts],
        documents=texts,
        metadatas=metadatas,
    )
    return f"PDF Uploaded Successfully. {collection.count()} chunks stored in ChromaDB"

# Gradio UI: one file input wired to upload_pdf, with the returned status
# message rendered in a textbox.
iface = gr.Interface(
    title="Upload PDF to ChromaDB",
    description="Upload a PDF file and store its text chunks in ChromaDB.",
    fn=upload_pdf,
    inputs=["file"],
    outputs="textbox",
)

# Start the app with debug mode enabled and a share link requested.
iface.launch(share=True, debug=True)