"""Gradio app that splits an uploaded PDF into text chunks and stores them in ChromaDB."""

from uuid import uuid4

import chromadb
import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter shared by every upload: chunks of up to 800 characters with a
# 50-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=50,
)

client = chromadb.PersistentClient("test")
# The collection survives between runs of a persistent client, so a plain
# create_collection() would fail on the second start; reuse it if it exists.
collection = client.get_or_create_collection("test_data")


def upload_pdf(file_path) -> str:
    """Split a PDF into text chunks and store them in the ChromaDB collection.

    Each page returned by the loader is split with ``text_splitter``; every
    resulting chunk is stored as its own document, tagged with the metadata
    of the page it came from and a freshly generated UUID.

    Args:
        file_path: Location of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        A status message. On success it reports ``collection.count()``, i.e.
        the total number of entries now in the collection (not only the
        chunks added by this call).
    """
    pages = PyPDFLoader(file_path).load()

    texts = []
    metadatas = []
    for page in pages:
        for chunk in text_splitter.split_text(page.page_content):
            # Store the individual chunk, not the whole per-page list:
            # previously every entry ended up holding the page's first chunk.
            texts.append(chunk)
            metadatas.append(page.metadata)

    if not texts:
        # Nothing to add (e.g. a PDF without extractable text); avoid calling
        # collection.add() with empty lists.
        return "No text could be extracted from the PDF. Nothing was stored in ChromaDB."

    collection.add(
        ids=[str(uuid4()) for _ in texts],
        documents=texts,
        metadatas=metadatas,
    )

    return f"PDF Uploaded Successfully. {collection.count()} chunks stored in ChromaDB"


# Define the Gradio interface
iface = gr.Interface(
    fn=upload_pdf,
    inputs=["file"],  # A single file-upload component
    outputs="textbox",  # Show the returned status message in a textbox
    title="Upload PDF to ChromaDB",
    description="Upload a PDF file and store its text chunks in ChromaDB.",
)

# Launch the Gradio app
iface.launch(debug=True, share=True)