"""RAG academic assistant: upload a PDF, index it in ChromaDB, ask questions.

The text of an uploaded paper is split into fixed-size chunks, embedded with
a sentence-transformers model and stored in a ChromaDB collection.  Questions
are answered by retrieving the closest chunks and sending them, together with
the question, to a Groq-hosted chat model through the OpenAI-compatible API.
"""

import os
import uuid

import chromadb
import fitz  # PyMuPDF
import gradio as gr
import openai
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer

# Number of characters per stored chunk.
CHUNK_SIZE = 500
# Maximum number of chunks retrieved as context for one question.
TOP_K = 3
# Chat model requested from the Groq endpoint.
MODEL_NAME = "llama3-70b-8192"

# Load GROQ API Key
# NOTE(review): `openai.api_base` / `openai.ChatCompletion` are the pre-1.0
# openai-python interface; this file is kept on that interface.
openai.api_key = os.getenv("GROQ_API_KEY")
openai.api_base = "https://api.groq.com/openai/v1"

# Load embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Set up ChromaDB with persistence
persist_path = "./chroma_db"


def _create_chroma_client(path):
    """Return a ChromaDB client that stores its data under *path*.

    Newer chromadb releases expose ``PersistentClient``; when it is missing
    the legacy settings-based client is configured with the on-disk
    ``duckdb+parquet`` backend instead.
    """
    if hasattr(chromadb, "PersistentClient"):
        return chromadb.PersistentClient(path=path)
    return chromadb.Client(
        chromadb.config.Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=path,
        )
    )


db = _create_chroma_client(persist_path)
collection = db.get_or_create_collection("papers")


# Extract text from uploaded PDF
def extract_text_from_pdf(file) -> str:
    """Return the concatenated text of every page of a PDF.

    Args:
        file: The PDF content, either as raw ``bytes``/``bytearray`` (what
            ``gr.File(type="binary")`` hands to the callback) or as a
            file-like object exposing ``read()``.

    Returns:
        The text of all pages joined in page order; empty if the PDF has no
        text layer.
    """
    data = file if isinstance(file, (bytes, bytearray)) else file.read()
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=data, filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)


# Chunk and store in vector DB
def chunk_and_store(text) -> int:
    """Split *text* into fixed-size chunks, embed them and add them to the DB.

    Args:
        text: Plain text to index.

    Returns:
        The number of chunks that were stored (0 for empty text).
    """
    chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
    if not chunks:
        return 0

    embeddings = embedder.encode(chunks).tolist()
    # Random ids cannot collide with chunks stored by earlier uploads, unlike
    # ids derived from the current collection size.
    ids = [f"id_{uuid.uuid4().hex}" for _ in chunks]
    # One batched insert instead of one round-trip per chunk.
    collection.add(documents=chunks, ids=ids, embeddings=embeddings)

    # Only legacy clients need (and have) an explicit flush to disk.
    if hasattr(db, "persist"):
        db.persist()
    return len(chunks)


# Retrieve relevant chunks and send to LLaMA3 via Groq
def retrieve_and_ask(query) -> str:
    """Answer *query* using the most relevant stored chunks as context.

    Args:
        query: The user's question.

    Returns:
        The model's answer, or a human-readable message when nothing has been
        indexed yet, the question is blank, or the API call fails.
    """
    stored = collection.count()
    if stored == 0:
        return "Please upload a paper first."
    if not query or not query.strip():
        return "Please enter a question."

    query_embedding = embedder.encode([query]).tolist()[0]
    # Never ask for more neighbours than there are stored chunks.
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=min(TOP_K, stored),
    )
    context = "\n".join(results["documents"][0])

    system_prompt = "You are an academic assistant helping students understand research papers."
    user_prompt = f"Based on the following context:\n{context}\n\nAnswer the question:\n{query}"

    # UI boundary: any failure of the remote call is reported to the user as
    # text rather than raised into Gradio.
    try:
        response = openai.ChatCompletion.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        return f"Error: {e}"


# Gradio UI
def handle_upload(file) -> str:
    """Index the uploaded PDF and return a status message for the UI."""
    if file is None:
        return "Upload a valid PDF file."

    text = extract_text_from_pdf(file)
    if not text.strip():
        # e.g. a scanned PDF without a text layer: nothing useful to index.
        return "No extractable text found in this PDF."

    chunk_and_store(text)
    return "✅ Paper uploaded and processed."


def handle_query(query) -> str:
    """Gradio callback: answer the question typed by the user."""
    return retrieve_and_ask(query)


with gr.Blocks() as demo:
    gr.Markdown("### 📘 RAG Academic Assistant\nUpload a paper and ask questions.")

    with gr.Row():
        file = gr.File(label="Upload PDF", type="binary")
        upload_btn = gr.Button("Process")
        upload_output = gr.Textbox()

    with gr.Row():
        query = gr.Textbox(label="Ask a question")
        response = gr.Textbox(label="Answer")
        ask_btn = gr.Button("Ask")

    upload_btn.click(handle_upload, inputs=[file], outputs=[upload_output])
    ask_btn.click(handle_query, inputs=[query], outputs=[response])


if __name__ == "__main__":
    demo.launch()