# Hugging Face Spaces app. Earlier builds logged "Runtime error" at startup.
| import os | |
| import faiss | |
| import numpy as np | |
| import gradio as gr | |
| from pypdf import PdfReader | |
| from sentence_transformers import SentenceTransformer | |
| from groq import Groq | |
# -----------------------------
# Initialize Models
# -----------------------------
# Sentence-transformer used to embed both the document chunks and the queries;
# chunk vectors and query vectors must come from the same model for FAISS
# distances to be meaningful.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = Groq(
    # NOTE(review): "Tgb" is an unusual secret name — the conventional one is
    # GROQ_API_KEY. If this env var is unset, Groq() gets api_key=None and the
    # app fails at startup (a plausible cause of the Spaces "Runtime error").
    # Confirm the secret exists under this exact name in the deployment.
    api_key=os.environ.get("Tgb"),
)
# -----------------------------
# Global Variables
# -----------------------------
# index: FAISS IndexFlatL2 built by create_index(); None until a PDF is processed.
index = None
# documents: text chunks aligned 1:1 with the vectors stored in `index`.
documents = []
# -----------------------------
# PDF Processing
# -----------------------------
def read_pdf(file):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        file: A path or file-like object accepted by ``pypdf.PdfReader``.

    Returns:
        str: All extracted page text joined together; empty string if no
        page yields any text.
    """
    reader = PdfReader(file)
    pages = []
    for page in reader.pages:
        # Call extract_text() once per page — the original called it twice
        # (once to test, once to append), doing the expensive parse twice.
        page_text = page.extract_text()
        if page_text:
            pages.append(page_text)
    # join() instead of repeated += avoids quadratic string building.
    return "".join(pages)
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping fixed-size character chunks.

    Args:
        text: Source string to split.
        chunk_size: Maximum characters per chunk.
        overlap: Characters shared between consecutive chunks, so a
            sentence cut at a boundary still appears whole in one chunk.

    Returns:
        list[str]: Chunks in document order; ``[]`` for empty text.

    Raises:
        ValueError: If ``overlap >= chunk_size`` — the original loop would
            never advance (or walk backwards) in that case.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    step = chunk_size - overlap
    # Equivalent to the original while-loop: window starts at 0, step apart.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
# -----------------------------
# Create FAISS Index
# -----------------------------
def create_index(chunks):
    """Embed *chunks* and rebuild the module-level FAISS L2 index over them.

    Side effects: replaces the globals ``index`` and ``documents`` so that
    ``documents[i]`` is the text behind vector ``i`` in ``index``.
    """
    global index, documents
    documents = chunks
    vectors = np.array(embedder.encode(chunks))
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
# -----------------------------
# Retrieval with Relevance Check
# -----------------------------
def retrieve(query, k=3, threshold=1.2):
    """Return the chunks nearest to *query* plus a coarse confidence label.

    Args:
        query: Natural-language question to embed and search for.
        k: Number of nearest neighbours requested from FAISS.
        threshold: Maximum L2 distance for a hit to count as relevant.

    Returns:
        tuple[list[str], str | None]: Relevant chunks (possibly empty) and
        a confidence label ("High"/"Medium"/"Low"), or ``([], None)`` when
        no index exists or nothing passes the threshold.
    """
    if index is None:
        return [], None
    query_embedding = embedder.encode([query])
    distances, indices = index.search(np.array(query_embedding), k)
    relevant_chunks = []
    valid_distances = []
    for i, dist in zip(indices[0], distances[0]):
        # FAISS pads results with index -1 when the index holds fewer than
        # k vectors; without this guard documents[-1] would silently return
        # the LAST chunk instead of skipping the missing hit.
        if i != -1 and dist < threshold:
            relevant_chunks.append(documents[i])
            valid_distances.append(float(dist))
    # Confidence score (lower L2 distance = better match).
    confidence = None
    if valid_distances:
        avg_dist = np.mean(valid_distances)
        if avg_dist < 0.5:
            confidence = "High"
        elif avg_dist < 1.0:
            confidence = "Medium"
        else:
            confidence = "Low"
    return relevant_chunks, confidence
# -----------------------------
# Ask Groq LLM
# -----------------------------
def ask_groq(context_chunks, question):
    """Answer *question* with the Groq LLM, grounded in *context_chunks*.

    The prompt instructs the model to distinguish three cases: answer
    present in context, answer only related to context, and context
    irrelevant. Returns the model's reply text.
    """
    context = "\n".join(context_chunks)
    prompt = f"""
You are an intelligent assistant.
Rules:
1. If the answer is clearly present in the context, answer normally.
2. If the answer is NOT directly present but somewhat related, say:
"This is not explicitly mentioned in the document, but based on related context..."
then give a helpful answer.
3. If the context is completely irrelevant, say:
"The document does not contain information related to this question."
Context:
{context}
Question:
{question}
"""
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
# -----------------------------
# Main Pipeline
# -----------------------------
def process_pdf(file):
    """Ingest an uploaded PDF: extract text, chunk it, build the index.

    Returns a human-readable status string for the Gradio status box.
    """
    # Guard clauses: nothing uploaded, or PDF had no extractable text.
    if file is None:
        return "Please upload a PDF first."
    extracted = read_pdf(file)
    if not extracted.strip():
        return "Could not extract text from PDF."
    pieces = chunk_text(extracted)
    create_index(pieces)
    return f"PDF processed successfully! Total chunks: {len(pieces)}"
def answer_question(question):
    """Answer a user question against the currently indexed PDF.

    Retrieves relevant chunks, asks the Groq LLM, and prefixes the reply
    with the retrieval confidence label when one is available.
    """
    if index is None:
        return "Please upload and process a PDF first."
    chunks, confidence = retrieve(question)
    if not chunks:
        return "The document does not contain information related to this question."
    reply = ask_groq(chunks, question)
    return f"(Confidence: {confidence})\n\n{reply}" if confidence else reply
# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 📄 RAG PDF Q&A App (Groq + FAISS)")
    file_input = gr.File(label="Upload PDF")
    upload_btn = gr.Button("Process PDF")
    status = gr.Textbox(label="Status")
    question = gr.Textbox(label="Ask a question")
    answer = gr.Textbox(label="Answer")
    # Button click runs the ingestion pipeline and reports to the status box.
    upload_btn.click(process_pdf, inputs=file_input, outputs=status)
    # Pressing Enter in the question box triggers retrieval + generation.
    question.submit(answer_question, inputs=question, outputs=answer)
# -----------------------------
# Run App
# -----------------------------
if __name__ == "__main__":
    demo.launch()