akazmi committed
Commit 4cb39e2 · verified · 1 Parent(s): e7d139e

Create app.py

Files changed (1)
  1. app.py +92 -0
app.py ADDED
@@ -0,0 +1,92 @@
+import gradio as gr
+from PyPDF2 import PdfReader
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+import torch
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+
+# Load sentence transformer for embeddings
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+
+# Load Zephyr model (use accelerate for GPU/CPU device map)
+model_name = "HuggingFaceH4/zephyr-7b-beta"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,
+    device_map="auto"  # ✅ Works if 'accelerate' is installed
+)
+rag_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
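+# Note: in float16 the 7B model's weights alone take roughly 14 GB
+# (2 bytes per parameter), so GPU-backed hardware is assumed here; on a
+# CPU-only Space, loading will be very slow or run out of memory.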
+
+# Extract text from a PDF file; returns None if the file cannot be read
+def read_pdf(file_path):
+    try:
+        with open(file_path, "rb") as file:
+            reader = PdfReader(file)
+            text = ""
+            for page in reader.pages:
+                # extract_text() returns None for pages with no text layer (e.g. scans)
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n"
+            return text
+    except Exception:
+        # Return None rather than an error string: a string would pass the
+        # caller's text check and be processed as document content.
+        return None
+
+# Split text into chunks of roughly `chunk_size` words
+def chunk_text(text, chunk_size=500):
+    words = text.split()
+    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
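+# Example: a 1,200-word document yields chunks of 500, 500, and 200 words.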
+
+# Return the top-k chunks most similar to the question, best match first
+def retrieve_relevant_chunks(question, chunks, top_k=3):
+    chunk_embeddings = embedder.encode(chunks)
+    question_embedding = embedder.encode([question])
+    scores = cosine_similarity(question_embedding, chunk_embeddings)[0]
+    top_indices = np.argsort(scores)[-top_k:][::-1]
+    return "\n\n".join([chunks[i] for i in top_indices])
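+# np.argsort sorts scores ascending, so [-top_k:] selects the k
+# highest-scoring chunks and [::-1] puts the best match first.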
+
+# Main function: retrieve relevant context from the PDF and generate an answer
+def answer_question(uploaded_file, user_question):
+    if uploaded_file is None:
+        return "❌ Please upload a PDF file."
+
+    file_path = uploaded_file.name
+    document_text = read_pdf(file_path)
+
+    if not document_text:
+        return "❌ Document is empty or could not be read."
+
+    chunks = chunk_text(document_text)
+    if not chunks:
+        return "❌ Document is too short to process."
+
+    relevant_context = retrieve_relevant_chunks(user_question, chunks)
+
+    prompt = f"""You are a helpful assistant. Use the context below to answer the user's question.
+
+Context:
+{relevant_context}
+
+Question: {user_question}
+Answer:"""
+
+    try:
+        result = rag_pipeline(prompt, max_new_tokens=300, do_sample=True, temperature=0.7)
+        # generated_text echoes the prompt, so keep only what follows "Answer:"
+        answer = result[0]["generated_text"].split("Answer:")[-1].strip()
+        return answer
+    except Exception as e:
+        return f"❌ Error generating answer: {str(e)}"
+
+# Gradio interface
+def create_interface():
+    with gr.Blocks() as demo:
+        gr.Markdown("## 📄 Ask Questions from a PDF Document (RAG using Zephyr 7B)")
+
+        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+        question_input = gr.Textbox(label="Enter your question")
+        answer_output = gr.Textbox(label="Answer", lines=10)
+
+        ask_button = gr.Button("Ask")
+        ask_button.click(fn=answer_question, inputs=[file_input, question_input], outputs=answer_output)
+
+    return demo
+
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()
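
For the Space to build, the imports above imply a dependency list along these lines (a minimal requirements.txt sketch; version pins are omitted, the repo's actual file may differ, and 'accelerate' is needed for device_map="auto"):

    gradio
    PyPDF2
    sentence-transformers
    transformers
    torch
    accelerate
    numpy
    scikit-learn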