Legal2 / app.py
akazmi's picture
Update app.py
983c9b5 verified
import gradio as gr
import os
from groq import Groq
from PyPDF2 import PdfReader
import re
from datasets import load_dataset
# Function to read the uploaded PDFs and return the text
def read_pdf_from_dataset(file_name):
    """Return the extracted text of one PDF from the legal-documents dataset.

    Loads the ``akazmi/legal-documents`` dataset, looks ``file_name`` up in
    its ``train`` split, opens the path stored under the entry's ``"file"``
    key and concatenates the text extracted from every page.

    Args:
        file_name: Key used to index the ``train`` split.

    Returns:
        The concatenated page text on success. On any failure no exception
        propagates; a string starting with ``"Error reading PDF: "`` is
        returned instead, so callers must check for that prefix.
    """
    try:
        dataset = load_dataset("akazmi/legal-documents")
        # NOTE(review): indexing a split with a string normally selects a
        # column rather than a single record -- confirm this lookup matches
        # the dataset's actual schema.
        document = dataset["train"][file_name]
        file_path = document["file"]
        with open(file_path, "rb") as file:
            reader = PdfReader(file)
            # A page without extractable text must not abort the whole
            # document, so a missing result is treated as an empty string.
            # join() also avoids repeated string concatenation.
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Best-effort contract: report the failure as text for the UI.
        return f"Error reading PDF: {str(e)}"
# Function to chunk large text for Groq model to avoid token limits
def chunk_text(text, chunk_size=3000):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per slice.

    Returns:
        A list of slices in original order; the last one may be shorter.
        An empty ``text`` produces an empty list.
    """
    offsets = range(0, len(text), chunk_size)
    return [text[start:start + chunk_size] for start in offsets]
# Function to perform document retrieval (find the relevant chunks)
def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of ``document_text`` that best matches the question.

    The document is split with ``chunk_text`` and every chunk is scored
    against ``user_question`` with ``similarity``; the highest-scoring chunk
    wins, and on a tie the earliest such chunk is returned.

    Args:
        user_question: The question to match against.
        document_text: Full text of the document to search.

    Returns:
        The best-matching chunk, or ``""`` when ``document_text`` is empty.
    """
    # An empty document yields no chunks, and max() over an empty sequence
    # raises ValueError, so return early instead.
    if not document_text:
        return ""
    text_chunks = chunk_text(document_text)
    return max(text_chunks, key=lambda chunk: similarity(user_question, chunk))
# A simple similarity function (you can use a more advanced one, e.g., cosine similarity with embeddings)
def similarity(query, text):
    """Count the distinct lowercase words that ``query`` and ``text`` share.

    Both strings are lowercased and split on whitespace; punctuation stays
    attached to its word, so ``"tax?"`` and ``"tax"`` do not match.

    Args:
        query: The question or search phrase.
        text: The text to compare against.

    Returns:
        The number of distinct words present in both inputs.
    """
    shared_words = set(query.lower().split()) & set(text.lower().split())
    return len(shared_words)
# Initialize Groq client
def initialize_groq():
    """Create a Groq client using the ``GROQ_API_KEY`` environment variable.

    Returns:
        A new ``Groq`` client. When the variable is unset, ``None`` is
        passed as the API key.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    return Groq(api_key=api_key)
# Function to handle document selection and answer generation using RAG
def answer_question(selected_document, user_question):
    """Answer ``user_question`` from the selected document via the Groq model.

    Reads the document text, picks the chunk most relevant to the question
    and sends the question plus that chunk to the chat model.

    Args:
        selected_document: Identifier of the document to read.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message when input is
        missing, the document cannot be read, or the model call fails.
        No exception is raised for those cases.
    """
    # Validate both inputs before doing any expensive work.
    if not selected_document:
        return "Please select a document before asking a question."
    if not user_question or not user_question.strip():
        return "Please enter a question."

    document_text = read_pdf_from_dataset(selected_document)
    if not document_text or not document_text.strip():
        return "Error: The document content is empty or could not be extracted."
    # read_pdf_from_dataset reports failures as text with this prefix. It
    # must be surfaced to the user rather than sent to the model as if it
    # were the document's content.
    if document_text.startswith("Error reading PDF:"):
        return document_text

    relevant_chunk = retrieve_relevant_document(user_question, document_text)
    query = f"{user_question} \n\n Relevant Document: {relevant_chunk}"

    try:
        # Client construction is inside the try so that a configuration
        # problem is reported the same way as a failed request.
        client = initialize_groq()
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": query}],
            model="llama3-8b-8192",  # Use your chosen model
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {str(e)}"
# Create Gradio Interface
def create_interface():
    """Build the Gradio Blocks UI for asking questions about a document.

    The layout is a heading, a document dropdown, a question box, a
    read-only answer box and an "Ask" button wired to ``answer_question``.

    Returns:
        The assembled ``gr.Blocks`` application, not yet launched.
    """
    available_documents = ["Income Tax Ordinance.pdf", "Companies Act 1984.pdf"]

    with gr.Blocks() as demo:
        gr.Markdown("### Ask questions based on the selected document")

        # The first listed document is preselected.
        doc_selector = gr.Dropdown(
            choices=available_documents,
            value=available_documents[0],
            label="Select Document",
        )
        question_box = gr.Textbox(
            placeholder="Ask something related to the selected document...",
            label="Enter your question",
        )
        answer_box = gr.Textbox(interactive=False, label="Answer")

        # Clicking the button sends (document, question) to answer_question
        # and shows the returned text in the answer box.
        ask_button = gr.Button("Ask")
        ask_button.click(
            fn=answer_question,
            inputs=[doc_selector, question_box],
            outputs=answer_box,
        )

    return demo
# Run the interface
if __name__ == "__main__":
    # Only build and serve the UI when this file is executed as a script,
    # not when it is imported as a module.
    demo = create_interface()
    # Start the Gradio app with its default launch settings.
    demo.launch()