Spaces:

akazmi
/

Legal2

Sleeping

App Files Files Community

Legal2 / app.py

akazmi

Update app.py

983c9b5 verified 8 months ago

raw

history blame contribute delete

4.25 kB

	import gradio as gr
	import os
	from groq import Groq
	from PyPDF2 import PdfReader
	import re
	from datasets import load_dataset

	# Function to read the uploaded PDFs and return the text
	def read_pdf_from_dataset(file_name):
	    """Return the text of a PDF referenced by the ``akazmi/legal-documents`` dataset.

	    Args:
	        file_name: Key used to index the dataset's ``train`` split.

	    Returns:
	        The concatenated text of every page of the PDF. If anything fails
	        (dataset load, lookup, file access, parsing) the exception is not
	        raised; instead a string starting with ``"Error reading PDF:"`` is
	        returned.
	    """
	    try:
	        # Load the dataset containing the PDF files
	        dataset = load_dataset("akazmi/legal-documents")

	        # NOTE(review): indexing a split with a string usually selects a
	        # column rather than a row -- confirm that the dataset really exposes
	        # each document under its file name and that "file" holds a local path.
	        document = dataset["train"][file_name]
	        file_path = document["file"]

	        # Read the PDF file content
	        with open(file_path, "rb") as file:
	            reader = PdfReader(file)
	            # Join once instead of growing a string page by page; a page that
	            # yields no text (falsy result) contributes an empty string.
	            return "".join(page.extract_text() or "" for page in reader.pages)
	    except Exception as e:
	        return f"Error reading PDF: {str(e)}"

	# Function to chunk large text for Groq model to avoid token limits
	def chunk_text(text, chunk_size=3000):
	    """Split ``text`` into consecutive slices of at most ``chunk_size`` characters.

	    The slices do not overlap and appear in document order; the last one may
	    be shorter than ``chunk_size``. An empty ``text`` produces an empty list.
	    """
	    starts = range(0, len(text), chunk_size)
	    return [text[start:start + chunk_size] for start in starts]

	# Function to perform document retrieval (find the relevant chunks)
	def retrieve_relevant_document(user_question, document_text):
	    """Return the chunk of ``document_text`` most relevant to ``user_question``.

	    The document is split with ``chunk_text`` and every chunk is scored with
	    ``similarity`` against the question; the highest-scoring chunk wins (the
	    earliest one on a tie).

	    Args:
	        user_question: The question to match against.
	        document_text: Full text of the document to search.

	    Returns:
	        The best-matching chunk, or an empty string when ``document_text``
	        produces no chunks.
	    """
	    text_chunks = chunk_text(document_text)

	    # ``default`` keeps an empty document from raising ValueError in max().
	    return max(
	        text_chunks,
	        key=lambda chunk: similarity(user_question, chunk),
	        default="",
	    )

	# A simple similarity function (you can use a more advanced one, e.g., cosine similarity with embeddings)
	def similarity(query, text):
	    """Count the distinct words that ``query`` and ``text`` have in common.

	    Both inputs are lower-cased and reduced to runs of word characters, so
	    punctuation attached to a word (for example a trailing question mark or
	    full stop) does not prevent it from matching.

	    Args:
	        query: The user's question.
	        text: The candidate passage.

	    Returns:
	        The number of distinct shared words, as an ``int``.
	    """
	    query_words = set(re.findall(r"\w+", query.lower()))
	    text_words = set(re.findall(r"\w+", text.lower()))
	    return len(query_words & text_words)

	# Initialize Groq client
	def initialize_groq():
	    """Create a Groq client using the ``GROQ_API_KEY`` environment variable."""
	    api_key = os.environ.get("GROQ_API_KEY")
	    return Groq(api_key=api_key)

	# Function to handle document selection and answer generation using RAG
	def answer_question(selected_document, user_question):
	    """Answer ``user_question`` from the selected document using retrieval plus Groq.

	    The document text is loaded, the chunk most relevant to the question is
	    selected, and that chunk is sent together with the question to the chat
	    model.

	    Args:
	        selected_document: Document identifier chosen in the UI, or ``None``.
	        user_question: The question typed by the user.

	    Returns:
	        Always a string: the model's answer, or a human-readable message when
	        no document or question was supplied, the document could not be read,
	        or the model call failed.
	    """
	    # Check if document is selected
	    if selected_document is None:
	        return "Please select a document before asking a question."

	    # A blank question gives retrieval nothing to match and wastes a model call.
	    if not user_question or not user_question.strip():
	        return "Please enter a question."

	    # Read the content from the selected document
	    document_text = read_pdf_from_dataset(selected_document)

	    # If document text is empty, return an error message
	    if not document_text:
	        return "Error: The document content is empty or could not be extracted."

	    # read_pdf_from_dataset reports failures as a string with this prefix;
	    # surface it to the user instead of feeding it to the model as content.
	    if document_text.startswith("Error reading PDF:"):
	        return document_text

	    # Perform document retrieval: get the most relevant chunk
	    relevant_chunk = retrieve_relevant_document(user_question, document_text)

	    # Prepare the query for the model, including the relevant chunk of text
	    query = f"{user_question} \n\n Relevant Document: {relevant_chunk}"

	    try:
	        # Client creation lives inside the try so that a construction failure
	        # (presumably e.g. a missing GROQ_API_KEY) is reported like any other
	        # generation error rather than escaping the handler.
	        client = initialize_groq()

	        # Generate the answer from the Groq model
	        chat_completion = client.chat.completions.create(
	            messages=[{"role": "user", "content": query}],
	            model="llama3-8b-8192",  # Use your chosen model
	        )
	        # Return the model's response
	        return chat_completion.choices[0].message.content
	    except Exception as e:
	        return f"Error generating answer: {str(e)}"

	# Create Gradio Interface
	def create_interface():
	    """Build and return the Gradio Blocks UI (the caller is responsible for launching it).

	    The layout is a heading, a document dropdown, a question box, a read-only
	    answer box and an "Ask" button that routes the dropdown and question
	    values through ``answer_question`` into the answer box.
	    """
	    document_choices = ["Income Tax Ordinance.pdf", "Companies Act 1984.pdf"]

	    with gr.Blocks() as demo:
	        gr.Markdown("### Ask questions based on the selected document")

	        # Document picker; the first entry is pre-selected.
	        selector = gr.Dropdown(
	            label="Select Document",
	            choices=document_choices,
	            value=document_choices[0],
	        )

	        # Free-text question from the user.
	        question_box = gr.Textbox(
	            label="Enter your question",
	            placeholder="Ask something related to the selected document...",
	        )

	        # Read-only area that receives the answer.
	        answer_box = gr.Textbox(label="Answer", interactive=False)

	        # Clicking the button runs the question-answering handler.
	        ask_button = gr.Button("Ask")
	        ask_button.click(
	            fn=answer_question,
	            inputs=[selector, question_box],
	            outputs=answer_box,
	        )

	    return demo

	# Run the interface
	if __name__ == "__main__":
	    # Build the UI and start serving it only when executed as a script.
	    demo = create_interface()
	    demo.launch()