|
import gradio as gr |
|
import os |
|
from groq import Groq |
|
from PyPDF2 import PdfReader |
|
import re |
|
from datasets import load_dataset |
|
|
|
|
|
def read_pdf_from_dataset(file_name):
    """Return the text of a PDF referenced by the legal-documents dataset.

    Args:
        file_name: Key used to index the ``"train"`` split of the
            ``akazmi/legal-documents`` dataset.

    Returns:
        The concatenated text of every page in the PDF. If anything goes
        wrong (dataset load, lookup, file access, PDF parsing) the function
        does not raise; it returns a string that starts with
        ``"Error reading PDF: "`` followed by the exception message.
    """
    try:
        dataset = load_dataset("akazmi/legal-documents")

        # NOTE(review): indexing a split with a string normally selects a
        # column rather than a row -- confirm that the dataset really exposes
        # entries keyed by file name and that each entry has a "file" path.
        document = dataset["train"][file_name]
        file_path = document["file"]

        with open(file_path, "rb") as file:
            reader = PdfReader(file)
            # Extraction must happen while the file is still open. A page
            # that yields no text (None or "") is treated as empty so the
            # join cannot fail on a non-string value.
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Deliberate best-effort boundary: callers receive an error string
        # instead of an exception.
        return f"Error reading PDF: {str(e)}"
|
|
|
|
|
def chunk_text(text, chunk_size=3000):
    """Split *text* into consecutive slices of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each chunk; must be a positive integer.

    Returns:
        A list of substrings that, joined together, reproduce *text*. The
        last chunk may be shorter than *chunk_size*. An empty *text* gives
        an empty list.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        # A zero step makes range() fail with an unrelated-looking message and
        # a negative step silently produces no chunks; reject both up front.
        raise ValueError("chunk_size must be a positive integer")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
|
|
|
|
|
def retrieve_relevant_document(user_question, document_text):
    """Return the chunk of *document_text* that best matches *user_question*.

    The document is split with ``chunk_text`` and each chunk is scored with
    ``similarity``; the highest-scoring chunk wins (the first one on ties,
    as ``max`` keeps the earliest maximum).

    Args:
        user_question: The question to match against the document.
        document_text: Full text of the document.

    Returns:
        The best-matching chunk, or an empty string when the document has
        no text to chunk.
    """
    text_chunks = chunk_text(document_text)

    # default="" keeps max() from raising ValueError on an empty chunk list.
    return max(
        text_chunks,
        key=lambda chunk: similarity(user_question, chunk),
        default="",
    )
|
|
|
|
|
def similarity(query, text):
    """Count the distinct words shared by *query* and *text*.

    Both strings are lower-cased and tokenized into runs of word characters,
    so punctuation attached to a word ("tax?", "due.") does not prevent a
    match.

    Args:
        query: The search string.
        text: The text to compare against.

    Returns:
        The number of distinct words that occur in both strings.
    """
    query_words = set(re.findall(r"\w+", query.lower()))
    text_words = set(re.findall(r"\w+", text.lower()))
    return len(query_words & text_words)
|
|
|
|
|
def initialize_groq():
    """Build a Groq client using the key in the ``GROQ_API_KEY`` env variable."""
    api_key = os.getenv("GROQ_API_KEY")
    return Groq(api_key=api_key)
|
|
|
|
|
def answer_question(selected_document, user_question):
    """Answer *user_question* using the most relevant part of a document.

    Args:
        selected_document: Name of the document to read, or ``None`` when
            nothing has been selected.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message describing why no
        answer could be produced. This function does not raise; every
        failure is reported through the returned string.
    """
    if selected_document is None:
        return "Please select a document before asking a question."

    # Reject blank questions before doing any expensive work.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    document_text = read_pdf_from_dataset(selected_document)

    if not document_text:
        return "Error: The document content is empty or could not be extracted."

    # The reader reports failures as a string; surface that message directly
    # instead of sending it to the model as if it were document content.
    if document_text.startswith("Error reading PDF:"):
        return document_text

    relevant_chunk = retrieve_relevant_document(user_question, document_text)

    query = f"{user_question} \n\n Relevant Document: {relevant_chunk}"

    try:
        # Client creation sits inside the try so that a configuration problem
        # (e.g. a missing API key) is reported like any other failure.
        client = initialize_groq()

        # NOTE(review): confirm this model id is still served by Groq.
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": query}],
            model="llama3-8b-8192",
        )

        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {str(e)}"
|
|
|
|
|
def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching.

    The layout is a heading, a document dropdown, a question box, a
    read-only answer box and an "Ask" button that calls ``answer_question``
    with the dropdown and question values.
    """
    available_documents = ["Income Tax Ordinance.pdf", "Companies Act 1984.pdf"]

    with gr.Blocks() as demo:
        gr.Markdown("### Ask questions based on the selected document")

        # Components are created in display order.
        doc_selector = gr.Dropdown(
            label="Select Document",
            choices=available_documents,
            value="Income Tax Ordinance.pdf",
        )
        question_box = gr.Textbox(
            label="Enter your question",
            placeholder="Ask something related to the selected document...",
        )
        answer_box = gr.Textbox(label="Answer", interactive=False)
        ask_button = gr.Button("Ask")

        # Clicking the button sends both inputs to answer_question and shows
        # the returned string in the answer box.
        ask_button.click(
            fn=answer_question,
            inputs=[doc_selector, question_box],
            outputs=answer_box,
        )

    return demo
|
|
|
|
|
# Build the UI and start the Gradio server only when run as a script.
if __name__ == "__main__":
    create_interface().launch()
|
|