# RAG_NLP / app.py
# Commit ce31b4c (DenysPetro): fixed error with path to pdf
import nltk
import gradio as gr
from assistant import Assistant
from citation import Citation
from retrievers import KeyWordRetriever, SemanticRetriever, HybridRetriever
from text_processing import extract_text_from_pdf, clean_text
from embeddings import process_pdf_for_rag
from reranker import Reranker
# Fetch the NLTK tokenizer resources used by the text-processing pipeline.
# Each resource is requested exactly once (the original requested 'punkt'
# twice in a row, which only repeated the lookup and its console output).
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Load and preprocess PDF
data_path = "data"
pdf_path = data_path + "/sherlock.pdf"

# Chunked representation of the document, consumed by the retrievers below.
chunks = process_pdf_for_rag(pdf_path, chunk_size=500)

# Full cleaned text of the same document, used to build the citation helper.
pdf_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_text(pdf_text)
citation = Citation(cleaned_text)

# Initialize retrievers (built once at import time and shared by all requests)
keyword_retriever = KeyWordRetriever(chunks)
semantic_retriever = SemanticRetriever(chunks)
hybrid_retriever = HybridRetriever(keyword_retriever, semantic_retriever)

# Initialize assistant
# Default assistant wired to the hybrid retriever; the Gradio callback
# assembles its own Assistant per request based on the selected options.
reranker = Reranker()
assistant = Assistant(hybrid_retriever, reranker, citation=citation)
# Gradio UI
def run_rag_ui(api_key, query, retriever_type, top_k, use_reranker):
    """Gradio callback: answer ``query`` with the selected retrieval setup.

    Args:
        api_key: API key forwarded to ``Assistant.handle_query``.
        query: The user's question.
        retriever_type: One of "keyword", "semantic" or "hybrid"
            (case-insensitive). Anything else, including ``None``, yields
            an error message instead of a query.
        top_k: Number of chunks to place in the LLM context.
        use_reranker: Whether retrieved chunks should be reranked.

    Returns:
        A 3-tuple ``(response, citations, retrieved_chunks)`` matching the
        three output components of the interface. On an invalid retrieval
        method the first element is the error message and the other two are
        empty strings, so the number of returned values always matches the
        number of outputs.
    """
    # Guard against a missing selection before calling .lower().
    method = (retriever_type or "").lower()
    if method == "keyword":
        retriever = keyword_retriever
    elif method == "semantic":
        retriever = semantic_retriever
    elif method == "hybrid":
        retriever = hybrid_retriever
    else:
        # Three values are required here: the interface declares three
        # outputs, and a single string would not map onto them.
        return "Invalid retrieval method selected.", "", ""

    # Reuse the reranker created at module level instead of constructing a
    # new one for every request.
    active_reranker = reranker if use_reranker else None

    # The cleaned document text is computed once at import time; re-reading
    # and re-cleaning the PDF on every query is unnecessary. A fresh Citation
    # is still built per request, as in the original.
    request_citation = Citation(cleaned_text)

    request_assistant = Assistant(
        retriever, active_reranker, citation=request_citation
    )
    response, retrieved_chunks, citations = request_assistant.handle_query(
        query, api_key,
        retriever_type=retriever_type,
        top_k=top_k,
        use_reranker=use_reranker
    )
    # Order matches the outputs list: response, citations, retrieved chunks.
    return response, citations, retrieved_chunks
# Input widgets, listed in the positional order of run_rag_ui's parameters:
# api_key, query, retriever_type, top_k, use_reranker.
input_components = [
    gr.Textbox(label="API Key", placeholder="Enter your Groq API Key", type="password"),
    gr.Textbox(label="Query", placeholder="Enter your query", type="text"),
    gr.Radio(choices=["keyword", "semantic", "hybrid"], label="Retrieval Method", value="hybrid"),
    gr.Slider(minimum=1, maximum=10, step=1, value=5, label="Number of chunks in context"),
    gr.Radio(choices=[True, False], label="Use Reranker", value=False),
]

# Read-only output widgets, in the order run_rag_ui returns its values.
output_components = [
    gr.Textbox(label="LLM Response", interactive=False),
    gr.Textbox(label="Citations", interactive=False),
    gr.Textbox(label="Retrieved Chunks", interactive=False),
]

iface = gr.Interface(
    fn=run_rag_ui,
    inputs=input_components,
    outputs=output_components,
    title="RAG System with Gradio UI",
    description="Enter your query, select the retrieval method, and get retrieved chunks along with LLM responses.",
)

# Start the web app; share=True requests a public share link from Gradio.
iface.launch(share=True)