"""PDF Assist: a Gradio app for PDF question answering and summarization.

Two tabs are exposed:

* Question answering - the uploaded PDF is split into overlapping character
  chunks, embedded, stored in a FAISS index and queried with an extractive
  QA pipeline.
* Summarization - the first pages of a PDF are summarized in English and
  optionally translated sentence by sentence into Hindi.

Documents detected as Devanagari are translated to English before either
step runs.
"""

import gc
import os
import re

import torch
import gradio as gr
import numpy as np
import faiss
import nltk
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline,
)
from sentence_transformers import SentenceTransformer

nltk.download("punkt_tab")
load_dotenv()

device = "cuda" if torch.cuda.is_available() else "cpu"

# Embeddings & QA
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Translation models:
# English -> Hindi (fine-tuned Marian model; used for summary -> Hindi)
en_hi_model_name = "saved_model_nlp"
translator_en_hi_model = MarianMTModel.from_pretrained(en_hi_model_name).to(device)
translator_en_hi_tokenizer = MarianTokenizer.from_pretrained(en_hi_model_name)

# Hindi -> English (Helsinki model to convert input Hindi PDF to English)
hi_en_model_name = "Helsinki-NLP/opus-mt-hi-en"
translator_hi_en_model = MarianMTModel.from_pretrained(hi_en_model_name).to(device)
translator_hi_en_tokenizer = MarianTokenizer.from_pretrained(hi_en_model_name)

# Summarizer. The variables keep their historical "bart_" prefix, but the
# checkpoint name refers to an LED model.
bart_model_name = "pszemraj/led-large-book-summary"
bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_model_name).to(device)

# State of the most recently processed QA document.
pdf_text = ""
text_chunks = []
index = None


def _resolve_pdf_path(file):
    """Return a value ``PdfReader`` can open for an uploaded file.

    Path-like values are returned unchanged. Any other object is replaced
    by its ``name`` attribute when it has one, otherwise it is returned
    as is.
    """
    # NOTE(review): gr.File appears to pass either a path string or a
    # temp-file wrapper depending on the Gradio version/`type` setting -
    # confirm against the installed Gradio.
    if isinstance(file, (str, os.PathLike)):
        return file
    return getattr(file, "name", file)


def _read_pdf_text(file_path, max_pages=None):
    """Concatenate the extractable text of a PDF, one page per line.

    Args:
        file_path: Path or stream accepted by ``PdfReader``.
        max_pages: If given, only the first ``max_pages`` pages are read.

    Returns:
        The page texts joined together, each followed by a newline. Pages
        that yield no text are skipped.
    """
    reader = PdfReader(file_path)
    pages = reader.pages if max_pages is None else reader.pages[:max_pages]
    parts = []
    for page in pages:
        page_text = page.extract_text()
        if page_text:
            parts.append(page_text + "\n")
    return "".join(parts)


def _to_english(text):
    """Return ``text`` in English, translating it first if it is Devanagari."""
    if not is_devanagari(text):
        return text
    hindi_sentences = sentence_tokenize_hindi(text)
    english_sentences = batch_translate_hi_to_en(hindi_sentences)
    return " ".join(english_sentences)


# QA
def extract_text_from_pdf(file_path):
    """Extract all text from a PDF and return it as English text.

    Text detected as Devanagari is split into sentences and translated to
    English; other text is returned unchanged.
    """
    return _to_english(_read_pdf_text(file_path))


def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into overlapping character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunks in document order. Empty input gives an empty list.

    Raises:
        ValueError: If the window would not advance through the text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        # A non-positive step would never reach the end of the text.
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk already reaches the end; a further window would
            # only repeat text it fully contains.
            break
    return chunks


def build_faiss_index(chunks, embedder):
    """Embed ``chunks`` and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: List of strings to embed.
        embedder: Object whose ``encode(chunks)`` returns a 2-D array of
            embeddings.

    Returns:
        A ``(index, embeddings)`` tuple, where ``embeddings`` is the float32
        array that was added to the index.
    """
    embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)
    faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
    faiss_index.add(embeddings)
    return faiss_index, embeddings


def is_devanagari(text: str, threshold: float = 0.02) -> bool:
    """Report whether ``text`` should be treated as Hindi/Devanagari.

    Returns True when the fraction of characters in the range
    U+0900-U+097F is strictly greater than ``threshold``. Empty text is
    never Devanagari.
    """
    if not text:
        return False
    devanagari_count = len(re.findall(r"[\u0900-\u097F]", text))
    return (devanagari_count / max(1, len(text))) > threshold


def sentence_tokenize_english(text: str):
    """Split English text into sentences with NLTK."""
    return nltk.sent_tokenize(text)


def sentence_tokenize_hindi(text: str):
    """Split Hindi text into sentences.

    A sentence ends at a danda, full stop, question mark or exclamation
    mark that is followed by whitespace. The terminator is dropped and
    empty fragments are discarded.
    """
    parts = re.split(r"[।\.\?\!]\s+", text)
    return [part.strip() for part in parts if part.strip()]


def _batch_translate(sentences, tokenizer, model, batch_size):
    """Translate ``sentences`` in batches with the given tokenizer and model.

    Inputs are truncated to 512 tokens and generation is capped at 512
    tokens. The output list has the same order as ``sentences``.
    """
    translated = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        encoded = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        with torch.no_grad():
            generated = model.generate(**encoded, max_length=512)
        translated.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
    return translated


def batch_translate_hi_to_en(sentences, batch_size=16):
    """
    Translate a list of Hindi sentences -> English using Helsinki model in batches.
    Returns list of translated strings in same order.
    """
    return _batch_translate(
        sentences, translator_hi_en_tokenizer, translator_hi_en_model, batch_size
    )


def batch_translate_en_to_hi(sentences, batch_size=16):
    """
    Translate a list of English sentences -> Hindi using the saved_model_nlp
    (Marian) model in batches. Returns list of translated strings in same order.
    """
    return _batch_translate(
        sentences, translator_en_hi_tokenizer, translator_en_hi_model, batch_size
    )


# Upload + Process PDF(QA)
def upload_pdf(file):
    """Process an uploaded PDF and make it the active QA document.

    Updates the module-level ``pdf_text``, ``text_chunks`` and ``index``.

    Returns:
        A status message for the UI.
    """
    global pdf_text, text_chunks, index

    if file is None:
        return "❌ Please upload a PDF first."

    # Work on locals first so a failure during extraction or embedding does
    # not leave the globals describing two different documents.
    new_text = extract_text_from_pdf(_resolve_pdf_path(file))
    new_chunks = chunk_text(new_text) if new_text.strip() else []

    if not new_chunks:
        # Clear the previous index too; otherwise later questions would
        # search the old index and look results up in an empty chunk list.
        pdf_text, text_chunks, index = new_text, [], None
        return "❌ Empty PDF or could not extract text."

    new_index, _ = build_faiss_index(new_chunks, embedder)
    pdf_text, text_chunks, index = new_text, new_chunks, new_index
    return "✅ PDF uploaded and processed successfully! Ready for questions."


# Answer Questions
def get_answer(question):
    """Answer ``question`` from the most recently processed PDF.

    Up to three of the nearest chunks are retrieved from the FAISS index,
    joined into one context and passed to the QA pipeline.

    Returns:
        A Markdown string with the answer, its confidence score and the
        first 500 characters of the context, or an error message.
    """
    if index is None:
        return "❌ Please upload a PDF first."
    if not question or not question.strip():
        return "❌ Please enter a question."

    q_emb = np.asarray(embedder.encode([question]), dtype=np.float32)
    # Never ask for more neighbours than there are chunks: FAISS pads
    # missing results with -1, which would index the last chunk.
    top_k = min(3, len(text_chunks))
    _, neighbours = index.search(q_emb, k=top_k)
    relevant_text = " ".join(text_chunks[i] for i in neighbours[0] if i >= 0)

    result = qa_pipeline(question=question, context=relevant_text)
    answer = result.get("answer", "")
    confidence = round(result.get("score", 0.0), 3)
    return (
        f"**Answer:** {answer}\n\n"
        f"**Confidence:** {confidence}\n\n"
        f"**Context Extract:**\n{relevant_text[:500]}..."
    )


# Summarization (English)
def bart_summarize(text):
    """Summarize English ``text`` with the loaded seq2seq summarizer.

    The input is truncated to 4096 tokens. The summary is generated with
    beam search (4 beams) and is between 80 and 2000 tokens long.
    """
    inputs = bart_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=4096,
    ).to(device)
    with torch.no_grad():
        summary_ids = bart_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=2000,
            min_length=80,
            num_beams=4,
            length_penalty=2.0,
        )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def summarize_pdf_with_options(pdf_file, output_lang="english"):
    """Summarize the first 10 pages of a PDF in English or Hindi.

    Args:
        pdf_file: Uploaded file, either a path or an object with ``name``.
        output_lang: A value starting with "eng" (case-insensitive) selects
            English output; anything else selects Hindi.

    Returns:
        The summary with one sentence per line, or an error message. Any
        exception raised while processing is reported in the returned
        string rather than propagated, because this is a UI callback.
    """
    if pdf_file is None:
        return "❌ Please upload a PDF first."

    try:
        text = _read_pdf_text(_resolve_pdf_path(pdf_file), max_pages=10)
        if not text.strip():
            return "❌ Could not extract text from the PDF."

        # Hindi documents are translated to English before summarizing.
        english_source_text = _to_english(text)

        # Only the first 5000 characters are summarized.
        english_summary = bart_summarize(english_source_text[:5000])
        sentences = sentence_tokenize_english(english_summary)

        if not (output_lang or "english").lower().startswith("eng"):
            # Hindi output: translate the summary sentence by sentence.
            sentences = batch_translate_en_to_hi(sentences)

        return "\n".join(s.strip() for s in sentences if s.strip())
    except Exception as e:
        return f"⚠️ Error processing PDF: {e}"


# UI
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Assist (QA + BART Summarizer — English/Hindi)")

    # PDF Question Answering
    with gr.Tab("🤖 PDF Question Answering"):
        gr.Markdown("Ask questions about your uploaded PDF document.")
        pdf_file = gr.File(label="📄 Upload PDF")
        upload_btn = gr.Button("Process PDF")
        status = gr.Markdown()
        question_box = gr.Textbox(label="Ask a question")
        ask_btn = gr.Button("Get Answer")
        output_box = gr.Markdown()

        upload_btn.click(upload_pdf, inputs=pdf_file, outputs=status)
        ask_btn.click(get_answer, inputs=question_box, outputs=output_box)

    # Academic PDF Summarizer
    with gr.Tab("📚 Academic PDF Summarizer (English ↔ Hindi)"):
        gr.Markdown(
            "Upload an academic PDF (English or Hindi). The app auto-detects script. "
            "Choose output language"
        )
        pdf_input = gr.File(label="📎 Upload a PDF", file_types=[".pdf"])
        output_choice = gr.Radio(
            choices=["English summary", "Hindi summary"],
            value="English summary",
            label="Choose output language",
        )
        summarize_btn = gr.Button("📑 Summarize")
        summarize_out = gr.Textbox(label="📘 Summary", lines=20)

        summarize_btn.click(
            fn=summarize_pdf_with_options,
            inputs=[pdf_input, output_choice],
            outputs=summarize_out,
        )

if __name__ == "__main__":
    demo.launch(share=True)