"""Gradio app that summarizes and translates legal PDF documents."""

import tempfile
from pathlib import Path

import gradio as gr
from deep_translator import GoogleTranslator
from fpdf import FPDF
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

MODEL_NAME = "nsi319/legal-led-base-16384"
SUMMARY_PDF_NAME = "legal_document_summary.pdf"

# deep_translator's GoogleTranslator rejects payloads of 5000 characters or
# more, so documents are translated in chunks safely below that limit.
TRANSLATION_CHUNK_CHARS = 4500

# Load the tokenizer and model for summarization
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


def _to_latin1(text):
    """Replace characters the built-in FPDF fonts cannot encode with '?'."""
    return text.encode("latin-1", "replace").decode("latin-1")


def _new_temp_pdf_path():
    """Reserve a unique temporary ``.pdf`` path and return it as a string."""
    # Close the handle right away; only the reserved name is needed, and an
    # open handle would otherwise leak (and block re-opening on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        return tmp.name


def _split_into_chunks(text, limit):
    """Split *text* into pieces of at most *limit* characters.

    Pieces end on line boundaries where possible.  A single line longer than
    *limit* is cut at the last space before the limit, or hard-cut when it
    contains no space.  Text that already fits is returned as one piece.
    """
    chunks = []
    pending = []  # lines of the chunk currently being assembled
    pending_len = 0  # len("\n".join(pending))

    def flush():
        nonlocal pending, pending_len
        if pending:
            chunks.append("\n".join(pending))
            pending, pending_len = [], 0

    for line in text.split("\n"):
        while len(line) > limit:
            flush()
            cut = line.rfind(" ", 0, limit)
            if cut <= 0:
                cut = limit
            chunks.append(line[:cut])
            line = line[cut:].lstrip(" ")
        added = len(line) + (1 if pending else 0)
        if pending and pending_len + added > limit:
            flush()
            added = len(line)
        pending.append(line)
        pending_len += added
    flush()
    return chunks


def _read_pdf_text(pdf_file):
    """Extract text from an uploaded PDF or raise a user-visible error.

    Raises:
        gr.Error: If nothing was uploaded or no text could be extracted.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF document first.")
    text = extract_text_from_pdf(pdf_file)
    if not text.strip():
        raise gr.Error("No extractable text was found in the PDF.")
    return text


# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, concatenated.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts (a path or a binary stream).

    Returns:
        The page texts joined without a separator.
    """
    reader = PdfReader(pdf_file)
    # Guard against a page yielding no text (e.g. an image-only page) so the
    # concatenation cannot fail on a non-string value.
    return "".join(page.extract_text() or "" for page in reader.pages)


# Function to summarize text
def summarize_text(text, max_input_length=16384, max_summary_length=512):
    """Summarize *text* with the legal LED model.

    Args:
        text: Source text; runs of whitespace are collapsed before tokenizing.
        max_input_length: Token limit for the input; longer input is truncated.
        max_summary_length: Maximum number of tokens to generate.

    Returns:
        The decoded summary string.
    """
    text = " ".join(text.split())
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_summary_length,
        # Keep min_length valid when a caller asks for a short summary.
        min_length=min(150, max_summary_length),
        length_penalty=2.0,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Function to save summary to PDF
def save_summary_to_pdf(summary_text, output_pdf_path):
    """Write *summary_text* under a title into a PDF at *output_pdf_path*.

    Characters outside latin-1 are replaced with '?', because the built-in
    Arial font cannot encode them.

    Returns:
        ``output_pdf_path``, unchanged.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(200, 10, txt="Legal Document Summary", ln=True, align="C")
    pdf.set_font("Arial", size=12)
    for line in summary_text.splitlines():
        # Some fpdf versions appear to leave the cursor at the right edge
        # after multi_cell; start every paragraph at the left margin.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, _to_latin1(line))
    pdf.output(output_pdf_path)
    return output_pdf_path


# Function to process PDF for summarization
def process_pdf_summary(pdf_file):
    """Summarize an uploaded PDF and return the path of the summary PDF.

    Raises:
        gr.Error: If nothing was uploaded or the PDF has no extractable text.
    """
    pdf_text = _read_pdf_text(pdf_file)
    summary = summarize_text(pdf_text)
    # A per-request directory keeps the download name stable while preventing
    # concurrent requests from overwriting each other's output.
    output_pdf_path = str(Path(tempfile.mkdtemp()) / SUMMARY_PDF_NAME)
    return save_summary_to_pdf(summary, output_pdf_path)


# Function to translate text
def translate_text(text, language):
    """Translate *text* into *language* (a Google Translate language code).

    The text is sent in chunks of at most ``TRANSLATION_CHUNK_CHARS``
    characters, and the translated chunks are re-joined with newlines.

    Returns:
        The translated text, or ``""`` when *text* is empty or blank.
    """
    if not text or not text.strip():
        return ""
    translator = GoogleTranslator(source='auto', target=language)
    translated = []
    for chunk in _split_into_chunks(text, TRANSLATION_CHUNK_CHARS):
        # Blank chunks carry nothing to translate; pass them through rather
        # than issuing a request with an empty payload.
        translated.append(translator.translate(chunk) if chunk.strip() else chunk)
    return "\n".join(translated)


# Function to create a PDF from translated text
def create_pdf(text):
    """Write *text* into a temporary PDF and return the file's path.

    NOTE: the built-in Arial font only covers latin-1, so characters from
    other scripts (for example Hindi, Tamil, Malayalam) are written as '?'.
    Rendering them would require registering a Unicode TrueType font.
    """
    pdf_output = _new_temp_pdf_path()
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)  # Use Arial instead of Latha
    for line in text.split('\n'):
        # multi_cell wraps long lines instead of clipping them at the page
        # edge; reset x first (see save_summary_to_pdf).
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, _to_latin1(line))
    pdf.output(pdf_output)
    return pdf_output


# Function to process PDF for translation
def process_pdf_translation(pdf_file, language):
    """Translate an uploaded PDF.

    Returns:
        A ``(translated_pdf_path, translated_text)`` tuple.

    Raises:
        gr.Error: If nothing was uploaded or the PDF has no extractable text.
    """
    extracted_text = _read_pdf_text(pdf_file)
    translated_text = translate_text(extracted_text, language)
    translated_pdf = create_pdf(translated_text)
    return translated_pdf, translated_text


# Create Gradio interface
with gr.Blocks() as app:
    gr.Markdown("# Legal Document Translator and Summarizer")
    gr.Markdown("Choose an operation to perform on your document:")

    with gr.Tab("Summarization"):
        gr.Markdown("### Upload PDF for Summarization")
        pdf_input_summary = gr.File(label="Upload PDF Document")
        summary_output = gr.File(label="Download Summary PDF")
        summarize_button = gr.Button("Summarize")
        summarize_button.click(
            process_pdf_summary,
            inputs=pdf_input_summary,
            outputs=summary_output,
        )

    with gr.Tab("Translation"):
        gr.Markdown("### Upload PDF for Translation")
        pdf_input_translation = gr.File(label="Upload PDF Document")
        language_options = ["hi", "ta", "ml", "en"]  # Hindi, Tamil, Malayalam, English
        language_selector = gr.Dropdown(
            choices=language_options,
            label="Select Language",
            value="hi",
        )
        translation_output = gr.File(label="Download Translated PDF")
        translated_text_output = gr.Textbox(label="Translated Text", lines=10)
        translate_button = gr.Button("Translate")
        translate_button.click(
            process_pdf_translation,
            inputs=[pdf_input_translation, language_selector],
            outputs=[translation_output, translated_text_output],
        )

# Launch the app
if __name__ == "__main__":
    app.launch()