import gradio as gr
import pdfplumber
import re
import tempfile
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from concurrent.futures import ThreadPoolExecutor
import spaces


def preprocess_text_for_tts(text):
    # Pure regex cleanup -- no GPU needed here.
    text = re.sub(r'[^\x20-\x7E]', ' ', text)  # drop non-printable / non-ASCII characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # strip URLs
    text = re.sub(r'\S+@\S+', '', text)  # strip email addresses
    text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', text)  # strip phone numbers
    text = re.sub(r'\.{2,}', ' ', text)  # collapse ellipses / dot leaders

    def convert_case(match):
        # Keep common abbreviations in all caps; title-case other all-caps words.
        word = match.group(0)
        common_abbreviations = {'AI', 'ML', 'NLP', 'CV', 'API', 'GPU', 'CPU',
                                'RAM', 'ROM', 'USA', 'UK', 'EU'}
        return word if word in common_abbreviations else word.title()

    text = re.sub(r'\b[A-Z]+\b', convert_case, text)
    text = re.sub(r'\s+', ' ', text)                       # collapse whitespace
    text = re.sub(r'\.([A-Za-z])', r'. \1', text)          # space after sentence-ending periods
    text = re.sub(r'([a-z])([A-Z])', r'\1. \2', text)      # split run-together sentences at case changes
    text = re.sub(r'([A-Za-z])\s([.,!?])', r'\1\2', text)  # no space before punctuation
    text = re.sub(r'([.,!?])([A-Za-z])', r'\1 \2', text)   # space after punctuation
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model and tokenizer
model_name = "sherif31/T5-Grammer-Correction"  # Replace with your actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)


def correct_text(text):
    # Split the text into chunks so no chunk exceeds the model's 512-token limit
    # (character-based splitting is a rough proxy for token count; the tokenizer
    # truncates anything that still overflows).
    max_chunk_length = 512
    chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
    corrected_chunks = []
    for chunk in chunks:
        input_text = f"grammar: {chunk}"
        input_ids = tokenizer.encode(input_text, return_tensors="pt",
                                     max_length=512, truncation=True).to(device)
        with torch.no_grad():
            output = model.generate(input_ids, max_length=512,
                                    num_return_sequences=1, num_beams=5)
        corrected_chunks.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return ' '.join(corrected_chunks)


def extract_text_from_pages(pdf_bytes):
    # Write the uploaded bytes to a temp file so pdfplumber can open them by path.
    page_text_dict = {}
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(pdf_bytes)
        temp_pdf_path = temp_pdf.name
    try:
        with pdfplumber.open(temp_pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                raw_text = page.extract_text()
                # Only clean here; grammar correction happens once, in process_pdf.
                page_text_dict[page_num] = preprocess_text_for_tts(raw_text) if raw_text else ""
    finally:
        os.unlink(temp_pdf_path)
    return page_text_dict


@spaces.GPU  # acquire the GPU for the whole request; all model inference happens inside
def process_pdf(pdf_file):
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF file."
    result = extract_text_from_pages(pdf_file)
    # Correct pages in parallel threads; the threads share one model and device,
    # so this mainly overlaps tokenization/decoding with generation.
    with ThreadPoolExecutor() as executor:
        corrected_texts = list(executor.map(correct_text, result.values()))
    # Combine the results
    output = ""
    for page_num, text in zip(result.keys(), corrected_texts):
        output += f"Page {page_num}:\n{text}\n\n"
    return output


# Create the Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.Textbox(label="Extracted and Processed Text"),
    title="PDF Text Extractor and Processor",
    description="Upload a PDF file to extract, clean, and correct its text content.",
)

# Launch the app
iface.launch()