from concurrent.futures import ThreadPoolExecutor

import gradio as gr
import requests
from bs4 import BeautifulSoup
from docx import Document
from PyPDF2 import PdfReader
from transformers import BartForConditionalGeneration, BartTokenizer

# Load model and tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Largest tokenized input that summarize_chunk() hands to the model.
_MAX_MODEL_TOKENS = 1024
# Text tokens that fit in one model input once the tokenizer has added its
# special tokens around the chunk.
_MAX_CONTENT_TOKENS = _MAX_MODEL_TOKENS - tokenizer.num_special_tokens_to_add()
# Lower bound for the dynamically chosen chunk size.
_MIN_CHUNK_TOKENS = 512
# Floor for the per-chunk generation budget.
# NOTE(review): picked to stay at or above the checkpoint's default
# `min_length` (56 in the published bart-large-cnn config) so short budgets do
# not cut summaries mid-sentence -- confirm against the loaded config.
_MIN_SUMMARY_TOKENS = 56
# Seconds to wait for a remote page before giving up.
_REQUEST_TIMEOUT = 30


def _encode_without_specials(text):
    """Return the token ids of ``text`` with no special tokens and no truncation."""
    return tokenizer.encode(text, truncation=False, add_special_tokens=False)


def _split_tokens(tokens, chunk_size):
    """Slice ``tokens`` into consecutive lists of at most ``chunk_size`` ids."""
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]


def chunk_text(text, chunk_size=1024):
    """Break text into chunks of a specified size.

    The text is tokenized without special tokens so that every id in a chunk
    is real content; the tokenizer adds the special tokens again when a chunk
    is re-encoded in ``summarize_chunk``.

    Args:
        text: The text to split.
        chunk_size: Maximum number of token ids per chunk. Chunks larger than
            the model input minus its special tokens are truncated later by
            ``summarize_chunk``.

    Returns:
        A list of token-id lists; empty when ``text`` yields no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    return _split_tokens(_encode_without_specials(text), chunk_size)


def summarize_chunk(chunk, summary_max_length=150):
    """Summarize a single chunk.

    Args:
        chunk: Token ids of the text to summarize.
        summary_max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded summary string.
    """
    chunk_str = tokenizer.decode(chunk, skip_special_tokens=True)
    # truncation=True stays as a safety net: re-encoding decoded text is not
    # guaranteed to give back exactly the same number of tokens.
    encoded = tokenizer(
        [chunk_str],
        max_length=_MAX_MODEL_TOKENS,
        return_tensors='pt',
        truncation=True,
    )
    summary_ids = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        num_beams=4,
        max_length=summary_max_length,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def summarize_chunks_parallel(chunks, summary_max_length=150):
    """Summarize each chunk in parallel and combine the summaries.

    Args:
        chunks: Iterable of token-id lists, as produced by ``chunk_text``.
        summary_max_length: Generation budget applied to every chunk.

    Returns:
        The chunk summaries, in input order, joined by single spaces. An empty
        string when there are no chunks.
    """
    chunks = list(chunks)
    if not chunks:
        return ''
    with ThreadPoolExecutor() as executor:
        summaries = list(
            executor.map(lambda chunk: summarize_chunk(chunk, summary_max_length), chunks)
        )
    return ' '.join(summaries)


def summarize_text(text, title=None, author=None, length_ratio=0.25):
    """Summarize ``text`` chunk by chunk and optionally prepend an intro.

    Args:
        text: The text to summarize.
        title: Optional title mentioned in the introductory sentence.
        author: Optional author mentioned in the introductory sentence.
        length_ratio: Target summary length as a fraction of the source
            length, applied per chunk.

    Returns:
        The combined summary, prefixed with an introductory sentence when a
        title or author is given. An empty string when ``text`` contains no
        tokens.
    """
    # Tokenize once, untruncated: a truncated encode caps the count at the
    # model limit, which would pin the chunk size to its minimum.
    tokens = _encode_without_specials(text)
    if not tokens:
        return ''

    # Dynamically adjust chunk size based on text length, leaving room for the
    # special tokens added when each chunk is re-encoded.
    chunk_size = min(_MAX_CONTENT_TOKENS, max(_MIN_CHUNK_TOKENS, len(tokens) // 8))
    chunks = _split_tokens(tokens, chunk_size)

    # Each chunk is summarized on its own, so the budget is a fraction of one
    # chunk's length -- not of the whole document -- and is kept within the
    # same limit used for model inputs.
    longest_chunk = max(len(chunk) for chunk in chunks)
    summary_max_length = max(
        _MIN_SUMMARY_TOKENS,
        min(_MAX_MODEL_TOKENS, int(longest_chunk * length_ratio)),
    )

    # Summarize each chunk in parallel and combine the summaries
    summary = summarize_chunks_parallel(chunks, summary_max_length=summary_max_length)

    # Adding introductory sentence if title or author is available
    if title or author:
        intro = f"The text titled '{title}'" if title else "The text"
        if author:
            intro += f" by {author}"
        intro += " discusses the following main points: "
        summary = intro + summary
    return summary


def _read_url(url):
    """Fetch ``url`` and return the text of its ``<p>`` elements; raises on failure."""
    # NOTE(security): the URL comes straight from the user. If this app is
    # exposed publicly, restrict which hosts may be fetched (SSRF).
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(para.get_text() for para in soup.find_all('p'))


def _read_pdf(file):
    """Return the text of every page of a PDF; raises on failure."""
    reader = PdfReader(file)
    # `or ""` guards against pages for which no text is returned; the newline
    # keeps the last word of one page from fusing with the first of the next.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def _read_docx(file):
    """Return the paragraphs of a DOCX file, one per line; raises on failure."""
    doc = Document(file)
    return "".join(para.text + "\n" for para in doc.paragraphs)


def extract_text_from_url(url):
    """Return the paragraph text found at ``url``.

    On any failure the exception message is returned instead of raising, so
    the result cannot be told apart from page text. ``process_input`` uses
    the raising helper ``_read_url`` for that reason.
    """
    try:
        return _read_url(url)
    except Exception as e:
        return str(e)


def extract_text_from_pdf(file):
    """Return the text of a PDF given as a path or file object.

    On any failure the exception message is returned instead of raising.
    """
    try:
        return _read_pdf(file)
    except Exception as e:
        return str(e)


def extract_text_from_docx(file):
    """Return the text of a DOCX document given as a path or file object.

    On any failure the exception message is returned instead of raising.
    """
    try:
        return _read_docx(file)
    except Exception as e:
        return str(e)


def process_input(text=None, url=None, file=None, length_ratio=0.25):
    """Summarize the first available input: text, then URL, then file.

    Args:
        text: Raw text to summarize.
        url: Address of a web page whose paragraphs should be summarized.
        file: Uploaded PDF or DOCX, either an object with a ``name``
            attribute or a plain path.
        length_ratio: Target summary length as a fraction of the source.

    Returns:
        The summary, or a human-readable message when nothing usable was
        provided or extraction failed.
    """
    text = (text or "").strip()
    url = (url or "").strip()

    if text:
        # Summarize the provided text
        return summarize_text(text, length_ratio=length_ratio)

    if url:
        # Extract text from the provided URL and summarize it
        try:
            extracted = _read_url(url)
        except Exception as exc:
            # Report the failure instead of summarizing the error message.
            return f"Could not extract text from the URL: {exc}"
        if not extracted.strip():
            return "No text extracted from the URL."
        return summarize_text(extracted, length_ratio=length_ratio)

    if file:
        # Extract text from the provided file (PDF or DOCX) and summarize it.
        # The upload may be an object exposing `.name` or a bare path string;
        # the extension check is case-insensitive.
        filename = str(getattr(file, "name", file)).lower()
        if filename.endswith('.pdf'):
            reader = _read_pdf
        elif filename.endswith('.docx'):
            reader = _read_docx
        else:
            return "Unsupported file type. Please upload a PDF or DOCX file."
        try:
            extracted = reader(file)
        except Exception as exc:
            return f"Could not extract text from the file: {exc}"
        if not extracted.strip():
            return "No text extracted from the file."
        return summarize_text(extracted, length_ratio=length_ratio)

    return "Please provide text, a URL, or upload a file."
# Build the Gradio UI: three alternative inputs (text, URL, file) plus a
# length slider, wired to process_input, with one textbox for the summary.
# Components are created in the same order they appear in the interface.
_text_input = gr.Textbox(
    label="Input Text",
    placeholder="Enter text here...",
    lines=10,  # tall box for pasted documents
)
_url_input = gr.Textbox(
    label="URL",
    placeholder="Enter URL here...",
    lines=2,  # short box, a URL needs little room
)
_file_input = gr.File(label="Upload a file (PDF or DOCX)")
_ratio_input = gr.Slider(
    label="Summary Length Ratio (as a fraction of the original)",
    minimum=0.1,
    maximum=1.0,
    step=0.05,
    value=0.25,
)
_summary_output = gr.Textbox(label="Summary", lines=20)  # room for long summaries

interface = gr.Interface(
    fn=process_input,
    inputs=[_text_input, _url_input, _file_input, _ratio_input],
    outputs=_summary_output,
    title="Text Summarization Tool",
    description="Enter text, paste a URL, or upload a PDF/DOCX file to generate a summary. Adjust the summary length with the slider.",
)

# Start the web server for the interface.
interface.launch()