# Load BART Model from Hugging Face
from transformers import BartTokenizer, BartForConditionalGeneration
import gradio as gr

# Load the model and tokenizer from Hugging Face
bart_model_name = "Kalotaibi/BART"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)
bart_model.eval()  # Ensure the model is in evaluation mode

# Load PEGASUS Model from Hugging Face
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# Set the path to the files
pegasus_model_path = "KholoudA/PEGASUS_FINETUNED"
# Load the tokenizer
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
# Load the model
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)
# Set model to evaluation mode
pegasus_model.eval()

# Define the Summary Generation Functions
import time
import psutil

# This function uses the BART model to generate summaries for given text
def generate_summary_bart(text, max_length, min_length, length_penalty=2.5, num_beams=6):
    # Track the memory usage before the operation
    memory_before = psutil.virtual_memory().used / (1024 * 1024)  # Memory in MB
    start_time = time.time()
    # Prepare the text for the BART model
    inputs = bart_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    # Generate a summary with the specified model configuration
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )
    # Decode the generated tokens to a string
    summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    end_time = time.time()
    # Calculate the inference time and memory usage
    inference_time = end_time - start_time
    # Log memory after processing
    memory_after = psutil.virtual_memory().used / (1024 * 1024)  # Memory in MB
    memory_used = memory_after - memory_before  # Calculate the difference in memory usage
    return summary, inference_time, f"{memory_used:.2f} MB"
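
# Optional smoke test of the BART summarizer (a minimal sketch, not part of the original app;
# RUN_BART_SMOKE_TEST is an illustrative flag). It reuses part of the Germanwings passage from
# the examples further below and prints the summary plus the measured time and memory delta.
RUN_BART_SMOKE_TEST = False
if RUN_BART_SMOKE_TEST:
    _sample_text = ("The French prosecutor leading an investigation into the crash of "
                    "Germanwings Flight 9525 insisted Wednesday that he was not aware of "
                    "any video footage from on board the plane.")
    print(generate_summary_bart(_sample_text, max_length=60, min_length=20))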
# This function generates a summary based on TF-IDF and cosine similarity ranking of sentences
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def generate_summary_baseline(text, max_length=150, min_length=40):
    # Log memory before processing
    memory_before = psutil.virtual_memory().used / (1024 * 1024)  # Memory in MB
    start_time = time.time()  # Start timing
    # Split the text into sentences
    sentences = text.split('.')
    sentences = [s.strip() for s in sentences if len(s.strip()) > 0]  # Ensure sentences are not just whitespace
    if len(sentences) == 0:
        return "", 0.0, "0.00 MB"  # Keep the three-value return shape even for empty input
    vectorizer = TfidfVectorizer(stop_words='english')
    sentence_vectors = vectorizer.fit_transform(sentences)
    # Compute the average of all sentence vectors
    doc_vector = np.mean(sentence_vectors, axis=0).A
    # Calculate sentence scores
    sentence_scores = cosine_similarity(sentence_vectors, doc_vector).flatten()
    # Rank sentences based on score
    ranked_sentences = sorted(((score, sentence) for score, sentence in zip(sentence_scores, sentences)), reverse=True)
    # Select sentences for the summary
    summary = []
    current_length = 0
    for score, sentence in ranked_sentences:
        sentence_length = len(sentence.split())
        if current_length + sentence_length > max_length:
            break
        summary.append(sentence)
        current_length += sentence_length
        if current_length >= min_length:
            break
    end_time = time.time()  # End timing
    # Log memory after processing
    memory_after = psutil.virtual_memory().used / (1024 * 1024)  # Memory in MB
    inference_time = end_time - start_time
    memory_used = memory_after - memory_before  # Calculate the difference in memory usage
    return ' '.join(summary), inference_time, f"{memory_used:.2f} MB"

# This function uses the Pegasus model to generate summaries, setting various generation parameters
def generate_summary_pegasus(input_text, max_length=150, min_length=40):
    # Log memory before processing
    memory_before = psutil.virtual_memory().used / (1024 * 1024)  # Memory in MB
    start_time = time.time()
    inputs = pegasus_tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024, padding="max_length")
    summary_ids = pegasus_model.generate(
        inputs['input_ids'],
        max_length=max_length,
        min_length=min_length,
        num_beams=4,
        length_penalty=2.5,
        early_stopping=True,
    )
    summary = pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    end_time = time.time()
    # Log memory after processing
    memory_after = psutil.virtual_memory().used / (1024 * 1024)  # Memory in MB
    inference_time = end_time - start_time
    memory_used = memory_after - memory_before  # Calculate the difference in memory usage
    return summary, inference_time, f"{memory_used:.2f} MB"
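
# A quick, model-free check of the extractive TF-IDF baseline on a toy paragraph
# (illustrative only, not part of the original app; which sentences are selected
# depends on the TF-IDF weights computed inside the function).
_toy_text = ("The cat sat on the mat. Cats are popular pets. "
             "The stock market fell sharply today. Cats sleep most of the day.")
print("Baseline toy summary:", generate_summary_baseline(_toy_text, max_length=30, min_length=5)[0])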
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# This function generates summaries using a simple Bag of Words (BoW) model to identify key sentences
def generate_summary_bow(text, max_length=150, min_length=40):
    # Log memory before processing
    memory_before = psutil.virtual_memory().used / (1024 * 1024)  # Memory in MB
    start_time = time.time()  # Start timing
    # Split the text into sentences
    sentences = text.split('. ')
    vectorizer = CountVectorizer(stop_words='english')
    # Convert sentences to BoW model
    bow_matrix = vectorizer.fit_transform(sentences)
    # Calculate sentence scores as the sum of their word counts
    sentence_scores = np.array(bow_matrix.sum(axis=1)).flatten()
    # Sort sentences by score in descending order
    sorted_indices = np.argsort(-sentence_scores)
    summary = []
    current_length = 0
    # Select top sentences based on length criteria
    for idx in sorted_indices:
        sentence = sentences[idx].strip()
        sentence_length = len(sentence.split())
        if current_length + sentence_length <= max_length:
            summary.append(sentence)
            current_length += sentence_length
        if current_length >= min_length:
            break
    end_time = time.time()  # End timing
    # Log memory after processing
    memory_after = psutil.virtual_memory().used / (1024 * 1024)  # Memory in MB
    inference_time = end_time - start_time
    memory_used = memory_after - memory_before  # Calculate the difference in memory usage
    return ' '.join(summary), inference_time, f"{memory_used:.2f} MB"
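
# The same kind of quick check for the BoW summarizer (illustrative toy text, not part of
# the original app); here a sentence's rank is simply the sum of its non-stop-word counts
# rather than the TF-IDF/cosine-similarity score used by the baseline above.
_bow_toy_text = ("The cat sat on the mat. Cats are popular pets. "
                 "The stock market fell sharply today. Cats sleep most of the day.")
print("BoW toy summary:", generate_summary_bow(_bow_toy_text, max_length=30, min_length=5)[0])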
# Examples and pre-calculated ROUGE scores
examples = {
    "example1: Germanwings Flight 9525 Crash Investigation": {
        "text": "The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that 'so far no videos were used in the crash investigation.' He added, 'A person who has such a video needs to immediately give it to the investigators.'",
        "summaries": {
            "BART": {
                "summary": "Prosecutor: 'So far no videos were used in the crash investigation.' Two magazines claim to have video of harrowing final seconds from on board Germanwings Flight 9525. 'It is a very disturbing scene,' says Bild editor-in-chief Julian Reichelt.",
                "score": {"rouge1": 0.47, "rouge2": 0.30, "rougeL": 0.45}
            },
            "PEGASUS": {
                "summary": "Marseille, France, prosecutor: 'So far no videos were used in the crash investigation.' French investigators say they are not aware of any video footage from on board Germanwings Flight 9525. Two German publications report cell phone video of final seconds of crash.",
                "score": {"rouge1": 0.44, "rouge2": 0.28, "rougeL": 0.43}
            },
            "Baseline": {
                "summary": "The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane Marseille prosecutor Brice Robin told CNN that 'so far no videos were used in the crash investigation",
                "score": {"rouge1": 0.4634, "rouge2": 0.325, "rougeL": 0.3659}
            },
            "BoW": {
                "summary": "Marseille prosecutor Brice Robin told CNN that 'so far no videos were used in the crash investigation.' He added, 'A person who has such a video needs to immediately give it to the investigators. The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane.",
                "score": {"rouge1": 0.1263, "rouge2": 0.0, "rougeL": 0.0632}
            }
        }
    },
    "example2: California's PG&E Penalty for 2010 Gas Pipeline Explosion": {
        "text": "In 2010, a section of PG&E's gas pipeline exploded in San Bruno, California, killing eight people and injuring many others. The California Public Utilities Commission ordered PG&E to pay a record $1.6 billion penalty for the unsafe operation of its gas transmission system, which included forced spending on pipeline safety improvements. PG&E has since replaced over 800 miles of pipeline and made various safety upgrades to prevent future incidents.",
        "summaries": {
            "BART": {
                "summary": "Following the deadly 2010 gas pipeline explosion in San Bruno, PG&E faced a $1.6 billion penalty, leading to significant safety upgrades and pipeline replacements to enhance system safety.",
                "score": {"rouge1": 0.47, "rouge2": 0.30, "rougeL": 0.45}
            },
            "PEGASUS": {
                "summary": "PG&E was fined $1.6 billion by the California Public Utilities Commission for the 2010 pipeline explosion in San Bruno that killed eight. The penalty includes major safety improvements to the gas transmission system.",
                "score": {"rouge1": 0.44, "rouge2": 0.28, "rougeL": 0.43}
            },
            "Baseline": {
                "summary": "In 2010, a section of PG&E's gas pipeline exploded in San Bruno, California, killing eight people and injuring many others PG&E has since replaced over 800 miles of pipeline and made various safety upgrades to prevent future incidents 6 billion penalty for the unsafe operation of its gas transmission system, which included forced spending on pipeline safety improvements",
                "score": {"rouge1": 0.4634, "rouge2": 0.325, "rougeL": 0.3659}
            },
            "BoW": {
                "summary": "The California Public Utilities Commission ordered PG&E to pay a record $1.6 billion penalty for the unsafe operation of its gas transmission system, which included forced spending on pipeline safety improvements In 2010, a section of PG&E's gas pipeline exploded in San Bruno, California, killing eight people and injuring many others.",
                "score": {"rouge1": 0.1263, "rouge2": 0.0, "rougeL": 0.0632}
            }
        }
    }
}
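
# The ROUGE values stored above are pre-calculated. A minimal sketch (an assumption, since the
# original notebook does not include its evaluation code) of how such scores could be reproduced
# with the `rouge-score` package; `reference_summary` below is only a placeholder for the gold
# summary of the article being evaluated.
from rouge_score import rouge_scorer  # pip install rouge-score

def compute_rouge(reference_summary, candidate_summary):
    # Score one candidate against one reference and report ROUGE-1/2/L F-measures
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference_summary, candidate_summary)
    return {name: round(result.fmeasure, 4) for name, result in scores.items()}

# Example call (placeholder reference text):
# compute_rouge("gold reference summary ...",
#               examples["example1: Germanwings Flight 9525 Crash Investigation"]["summaries"]["BART"]["summary"])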
# NLP APP using Gradio Interface
import gradio as gr
from transformers import BartTokenizer, BartForConditionalGeneration

rouge_description = """
ROUGE Scores Explained:
ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate the quality of automatic summarization and machine translation by comparing them to reference summaries. Here’s what each score represents:

- ROUGE-1: overlap of unigrams (single words) between the generated summary and the reference.
- ROUGE-2: overlap of bigrams (pairs of consecutive words) between the generated summary and the reference.
- ROUGE-L: the longest common subsequence shared with the reference, which rewards preserving word order.
Higher ROUGE scores generally indicate a summary’s better alignment with the reference texts, suggesting higher quality.
"""

# This function handles summarization based on user-selected options
def generate_summary(example_choice, custom_text, summarizer, max_length, min_length):
    print("Function started")
    # Check if the input is custom text or a preselected example
    if example_choice == "Custom":
        input_text = custom_text  # Use the custom text provided by the user
        # If "All" is selected, use every available method to generate summaries
        if summarizer == "All":
            # Generate summaries from the different models and methods
            summary_bart, time_taken_bart, mem_usage_bart = generate_summary_bart(input_text, max_length, min_length)
            summary_pegasus, time_taken_pegasus, mem_usage_pegasus = generate_summary_pegasus(input_text, max_length, min_length)
            summary_baseline, time_taken_baseline, mem_usage_baseline = generate_summary_baseline(input_text, max_length, min_length)
            summary_bow, time_taken_bow, mem_usage_bow = generate_summary_bow(input_text, max_length, min_length)
            # Concatenate all summaries for display
            summaries = f"BART: {summary_bart}\n\nPEGASUS: {summary_pegasus}\n\nBaseline: {summary_baseline}\n\nBoW: {summary_bow}"
            times = f"BART: {time_taken_bart:.2f}s, PEGASUS: {time_taken_pegasus:.2f}s, Baseline: {time_taken_baseline:.2f}s, BoW: {time_taken_bow:.2f}s"
            # The per-model values already include the "MB" unit
            usages = f"BART: {mem_usage_bart}, PEGASUS: {mem_usage_pegasus}, Baseline: {mem_usage_baseline}, BoW: {mem_usage_bow}"
            rouge_scores = ("BART: rouge-1: 0.47, rouge-2: 0.30, rouge-L: 0.45\n"
                            "PEGASUS: rouge-1: 0.44, rouge-2: 0.28, rouge-L: 0.43\n"
                            "BoW: rouge-1: 0.1263, rouge-2: 0.0, rouge-L: 0.0632\n"
                            "Baseline: rouge-1: 0.4634, rouge-2: 0.325, rouge-L: 0.3659")
        # Individual model summarization based on the selected summarizer
        elif summarizer == "BART":
            summaries, times, usages = generate_summary_bart(input_text, max_length, min_length)
            rouge_scores = "BART: rouge-1: 0.47, rouge-2: 0.30, rouge-L: 0.45"
        elif summarizer == "PEGASUS":
            summaries, times, usages = generate_summary_pegasus(input_text, max_length, min_length)
            rouge_scores = "PEGASUS: rouge-1: 0.44, rouge-2: 0.28, rouge-L: 0.43"
        elif summarizer == "BoW":
            summaries, times, usages = generate_summary_bow(input_text, max_length, min_length)
            rouge_scores = "BoW: rouge-1: 0.1263, rouge-2: 0.0, rouge-L: 0.0632"
        else:  # Baseline
            summaries, times, usages = generate_summary_baseline(input_text, max_length, min_length)
            rouge_scores = "Baseline: rouge-1: 0.4634, rouge-2: 0.325, rouge-L: 0.3659"
    else:
        # A preselected example: return its cached summaries and pre-calculated scores
        if summarizer is None:
            summarizer = "All"
        input_text = examples[example_choice]["text"]
        summaries = {k: v["summary"] for k, v in examples[example_choice]["summaries"].items()}
        if summarizer == "All":
            summaries = "\n\n".join(f"{key}: {value}" for key, value in summaries.items())
            times = "N/A"
            usages = "N/A"
            rouge_scores = "\n".join(
                f"{key}: ROUGE-1: {value['score']['rouge1']}, ROUGE-2: {value['score']['rouge2']}, ROUGE-L: {value['score']['rougeL']}"
                for key, value in examples[example_choice]["summaries"].items())
        else:
            summaries = summaries[summarizer]
            times = "N/A"
            usages = "N/A"
            score = examples[example_choice]["summaries"][summarizer]["score"]
            rouge_scores = f"ROUGE-1: {score['rouge1']}, ROUGE-2: {score['rouge2']}, ROUGE-L: {score['rougeL']}"
    print("Function completed")
    # Return all generated and calculated results
    return input_text, summaries, times, usages, rouge_scores, rouge_description
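
# A quick programmatic check of the dispatcher outside the Gradio UI (illustrative only;
# selecting a preselected example returns the cached summaries and pre-calculated ROUGE
# scores, so no model inference is triggered).
_demo_outputs = generate_summary(
    "example1: Germanwings Flight 9525 Crash Investigation", "", "All", 150, 40)
print(_demo_outputs[4])  # the formatted ROUGE scores string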
# Set up the Gradio interface
iface = gr.Interface(
    fn=generate_summary,  # Function to call
    inputs=[
        gr.Dropdown(list(examples.keys()) + ["Custom"], label="Select Example or Custom Text"),
        gr.Textbox(lines=10, placeholder="Enter Your Text Here If 'Custom' Selected", label="Custom Text"),
        gr.Dropdown(["All", "BART", "PEGASUS", "Baseline", "BoW"], label="Summarizer Model"),
        gr.Slider(minimum=10, maximum=300, step=1, value=150, label="Max Length"),
        gr.Slider(minimum=10, maximum=300, step=1, value=40, label="Min Length"),
    ],
    outputs=[
        gr.Textbox(label="Original Text"),
        gr.Textbox(label="Summary"),
        gr.Textbox(label="Inference Time"),
        gr.Textbox(label="Memory Usage"),
        gr.Textbox(label="ROUGE Scores"),
        gr.Markdown(label="ROUGE Explanation", value=rouge_description),
    ],
    title="Text Summarization Tool",
    description="Select an example or enter custom text. Choose a summarization model and adjust the maximum and minimum lengths. View the summary, time taken, memory usage, and ROUGE scores.",
)

# Launch the interface (pass share=True and/or debug=True to enable sharing and debugging)
iface.launch()