# Load BART Model from Hugging Face
from transformers import BartTokenizer, BartForConditionalGeneration
import gradio as gr
# Load the model and tokenizer from Hugging Face
bart_model_name = "Kalotaibi/BART"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)
bart_model.eval() # Ensure the model is in evaluation mode
# Load PEGASUS Model from Hugging Face
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
# Set the Hugging Face Hub ID of the fine-tuned PEGASUS model
pegasus_model_path = "KholoudA/PEGASUS_FINETUNED"
# Load the tokenizer
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
# Load the model
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)
# Set model to evaluation mode
pegasus_model.eval()
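# Optional: both models stay on the CPU in this notebook. If a CUDA GPU happens to be
# available, they could be moved to it as sketched below (left commented out as an
# assumption-dependent extra step); the tokenized inputs passed to .generate() would
# then have to be moved to the same device as well.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# bart_model.to(device)
# pegasus_model.to(device)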
# Define the Summary Generation Functions
import time
import psutil
# This function uses the BART model to generate summaries for given text
def generate_summary_bart(text, max_length, min_length, length_penalty=2.5, num_beams=6):
# Track the memory usage before the operation
memory_before = psutil.virtual_memory().used / (1024 * 1024) # Memory in MB
start_time = time.time()
# Prepare the text for the BART model
inputs = bart_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=1024)
# Generate a summary with the specified model configuration
summary_ids = bart_model.generate(inputs["input_ids"], max_length=max_length, min_length=min_length, length_penalty=length_penalty, num_beams=num_beams, early_stopping=True)
# Decode the generated tokens to a string
summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
end_time = time.time()
# Calculate the inference time and memory usage
inference_time = end_time - start_time
# Log memory after processing
memory_after = psutil.virtual_memory().used / (1024 * 1024) # Memory in MB
memory_used = memory_after - memory_before # Calculate the difference in memory usage
return summary, inference_time, f"{memory_used:.2f} MB"
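# Quick smoke test of the BART helper (illustrative only: the sample text and the
# length limits below are arbitrary choices, not part of the evaluation setup).
sample_article = (
    "The French prosecutor leading an investigation into the crash of Germanwings "
    "Flight 9525 insisted Wednesday that he was not aware of any video footage from "
    "on board the plane."
)
bart_summary, bart_time, bart_memory = generate_summary_bart(sample_article, max_length=60, min_length=20)
print(bart_summary)
print(f"Inference time: {bart_time:.2f} s | Extra memory: {bart_memory}")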
# This function generates a summary based on TF-IDF and cosine similarity ranking of sentences
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def generate_summary_baseline(text, max_length=150, min_length=40):
# Log memory before processing
memory_before = psutil.virtual_memory().used / (1024 * 1024) # Memory in MB
start_time = time.time() # Start timing
# Split the text into sentences
sentences = text.split('.')
sentences = [s.strip() for s in sentences if len(s.strip()) > 0] # Ensure sentences are not just whitespace
if len(sentences) == 0:
return ""
vectorizer = TfidfVectorizer(stop_words='english')
sentence_vectors = vectorizer.fit_transform(sentences)
# Compute the average of all sentence vectors
doc_vector = np.mean(sentence_vectors, axis=0).A
# Calculate sentence scores.
sentence_scores = cosine_similarity(sentence_vectors, doc_vector).flatten()
# Rank sentences based on score
ranked_sentences = sorted(((scores, sentence) for scores, sentence in zip(sentence_scores, sentences)), reverse=True)
# Select sentences for the summary
summary = []
current_length = 0
for scores, sentence in ranked_sentences:
sentence_length = len(sentence.split())
if current_length + sentence_length > max_length:
break
summary.append(sentence)
current_length += sentence_length
if current_length >= min_length:
break
end_time = time.time() # End timing
# Log memory after processing
memory_after = psutil.virtual_memory().used / (1024 * 1024) # Memory in MB
inference_time = end_time - start_time
memory_used = memory_after - memory_before # Calculate the difference in memory usage
return ' '.join(summary), inference_time, f"{memory_used:.2f} MB"
# This function uses the Pegasus model to generate summaries, setting various generation parameters
def generate_summary_pegasus(input_text, max_length=150, min_length=40):
# Log memory before processing
memory_before = psutil.virtual_memory().used / (1024 * 1024) # Memory in MB
start_time = time.time()
inputs = pegasus_tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024, padding="max_length")
summary_ids = pegasus_model.generate(inputs['input_ids'], max_length=max_length, min_length=min_length, num_beams=4, length_penalty=2.5, early_stopping=True)
summary = pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
end_time = time.time()
# Log memory after processing
memory_after = psutil.virtual_memory().used / (1024 * 1024) # Memory in MB
inference_time = end_time - start_time
memory_used = memory_after - memory_before # Calculate the difference in memory usage
return summary, inference_time, f"{memory_used:.2f} MB"
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# This function generates summaries using a simple Bag of Words (BoW) model to identify key sentences
def generate_summary_bow(text, max_length=150, min_length=40):
# Log memory before processing
memory_before = psutil.virtual_memory().used / (1024 * 1024) # Memory in MB
start_time = time.time() # Start timing
# Split the text into sentences
sentences = text.split('. ')
vectorizer = CountVectorizer(stop_words='english')
# Convert sentences to BoW model
bow_matrix = vectorizer.fit_transform(sentences)
# Calculate sentence scores as the sum of their word counts
sentence_scores = np.array(bow_matrix.sum(axis=1)).flatten()
# Sort sentences by score in descending order
sorted_indices = np.argsort(-sentence_scores)
summary = []
current_length = 0
# Select top sentences based on length criteria
for idx in sorted_indices:
sentence = sentences[idx].strip()
sentence_length = len(sentence.split())
if current_length + sentence_length <= max_length:
summary.append(sentence)
current_length += sentence_length
if current_length >= min_length:
break
end_time = time.time() # End timing
# Log memory after processing
memory_after = psutil.virtual_memory().used / (1024 * 1024) # Memory in MB
inference_time = end_time - start_time
memory_used = memory_after - memory_before # Calculate the difference in memory usage
return ' '.join(summary), inference_time, f"{memory_used:.2f} MB"
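# Side-by-side comparison of the four summarizers on a single passage (illustrative
# only; the passage is taken from example 1 below and the length limits are arbitrary).
comparison_text = (
    "The French prosecutor leading an investigation into the crash of Germanwings "
    "Flight 9525 insisted Wednesday that he was not aware of any video footage from "
    "on board the plane. Marseille prosecutor Brice Robin told CNN that 'so far no "
    "videos were used in the crash investigation.'"
)
for model_name, summarize in [("BART", generate_summary_bart),
                              ("PEGASUS", generate_summary_pegasus),
                              ("Baseline", generate_summary_baseline),
                              ("BoW", generate_summary_bow)]:
    model_summary, seconds, memory = summarize(comparison_text, 60, 20)
    print(f"{model_name}: {model_summary}")
    print(f"  time: {seconds:.2f} s | memory: {memory}\n")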
# Examples and pre-calculated ROUGE scores
examples = {
"example1: Germanwings Flight 9525 Crash Investigation": {
"text": "The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that 'so far no videos were used in the crash investigation.' He added, 'A person who has such a video needs to immediately give it to the investigators.'",
"summaries": {
"BART": {
"summary": "Prosecutor: 'So far no videos were used in the crash investigation.' Two magazines claim to have video of harrowing final seconds from on board Germanwings Flight 9525. 'It is a very disturbing scene,' says Bild editor-in-chief Julian Reichelt.",
"score": {"rouge1": 0.47, "rouge2": 0.30, "rougeL": 0.45}
},
"PEGASUS": {
"summary": "Marseille, France, prosecutor: 'So far no videos were used in the crash investigation.' French investigators say they are not aware of any video footage from on board Germanwings Flight 9525. Two German publications report cell phone video of final seconds of crash.",
"score": {"rouge1": 0.44, "rouge2": 0.28, "rougeL": 0.43}
},
"Baseline": {
"summary": "The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane Marseille prosecutor Brice Robin told CNN that 'so far no videos were used in the crash investigation",
"score": {"rouge1": 0.4634, "rouge2": 0.325, "rougeL": 0.3659}
},
"BoW": {
"summary": "Marseille prosecutor Brice Robin told CNN that 'so far no videos were used in the crash investigation.' He added, 'A person who has such a video needs to immediately give it to the investigators. The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane.",
"score": {"rouge1": 0.1263, "rouge2": 0.0, "rougeL": 0.0632}
}
}
},
"example2: California's PG&E Penalty for 2010 Gas Pipeline Explosion": {
"text": "In 2010, a section of PG&E's gas pipeline exploded in San Bruno, California, killing eight people and injuring many others. The California Public Utilities Commission ordered PG&E to pay a record $1.6 billion penalty for the unsafe operation of its gas transmission system, which included forced spending on pipeline safety improvements. PG&E has since replaced over 800 miles of pipeline and made various safety upgrades to prevent future incidents.'",
"summaries": {
"BART": {
"summary": "Following the deadly 2010 gas pipeline explosion in San Bruno, PG&E faced a $1.6 billion penalty, leading to significant safety upgrades and pipeline replacements to enhance system safety.",
"score": {"rouge1": 0.47, "rouge2": 0.30, "rougeL": 0.45}
},
"PEGASUS": {
"summary": "PG&E was fined $1.6 billion by the California Public Utilities Commission for the 2010 pipeline explosion in San Bruno that killed eight. The penalty includes major safety improvements to the gas transmission system.",
"score": {"rouge1": 0.44, "rouge2": 0.28, "rougeL": 0.43}
},
"Baseline": {
"summary": "In 2010, a section of PG&E's gas pipeline exploded in San Bruno, California, killing eight people and injuring many others PG&E has since replaced over 800 miles of pipeline and made various safety upgrades to prevent future incidents 6 billion penalty for the unsafe operation of its gas transmission system, which included forced spending on pipeline safety improvements",
"score": {"rouge1": 0.4634, "rouge2": 0.325, "rougeL": 0.3659}
},
"BoW": {
"summary": "The California Public Utilities Commission ordered PG&E to pay a record $1.6 billion penalty for the unsafe operation of its gas transmission system, which included forced spending on pipeline safety improvements In 2010, a section of PG&E's gas pipeline exploded in San Bruno, California, killing eight people and injuring many others.",
"score": {"rouge1": 0.1263, "rouge2": 0.0, "rougeL": 0.0632}
}
}
}
}
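# The ROUGE scores above are pre-calculated. A minimal sketch for computing such scores,
# assuming the `rouge_score` package is installed (pip install rouge-score) and a
# reference summary is available for each article:
from rouge_score import rouge_scorer

def compute_rouge(reference, candidate):
    # Return ROUGE-1/2/L F1 scores for one candidate summary against one reference.
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return {name: round(value.fmeasure, 4) for name, value in scores.items()}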
# NLP App using the Gradio Interface
import gradio as gr
from transformers import BartTokenizer, BartForConditionalGeneration
rouge_description = """
ROUGE Scores Explained:
ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate the quality of automatic summarization and machine translation by comparing them to reference summaries. Here’s what each score represents: