# content_summarizer/summarizer.py
from datetime import datetime

import networkx as nx
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import community  # python-louvain package, provides community.best_partition
from transformers import BartTokenizer, TFBartForConditionalGeneration

from Utils import get_input_chunks  # project-local helper: splits text into chunks that fit the model's token limit
from title_generator import T5Summarizer  # project-local T5-based title generator


class BARTSummarizer:
    """Summarizes long text with BART, optionally splitting it into auto-detected chapters."""

    def __init__(self, model_name: str = 'facebook/bart-large-cnn'):
        self.model_name = model_name
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.model = TFBartForConditionalGeneration.from_pretrained(model_name)
        # Maximum number of tokens the model can attend to (1024 for bart-large-cnn)
        self.max_length = self.model.config.max_position_embeddings
        self.title_model = T5Summarizer()
    def summarize(self, text: str, auto: bool = False):
        """Summarize a single piece of text that fits within the model's input limit."""
        encoded_input = self.tokenizer.encode(text, max_length=self.max_length, return_tensors='tf', truncation=True)
        if auto:
            # Faster greedy decoding for the auto-chapter path, where many chunks are summarized
            summary_ids = self.model.generate(encoded_input, max_length=300, num_beams=1, no_repeat_ngram_size=2, min_length=60)
        else:
            summary_ids = self.model.generate(encoded_input, max_length=300, num_beams=4, early_stopping=True)
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary
    def chunk_summarize(self, text: str, auto: bool = False):
        """Summarize text of arbitrary length by splitting it into chunks and summarizing each one."""
        # Split the input into chunks that fit within the model's token limit
        summaries = []
        input_chunks = get_input_chunks(text, self.max_length)

        # Summarize each chunk separately, logging timestamps for rough timing
        print(datetime.now().strftime("%H:%M:%S"))
        for chunk in input_chunks:
            summaries.append(self.summarize(chunk, auto))

        # Combine the chunk summaries to get the final summary for the entire input
        final_summary = " ".join(summaries)
        print(datetime.now().strftime("%H:%M:%S"))
        return final_summary
    def preprocess_for_auto_chapters(self, text: str):
        """Split text into sentences and group them into blocks for clustering."""
        # Tokenize the text into sentences, downloading the punkt model if it is missing
        try:
            sentences = sent_tokenize(text)
        except LookupError:
            nltk.download('punkt')
            sentences = sent_tokenize(text)

        # Filter out empty sentences and sentences with fewer than 5 words
        sentences = [sentence for sentence in sentences if len(sentence.strip()) > 0 and len(sentence.split(" ")) > 4]

        # Combine every 5 sentences into a single block
        sentences = [' '.join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]

        return sentences
    def auto_chapters_summarize(self, text: str):
        """Summarize text by clustering related sentence blocks into chapters and summarizing each one."""
        sentences = self.preprocess_for_auto_chapters(text)

        # Build TF-IDF vectors for each sentence block
        vectorizer = TfidfVectorizer(stop_words='english')
        X = vectorizer.fit_transform(sentences)

        # Compute the similarity matrix using cosine similarity
        # (TF-IDF rows are L2-normalized, so the dot product equals cosine similarity)
        similarity_matrix = X * X.T

        # Convert the similarity matrix to a weighted graph
        graph = nx.from_scipy_sparse_array(similarity_matrix)

        # Apply the Louvain algorithm to identify communities of related sentence blocks
        partition = community.best_partition(graph, resolution=0.7, random_state=42)

        # Group the sentence blocks by cluster, dropping clusters with a single block
        clustered_sentences = []
        for cluster in set(partition.values()):
            cluster_sentences = []
            for i, sentence in enumerate(sentences):
                if partition[i] == cluster:
                    cluster_sentences.append(sentence)
            if len(cluster_sentences) > 1:
                clustered_sentences.append(" ".join(cluster_sentences))

        # Summarize each cluster and prefix it with a generated chapter title
        summaries_with_title = []
        for cluster in clustered_sentences:
            title = self.title_model.summarize(cluster)
            summary = self.chunk_summarize(cluster, auto=True)
            summary_with_title = "#### " + title + "\n" + summary
            summaries_with_title.append(summary_with_title)

        # Combine the chapter summaries to get the final summary for the entire input
        final_summary = "\n\n".join(summaries_with_title)
        return final_summary
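

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module; shown for illustration only).
    # The sample text below is a hypothetical placeholder; any long transcript or article would do.
    summarizer = BARTSummarizer()

    sample_text = (
        "Transformers are a family of neural network architectures built around self-attention. "
        "They are widely used for summarization, translation, and other sequence-to-sequence tasks."
    )

    # Plain chunked summary of the whole input
    print(summarizer.chunk_summarize(sample_text))

    # Chapter-style summary: clusters related sentence blocks and titles each cluster.
    # Only meaningful for long inputs with enough sentences to form more than one cluster.
    print(summarizer.auto_chapters_summarize(sample_text))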