# content_summarizer/summarizer.py
from datetime import datetime

import networkx as nx
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import community  # python-louvain package, provides community.best_partition
from transformers import BartTokenizer, TFBartForConditionalGeneration

from Utils import get_input_chunks  # project-local helper: splits text into chunks that fit the model's token limit
from title_generator import T5Summarizer  # project-local T5-based title generator


class BARTSummarizer:
    """Summarizes long text with BART, optionally splitting it into auto-detected chapters."""

    def __init__(self, model_name: str = 'facebook/bart-large-cnn'):
        self.model_name = model_name
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.model = TFBartForConditionalGeneration.from_pretrained(model_name)
        # Maximum number of tokens the model can attend to (1024 for bart-large-cnn)
        self.max_length = self.model.config.max_position_embeddings
        self.title_model = T5Summarizer()
    def summarize(self, text: str, auto: bool = False):
        """Summarize a single piece of text that fits within the model's input limit."""
        encoded_input = self.tokenizer.encode(text, max_length=self.max_length, return_tensors='tf', truncation=True)
        if auto:
            # Faster greedy decoding for the auto-chapter path, where many chunks are summarized
            summary_ids = self.model.generate(encoded_input, max_length=300, num_beams=1, no_repeat_ngram_size=2, min_length=60)
        else:
            summary_ids = self.model.generate(encoded_input, max_length=300, num_beams=4, early_stopping=True)
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary
    def chunk_summarize(self, text: str, auto: bool = False):
        """Summarize text of arbitrary length by splitting it into chunks and summarizing each one."""
        # Split the input into chunks that fit within the model's token limit
        summaries = []
        input_chunks = get_input_chunks(text, self.max_length)

        # Summarize each chunk separately, logging timestamps for rough timing
        print(datetime.now().strftime("%H:%M:%S"))
        for chunk in input_chunks:
            summaries.append(self.summarize(chunk, auto))

        # Combine the chunk summaries to get the final summary for the entire input
        final_summary = " ".join(summaries)
        print(datetime.now().strftime("%H:%M:%S"))
        return final_summary
    def preprocess_for_auto_chapters(self, text: str):
        """Split text into sentences and group them into blocks for clustering."""
        # Tokenize the text into sentences, downloading the punkt model if it is missing
        try:
            sentences = sent_tokenize(text)
        except LookupError:
            nltk.download('punkt')
            sentences = sent_tokenize(text)

        # Filter out empty sentences and sentences with fewer than 5 words
        sentences = [sentence for sentence in sentences if len(sentence.strip()) > 0 and len(sentence.split(" ")) > 4]

        # Combine every 5 sentences into a single block
        sentences = [' '.join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]

        return sentences
    def auto_chapters_summarize(self, text: str):
        """Summarize text by clustering related sentence blocks into chapters and summarizing each one."""
        sentences = self.preprocess_for_auto_chapters(text)

        # Build TF-IDF vectors for each sentence block
        vectorizer = TfidfVectorizer(stop_words='english')
        X = vectorizer.fit_transform(sentences)

        # Compute the similarity matrix using cosine similarity
        # (TF-IDF rows are L2-normalized, so the dot product equals cosine similarity)
        similarity_matrix = X * X.T

        # Convert the similarity matrix to a weighted graph
        graph = nx.from_scipy_sparse_array(similarity_matrix)

        # Apply the Louvain algorithm to identify communities of related sentence blocks
        partition = community.best_partition(graph, resolution=0.7, random_state=42)

        # Group the sentence blocks by cluster, dropping clusters with a single block
        clustered_sentences = []
        for cluster in set(partition.values()):
            cluster_sentences = []
            for i, sentence in enumerate(sentences):
                if partition[i] == cluster:
                    cluster_sentences.append(sentence)
            if len(cluster_sentences) > 1:
                clustered_sentences.append(" ".join(cluster_sentences))

        # Summarize each cluster and prefix it with a generated chapter title
        summaries_with_title = []
        for cluster in clustered_sentences:
            title = self.title_model.summarize(cluster)
            summary = self.chunk_summarize(cluster, auto=True)
            summary_with_title = "#### " + title + "\n" + summary
            summaries_with_title.append(summary_with_title)

        # Combine the chapter summaries to get the final summary for the entire input
        final_summary = "\n\n".join(summaries_with_title)
        return final_summary
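

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module; shown for illustration only).
    # The sample text below is a hypothetical placeholder; any long transcript or article would do.
    summarizer = BARTSummarizer()

    sample_text = (
        "Transformers are a family of neural network architectures built around self-attention. "
        "They are widely used for summarization, translation, and other sequence-to-sequence tasks."
    )

    # Plain chunked summary of the whole input
    print(summarizer.chunk_summarize(sample_text))

    # Chapter-style summary: clusters related sentence blocks and titles each cluster.
    # Only meaningful for long inputs with enough sentences to form more than one cluster.
    print(summarizer.auto_chapters_summarize(sample_text))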