KevlarVK committed on
Commit
9a4b6ed
1 Parent(s): ae59ea3

Added support for title generation

Browse files
Files changed (2) hide show
  1. summarizer.py +7 -3
  2. title_generator.py +14 -0
summarizer.py CHANGED
@@ -5,6 +5,7 @@ import networkx as nx
5
  from nltk.tokenize import sent_tokenize
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  import community
 
8
 
9
 
10
  class BARTSummarizer:
@@ -14,6 +15,7 @@ class BARTSummarizer:
14
  self.tokenizer = BartTokenizer.from_pretrained(model_name)
15
  self.model = TFBartForConditionalGeneration.from_pretrained(model_name)
16
  self.max_length = self.model.config.max_position_embeddings
 
17
 
18
  def summarize(self, text: str, auto: bool = False):
19
  encoded_input = self.tokenizer.encode(text, max_length=self.max_length, return_tensors='tf', truncation=True)
@@ -82,12 +84,14 @@ class BARTSummarizer:
82
  clustered_sentences.append(" ".join(sentences_to_print))
83
 
84
  # Summarize each cluster
85
- summaries = []
86
  for cluster in clustered_sentences:
87
- summaries.append(self.chunk_summarize(cluster, auto=True))
 
 
88
 
89
  # Combine the summaries to get the final summary for the entire input
90
- final_summary = "\n\n".join(summaries)
91
 
92
  return final_summary
93
 
 
5
  from nltk.tokenize import sent_tokenize
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  import community
8
+ from title_generator import T5Summarizer
9
 
10
 
11
  class BARTSummarizer:
 
15
  self.tokenizer = BartTokenizer.from_pretrained(model_name)
16
  self.model = TFBartForConditionalGeneration.from_pretrained(model_name)
17
  self.max_length = self.model.config.max_position_embeddings
18
+ self.title_model = T5Summarizer()
19
 
20
  def summarize(self, text: str, auto: bool = False):
21
  encoded_input = self.tokenizer.encode(text, max_length=self.max_length, return_tensors='tf', truncation=True)
 
84
  clustered_sentences.append(" ".join(sentences_to_print))
85
 
86
  # Summarize each cluster
87
+ summaries_with_title = []
88
  for cluster in clustered_sentences:
89
+ summary = self.chunk_summarize(cluster, auto=True)
90
+ summary_with_title = "#### " + self.title_model.summarize(summary) + "\n" + summary
91
+ summaries_with_title.append(summary_with_title)
92
 
93
  # Combine the summaries to get the final summary for the entire input
94
+ final_summary = "\n\n".join(summaries_with_title)
95
 
96
  return final_summary
97
 
title_generator.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM


class T5Summarizer:
    """Generate short titles for input text with a fine-tuned T5 model.

    Despite the generic name, the default checkpoint is a title-generation
    model, so ``summarize`` effectively returns a headline for the text.
    """

    def __init__(self, model_name: str = "fabiochiu/t5-small-medium-title-generation"):
        # Load the pretrained tokenizer and TF seq2seq model once at
        # construction so repeated summarize() calls reuse them.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

    def summarize(self, text: str):
        """Return a short (at most 10 generated tokens) title for *text*.

        The input is truncated to the tokenizer's maximum model length, so
        arbitrarily long text is accepted.
        """
        # T5 is a multi-task model: the task prefix selects summarization.
        prompt = "summarize: " + text
        encoded = self.tokenizer(
            [prompt],
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="tf",
        )
        # Beam search combined with sampling, capped at a title-sized output.
        generated = self.model.generate(
            **encoded,
            num_beams=8,
            do_sample=True,
            min_length=1,
            max_length=10,
            early_stopping=True,
        )
        # Decode the single batch element back to plain text.
        return self.tokenizer.batch_decode(generated, skip_special_tokens=True)[0]