Anavya-TEXTSUMMARIZERADVANCE

Sleeping

App Files Files Community

Gladiator committed on Jan 23, 2022

Commit

e9ee3ed

•

1 Parent(s): 8809824

restructure dir

Browse files

Files changed (3) hide show

app.py +3 -3
src/abstractive_summarizer.py +0 -52
src/utils.py → utils.py +37 -0

app.py CHANGED Viewed

@@ -5,8 +5,9 @@ from transformers import AutoTokenizer, pipeline
 # local modules
 from extractive_summarizer.model_processors import Summarizer
-from src.utils import clean_text, fetch_article_text
-from src.abstractive_summarizer import (
     preprocess_text_for_abstractive_summarization,
 )
@@ -85,7 +86,6 @@ if __name__ == "__main__":
                     text_to_summarize = preprocess_text_for_abstractive_summarization(
                         tokenizer=abs_tokenizer, text=clean_txt
                     )
-                print(text_to_summarize)
                 tmp_sum = abs_summarizer(
                     text_to_summarize,
                     max_length=abs_max_length,

 # local modules
 from extractive_summarizer.model_processors import Summarizer
+from utils import (
+    clean_text,
+    fetch_article_text,
     preprocess_text_for_abstractive_summarization,
 )
                     text_to_summarize = preprocess_text_for_abstractive_summarization(
                         tokenizer=abs_tokenizer, text=clean_txt
                     )
                 tmp_sum = abs_summarizer(
                     text_to_summarize,
                     max_length=abs_max_length,

src/abstractive_summarizer.py DELETED Viewed

@@ -1,52 +0,0 @@
-import torch
-from nltk.tokenize import sent_tokenize
-from transformers import T5Tokenizer
-def abstractive_summarizer(tokenizer, model, text):
-    # inputs to the model
-    inputs = [tokenizer(f"summarize: {chunk}", return_tensors="pt") for chunk in text]
-    abs_summarized_text = []
-    for input in inputs:
-        output = model.generate(input["input_ids"])
-        tmp_sum = tokenizer.decode(output[0], skip_special_tokens=True)
-        abs_summarized_text.append(tmp_sum)
-    abs_summarized_text = " ".join([summ for summ in abs_summarized_text])
-    return abs_summarized_text
-def preprocess_text_for_abstractive_summarization(tokenizer, text):
-    sentences = sent_tokenize(text)
-    # initialize
-    length = 0
-    chunk = ""
-    chunks = []
-    count = -1
-    for sentence in sentences:
-        count += 1
-        combined_length = (
-            len(tokenizer.tokenize(sentence)) + length
-        )  # add the no. of sentence tokens to the length counter
-        if combined_length <= tokenizer.max_len_single_sentence:  # if it doesn't exceed
-            chunk += sentence + " "  # add the sentence to the chunk
-            length = combined_length  # update the length counter
-            # if it is the last sentence
-            if count == len(sentences) - 1:
-                chunks.append(chunk.strip())  # save the chunk
-        else:
-            chunks.append(chunk.strip())  # save the chunk
-            # reset
-            length = 0
-            chunk = ""
-            # take care of the overflow sentence
-            chunk += sentence + " "
-            length = len(tokenizer.tokenize(sentence))
-    return chunks

src/utils.py → utils.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import re
 import requests
 from bs4 import BeautifulSoup
 emoji_pattern = re.compile(
     "["
@@ -59,3 +60,39 @@ def fetch_article_text(url: str):
         chunks[chunk_id] = " ".join(chunks[chunk_id])
     return ARTICLE, chunks

 import re
 import requests
 from bs4 import BeautifulSoup
+from nltk.tokenize import sent_tokenize
 emoji_pattern = re.compile(
     "["
         chunks[chunk_id] = " ".join(chunks[chunk_id])
     return ARTICLE, chunks
+def preprocess_text_for_abstractive_summarization(tokenizer, text):
+    sentences = sent_tokenize(text)
+    # initialize
+    length = 0
+    chunk = ""
+    chunks = []
+    count = -1
+    for sentence in sentences:
+        count += 1
+        combined_length = (
+            len(tokenizer.tokenize(sentence)) + length
+        )  # add the no. of sentence tokens to the length counter
+        if combined_length <= tokenizer.max_len_single_sentence:  # if it doesn't exceed
+            chunk += sentence + " "  # add the sentence to the chunk
+            length = combined_length  # update the length counter
+            # if it is the last sentence
+            if count == len(sentences) - 1:
+                chunks.append(chunk.strip())  # save the chunk
+        else:
+            chunks.append(chunk.strip())  # save the chunk
+            # reset
+            length = 0
+            chunk = ""
+            # take care of the overflow sentence
+            chunk += sentence + " "
+            length = len(tokenizer.tokenize(sentence))
+    return chunks