Gladiator committed
Commit e9ee3ed
1 Parent(s): 8809824

restructure dir

app.py CHANGED
@@ -5,8 +5,9 @@ from transformers import AutoTokenizer, pipeline
 
 # local modules
 from extractive_summarizer.model_processors import Summarizer
-from src.utils import clean_text, fetch_article_text
-from src.abstractive_summarizer import (
+from utils import (
+    clean_text,
+    fetch_article_text,
     preprocess_text_for_abstractive_summarization,
 )
 
@@ -85,7 +86,6 @@ if __name__ == "__main__":
         text_to_summarize = preprocess_text_for_abstractive_summarization(
             tokenizer=abs_tokenizer, text=clean_txt
         )
-        print(text_to_summarize)
         tmp_sum = abs_summarizer(
             text_to_summarize,
             max_length=abs_max_length,
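For orientation, here is a minimal sketch of how the consolidated utils import is exercised on the abstractive path. It is not app.py itself: the checkpoint name, the URL, the length values, and the clean_text call signature are assumptions standing in for objects the app defines elsewhere (abs_tokenizer, abs_summarizer, abs_max_length).

# Sketch only: checkpoint, URL, and length values are assumptions, not app.py's settings.
from transformers import AutoTokenizer, pipeline

from utils import (
    clean_text,
    fetch_article_text,
    preprocess_text_for_abstractive_summarization,
)

abs_tokenizer = AutoTokenizer.from_pretrained("t5-small")     # assumed checkpoint
abs_summarizer = pipeline("summarization", model="t5-small")  # assumed checkpoint

article, _ = fetch_article_text(url="https://example.com/article")  # placeholder URL
clean_txt = clean_text(article)  # call signature assumed

# Split the cleaned article into tokenizer-sized chunks, then summarize each chunk.
text_to_summarize = preprocess_text_for_abstractive_summarization(
    tokenizer=abs_tokenizer, text=clean_txt
)
tmp_sum = abs_summarizer(text_to_summarize, max_length=120, min_length=30)  # lengths assumed
summary = " ".join(piece["summary_text"] for piece in tmp_sum)
print(summary)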
src/abstractive_summarizer.py DELETED
@@ -1,52 +0,0 @@
-import torch
-from nltk.tokenize import sent_tokenize
-from transformers import T5Tokenizer
-
-
-def abstractive_summarizer(tokenizer, model, text):
-    # inputs to the model
-    inputs = [tokenizer(f"summarize: {chunk}", return_tensors="pt") for chunk in text]
-    abs_summarized_text = []
-    for input in inputs:
-        output = model.generate(input["input_ids"])
-        tmp_sum = tokenizer.decode(output[0], skip_special_tokens=True)
-        abs_summarized_text.append(tmp_sum)
-
-    abs_summarized_text = " ".join([summ for summ in abs_summarized_text])
-    return abs_summarized_text
-
-
-def preprocess_text_for_abstractive_summarization(tokenizer, text):
-    sentences = sent_tokenize(text)
-
-    # initialize
-    length = 0
-    chunk = ""
-    chunks = []
-    count = -1
-    for sentence in sentences:
-        count += 1
-        combined_length = (
-            len(tokenizer.tokenize(sentence)) + length
-        )  # add the no. of sentence tokens to the length counter
-
-        if combined_length <= tokenizer.max_len_single_sentence:  # if it doesn't exceed
-            chunk += sentence + " "  # add the sentence to the chunk
-            length = combined_length  # update the length counter
-
-            # if it is the last sentence
-            if count == len(sentences) - 1:
-                chunks.append(chunk.strip())  # save the chunk
-
-        else:
-            chunks.append(chunk.strip())  # save the chunk
-
-            # reset
-            length = 0
-            chunk = ""
-
-            # take care of the overflow sentence
-            chunk += sentence + " "
-            length = len(tokenizer.tokenize(sentence))
-
-    return chunks
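Before this commit, the deleted generation helper could be driven roughly as in the sketch below; the import path reflects the old src/ layout, and the checkpoint name and sample chunks are assumptions.

# Sketch only: pre-commit import path; checkpoint and sample chunks are assumptions.
from transformers import T5ForConditionalGeneration, T5Tokenizer

from src.abstractive_summarizer import abstractive_summarizer  # old src/ layout

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

chunks = [
    "First chunk of a long article about renewable energy ...",
    "Second chunk continuing the same article ...",
]
print(abstractive_summarizer(tokenizer, model, chunks))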
src/utils.py → utils.py RENAMED
@@ -1,6 +1,7 @@
 import re
 import requests
 from bs4 import BeautifulSoup
+from nltk.tokenize import sent_tokenize
 
 emoji_pattern = re.compile(
     "["
@@ -59,3 +60,39 @@ def fetch_article_text(url: str):
         chunks[chunk_id] = " ".join(chunks[chunk_id])
 
     return ARTICLE, chunks
+
+
+def preprocess_text_for_abstractive_summarization(tokenizer, text):
+    sentences = sent_tokenize(text)
+
+    # initialize
+    length = 0
+    chunk = ""
+    chunks = []
+    count = -1
+    for sentence in sentences:
+        count += 1
+        combined_length = (
+            len(tokenizer.tokenize(sentence)) + length
+        )  # add the no. of sentence tokens to the length counter
+
+        if combined_length <= tokenizer.max_len_single_sentence:  # if it doesn't exceed
+            chunk += sentence + " "  # add the sentence to the chunk
+            length = combined_length  # update the length counter
+
+            # if it is the last sentence
+            if count == len(sentences) - 1:
+                chunks.append(chunk.strip())  # save the chunk
+
+        else:
+            chunks.append(chunk.strip())  # save the chunk
+
+            # reset
+            length = 0
+            chunk = ""
+
+            # take care of the overflow sentence
+            chunk += sentence + " "
+            length = len(tokenizer.tokenize(sentence))
+
+    return chunks
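A quick way to exercise the relocated chunking helper is the minimal sketch below; the checkpoint name and sample text are assumptions, and NLTK's punkt data must be available for sent_tokenize.

# Sketch only: checkpoint name and sample text are assumptions.
import nltk
from transformers import T5Tokenizer

from utils import preprocess_text_for_abstractive_summarization

nltk.download("punkt")  # sentence model used by sent_tokenize

tokenizer = T5Tokenizer.from_pretrained("t5-small")
long_text = "Solar capacity grew again last year. " * 300  # stand-in for a fetched article

chunks = preprocess_text_for_abstractive_summarization(tokenizer, long_text)
for i, chunk in enumerate(chunks):
    # each chunk is built to stay within tokenizer.max_len_single_sentence tokens
    print(i, len(tokenizer.tokenize(chunk)), "tokens")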