tommasobaldi committed
Commit
cb047cb
1 Parent(s): eda5d31

add summary cleaning function

Files changed (1)
  1. app.py +11 -10
app.py CHANGED
@@ -25,10 +25,19 @@ def main() -> None:
         tos_pipeline = pipeline(task="summarization",
                                 model="ML-unipi/bart-large-tos",
                                 tokenizer="ML-unipi/bart-large-tos",
-                                device=0
                                 )
         return tos_pipeline
 
+    def clean_summaries(text: str) -> list:
+        result = []
+        lines = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
+        for line in lines:
+            if line.find(".") != -1:
+                line = line.replace("..", ".")
+                result.append(line)
+        return result
+
+
     def display_summary(summary_sentences: list) -> None:
         st.subheader("Summary :male-detective:")
         for sentence in summary_sentences:
@@ -56,7 +65,6 @@ def main() -> None:
         cumulative_token_length = 0
 
         for sentence in sentences:
-            # token_list = [token for token in nltk.word_tokenize(sentence)]
            token_list = tokenizer(sentence, max_length=1024, truncation=True)
             token_length = len(token_list["input_ids"])
             if token_length > 10:
@@ -103,16 +111,9 @@ def main() -> None:
             split_token_length=1024
             )
         for sentence in sentences:
-            # token_list = [token for token in nltk.word_tokenize(sentence)]
-            # st.markdown(sentence)
-            # st.markdown(str(len(token_list)))
             output = pipe(sentence)
             summary = output[0]["summary_text"]
-
-            for line in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', summary):
-                if line.find(".") != -1:
-                    line = line.replace("..", ".")
-                    summary_sentences.append(line)
+            summary_sentences += clean_summaries(summary)
         display_summary(summary_sentences)
 
 
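For context, a minimal standalone sketch of what the new clean_summaries helper does, using a made-up summary string for illustration (the sample text is not from the app):

import re

def clean_summaries(text: str) -> list:
    # split the model output on sentence boundaries (same regex as in app.py)
    result = []
    lines = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    for line in lines:
        if line.find(".") != -1:
            # collapse the double periods the summarizer sometimes emits
            line = line.replace("..", ".")
            result.append(line)
    return result

sample = "The provider may suspend accounts without notice.. Users are informed by email. fragment without a period"
print(clean_summaries(sample))
# ['The provider may suspend accounts without notice.', 'Users are informed by email.']

Factoring this out of the summarization loop lets each pipeline output be cleaned with a single call (summary_sentences += clean_summaries(summary)) and makes the splitting logic easier to test in isolation.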