tommasobaldi committed on
Commit
69f90b2
1 Parent(s): 92212fb

working on text splitting

Browse files
Files changed (1) hide show
  1. app.py +15 -14
app.py CHANGED
@@ -98,20 +98,21 @@ def main() -> None:
98
  # return tuple(summarizer.abstractive_summary(list(summary_sentence)))
99
 
100
  def split_text(text: str) -> list:
101
- sentences = sent_tokenize(text)
102
- # token_count = 0
103
- # text_block = ""
104
- # result = []
105
- # for sentence in sentences:
106
- # tokens = word_tokenize(sentence)
107
- # if token_count + len(tokens) < 500:
108
- # token_count += len(tokens)
109
- # text_block += " ".join(sentence)
110
- # else:
111
- # result.append(text_block)
112
- # text_block = "".join(sentence)
113
- # token_count = len(tokens)
114
- return sentences
 
115
 
116
  pipe = create_pipeline()
117
 
 
98
  # return tuple(summarizer.abstractive_summary(list(summary_sentence)))
99
 
100
  def split_text(text: str) -> list:
101
+ sentences = sent_tokenize(text, language="english")
102
+
103
+ token_count = 0
104
+ text_block = ""
105
+ result = []
106
+ for sentence in sentences:
107
+ tokens = word_tokenize(sentence, language="english", preserve_line=True)
108
+ if token_count + len(tokens) < 500:
109
+ token_count += len(tokens)
110
+ text_block += " ".join(sentence)
111
+ else:
112
+ result.append(text_block)
113
+ text_block = "".join(sentence)
114
+ token_count = len(tokens)
115
+ return result
116
 
117
  pipe = create_pipeline()
118