tommasobaldi committed on
Commit
c1aef33
1 Parent(s): f6ab2e2

working on text splitting

Browse files
Files changed (1) hide show
  1. app.py +2 -2
app.py CHANGED
@@ -56,8 +56,8 @@ def main() -> None:
56
 
57
  for sentence in sentences:
58
  # token_list = [token for token in nltk.word_tokenize(sentence)]
59
- token_list = tokenizer(sentence, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
60
- token_length = len(token_list)
61
  if token_length + cumulative_token_length > split_token_length and result_list:
62
  accumulated_lists.append(join_sentences(result_list))
63
  result_list = [sentence]
 
56
 
57
  for sentence in sentences:
58
  # token_list = [token for token in nltk.word_tokenize(sentence)]
59
+ token_list = tokenizer(sentence, max_length=1024, truncation=True)
60
+ token_length = len(token_list["input_ids"])
61
  if token_length + cumulative_token_length > split_token_length and result_list:
62
  accumulated_lists.append(join_sentences(result_list))
63
  result_list = [sentence]