tommasobaldi committed on
Commit
795ee13
1 Parent(s): c1aef33

working on text splitting

Browse files
Files changed (1) hide show
  1. app.py +8 -7
app.py CHANGED
@@ -58,13 +58,14 @@ def main() -> None:
58
  # token_list = [token for token in nltk.word_tokenize(sentence)]
59
  token_list = tokenizer(sentence, max_length=1024, truncation=True)
60
  token_length = len(token_list["input_ids"])
61
- if token_length + cumulative_token_length > split_token_length and result_list:
62
- accumulated_lists.append(join_sentences(result_list))
63
- result_list = [sentence]
64
- cumulative_token_length = token_length
65
- else:
66
- result_list.append(sentence)
67
- cumulative_token_length += token_length
 
68
  if result_list:
69
  accumulated_lists.append(join_sentences(result_list))
70
  return accumulated_lists
 
58
  # token_list = [token for token in nltk.word_tokenize(sentence)]
59
  token_list = tokenizer(sentence, max_length=1024, truncation=True)
60
  token_length = len(token_list["input_ids"])
61
+ if token_length > 10:
62
+ if token_length + cumulative_token_length > split_token_length and result_list:
63
+ accumulated_lists.append(join_sentences(result_list))
64
+ result_list = [sentence]
65
+ cumulative_token_length = token_length
66
+ else:
67
+ result_list.append(sentence)
68
+ cumulative_token_length += token_length
69
  if result_list:
70
  accumulated_lists.append(join_sentences(result_list))
71
  return accumulated_lists