m-ric HF staff committed on
Commit
1fa958e
β€’
1 Parent(s): e13bfd4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -34
app.py CHANGED
@@ -10,6 +10,10 @@ LABEL_RECURSIVE = "πŸ¦œπŸ”— LangChain's RecursiveCharacterTextSplitter"
10
 
11
  bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
12
 
 
 
 
 
13
  def extract_separators_from_string(separators_str):
14
  try:
15
  separators = separators_str[1:-1].split(", ")
@@ -31,42 +35,24 @@ def change_split_selection(text, slider_count, split_selection, separator_select
31
  def chunk(text, length, splitter_selection, separators_str, length_unit_selection):
32
  separators = extract_separators_from_string(separators_str)
33
  print(splitter_selection, length_unit_selection.lower())
 
34
  if splitter_selection == LABEL_TEXTSPLITTER:
35
- if "token" in length_unit_selection.lower():
36
- text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
37
- bert_tokenizer,
38
- separator="",
39
- chunk_size=length,
40
- chunk_overlap=0,
41
- is_separator_regex=False,
42
- )
43
- else:
44
- text_splitter = CharacterTextSplitter(
45
- separator="",
46
- chunk_size=length,
47
- chunk_overlap=0,
48
- length_function=len,
49
- is_separator_regex=False,
50
- )
51
  elif splitter_selection == LABEL_RECURSIVE:
52
- if "token" in length_unit_selection.lower():
53
- text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
54
- bert_tokenizer,
55
- chunk_size=length,
56
- chunk_overlap=0,
57
- add_start_index=True,
58
- strip_whitespace=False,
59
- separators=separators,
60
- )
61
- else:
62
- text_splitter = RecursiveCharacterTextSplitter(
63
- chunk_size=length,
64
- chunk_overlap=0,
65
- length_function=len,
66
- add_start_index=True,
67
- strip_whitespace=False,
68
- separators=separators,
69
- )
70
  splits = text_splitter.create_documents([text])
71
  text_splits = [split.page_content for split in splits]
72
 
 
10
 
11
  bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
12
 
13
def length_tokens(txt):
    """Return the length of *txt* measured in BERT tokens.

    Uses the module-level ``bert_tokenizer`` (a ``google-bert/bert-base-uncased``
    ``AutoTokenizer``) so chunk sizes can be expressed in tokens instead of
    characters.
    """
    tokens = bert_tokenizer.tokenize(txt)
    return len(tokens)
15
+
16
+
17
  def extract_separators_from_string(separators_str):
18
  try:
19
  separators = separators_str[1:-1].split(", ")
 
35
  def chunk(text, length, splitter_selection, separators_str, length_unit_selection):
36
  separators = extract_separators_from_string(separators_str)
37
  print(splitter_selection, length_unit_selection.lower())
38
+ length_function = (length_tokens if "token" in length_unit_selection.lower() else len)
39
  if splitter_selection == LABEL_TEXTSPLITTER:
40
+ text_splitter = CharacterTextSplitter(
41
+ chunk_size=length,
42
+ chunk_overlap=0,
43
+ length_function=length_function,
44
+ strip_whitespace=False,
45
+ is_separator_regex=False,
46
+ separator="",
47
+ )
 
 
 
 
 
 
 
 
48
  elif splitter_selection == LABEL_RECURSIVE:
49
+ text_splitter = RecursiveCharacterTextSplitter(
50
+ chunk_size=length,
51
+ chunk_overlap=0,
52
+ length_function=length_function,
53
+ strip_whitespace=False,
54
+ separators=separators,
55
+ )
 
 
 
 
 
 
 
 
 
 
 
56
  splits = text_splitter.create_documents([text])
57
  text_splits = [split.page_content for split in splits]
58