Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -25,19 +25,19 @@ def extract_separators_from_string(separators_str):
|
|
25 |
Please type it in the correct format: "['separator_1', 'separator_2', etc]"
|
26 |
""")
|
27 |
|
28 |
-
def change_split_selection(text, slider_count, split_selection, separator_selection, length_unit_selection):
|
29 |
return (
|
30 |
gr.Textbox.update(visible=(split_selection==LABEL_RECURSIVE)),
|
31 |
-
chunk(text, slider_count, split_selection, separator_selection, length_unit_selection)
|
32 |
)
|
33 |
|
34 |
-
def chunk(text, length, splitter_selection, separators_str, length_unit_selection):
|
35 |
separators = extract_separators_from_string(separators_str)
|
36 |
length_function = (length_tokens if "token" in length_unit_selection.lower() else len)
|
37 |
if splitter_selection == LABEL_TEXTSPLITTER:
|
38 |
text_splitter = CharacterTextSplitter(
|
39 |
chunk_size=length,
|
40 |
-
chunk_overlap=
|
41 |
length_function=length_function,
|
42 |
strip_whitespace=False,
|
43 |
is_separator_regex=False,
|
@@ -46,7 +46,7 @@ def chunk(text, length, splitter_selection, separators_str, length_unit_selectio
|
|
46 |
elif splitter_selection == LABEL_RECURSIVE:
|
47 |
text_splitter = RecursiveCharacterTextSplitter(
|
48 |
chunk_size=length,
|
49 |
-
chunk_overlap=
|
50 |
length_function=length_function,
|
51 |
strip_whitespace=False,
|
52 |
separators=separators,
|
@@ -54,7 +54,9 @@ def chunk(text, length, splitter_selection, separators_str, length_unit_selectio
|
|
54 |
splits = text_splitter.create_documents([text])
|
55 |
text_splits = [split.page_content for split in splits]
|
56 |
|
57 |
-
|
|
|
|
|
58 |
return output
|
59 |
|
60 |
|
@@ -134,6 +136,9 @@ with gr.Blocks(theme=gr.themes.Soft(text_size='lg', font=["monospace"], primary_
|
|
134 |
slider_count = gr.Slider(
|
135 |
20, 500, value=200, label="Chunk length ๐", info="In the chosen unit."
|
136 |
)
|
|
|
|
|
|
|
137 |
out = gr.HighlightedText(
|
138 |
label="Output",
|
139 |
show_legend=True,
|
@@ -141,22 +146,27 @@ with gr.Blocks(theme=gr.themes.Soft(text_size='lg', font=["monospace"], primary_
|
|
141 |
)
|
142 |
text.change(
|
143 |
fn=chunk,
|
144 |
-
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
145 |
outputs=out,
|
146 |
)
|
147 |
length_unit_selection.change(
|
148 |
fn=chunk,
|
149 |
-
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
150 |
outputs=out,
|
151 |
)
|
152 |
split_selection.change(
|
153 |
fn=change_split_selection,
|
154 |
-
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
155 |
outputs=[separator_selection, out],
|
156 |
)
|
157 |
slider_count.change(
|
158 |
fn=chunk,
|
159 |
-
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
|
|
|
|
|
|
|
|
|
|
160 |
outputs=out,
|
161 |
)
|
162 |
demo.launch()
|
|
|
25 |
Please type it in the correct format: "['separator_1', 'separator_2', etc]"
|
26 |
""")
|
27 |
|
28 |
+
def change_split_selection(text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap):
|
29 |
return (
|
30 |
gr.Textbox.update(visible=(split_selection==LABEL_RECURSIVE)),
|
31 |
+
chunk(text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap)
|
32 |
)
|
33 |
|
34 |
+
def chunk(text, length, splitter_selection, separators_str, length_unit_selection, chunk_overlap):
|
35 |
separators = extract_separators_from_string(separators_str)
|
36 |
length_function = (length_tokens if "token" in length_unit_selection.lower() else len)
|
37 |
if splitter_selection == LABEL_TEXTSPLITTER:
|
38 |
text_splitter = CharacterTextSplitter(
|
39 |
chunk_size=length,
|
40 |
+
chunk_overlap=chunk_overlap,
|
41 |
length_function=length_function,
|
42 |
strip_whitespace=False,
|
43 |
is_separator_regex=False,
|
|
|
46 |
elif splitter_selection == LABEL_RECURSIVE:
|
47 |
text_splitter = RecursiveCharacterTextSplitter(
|
48 |
chunk_size=length,
|
49 |
+
chunk_overlap=chunk_overlap,
|
50 |
length_function=length_function,
|
51 |
strip_whitespace=False,
|
52 |
separators=separators,
|
|
|
54 |
splits = text_splitter.create_documents([text])
|
55 |
text_splits = [split.page_content for split in splits]
|
56 |
|
57 |
+
unoverlapped_text_splits = unoverlap_list(text_splits)
|
58 |
+
|
59 |
+
output = [((split[0], 0) if split[1] else (split[0], str(i+1))) for i, split in enumerate(unoverlapped_text_splits)]
|
60 |
return output
|
61 |
|
62 |
|
|
|
136 |
slider_count = gr.Slider(
|
137 |
20, 500, value=200, label="Chunk length ๐", info="In the chosen unit."
|
138 |
)
|
139 |
+
chunk_overlap = gr.Slider(
|
140 |
+
0, 30, value=10, label="Overlap between chunks", info="In the chosen unit."
|
141 |
+
)
|
142 |
out = gr.HighlightedText(
|
143 |
label="Output",
|
144 |
show_legend=True,
|
|
|
146 |
)
|
147 |
text.change(
|
148 |
fn=chunk,
|
149 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
|
150 |
outputs=out,
|
151 |
)
|
152 |
length_unit_selection.change(
|
153 |
fn=chunk,
|
154 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
|
155 |
outputs=out,
|
156 |
)
|
157 |
split_selection.change(
|
158 |
fn=change_split_selection,
|
159 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
|
160 |
outputs=[separator_selection, out],
|
161 |
)
|
162 |
slider_count.change(
|
163 |
fn=chunk,
|
164 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
|
165 |
+
outputs=out,
|
166 |
+
)
|
167 |
+
chunk_overlap.change(
|
168 |
+
fn=chunk,
|
169 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
|
170 |
outputs=out,
|
171 |
)
|
172 |
demo.launch()
|