xangma committed on
Commit
0f43d97
1 Parent(s): b10b3aa

more fixes!

Browse files
Files changed (2) hide show
  1. app.py +18 -1
  2. ingest.py +6 -4
app.py CHANGED
@@ -172,6 +172,23 @@ with block:
172
  with gr.Column(scale=2):
173
  all_collections_to_get = gr.List(headers=['New Collections to make'],row_count=3, label='Collections_to_get', show_label=True, interactive=True, max_cols=1, max_rows=3)
174
  make_collections_button = gr.Button(value="Make new collection(s)", variant="secondary").style(full_width=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  with gr.Column(scale=2):
176
  collections_viewer = gr.CheckboxGroup(choices=[], label='Collections_viewer', show_label=True)
177
  with gr.Column(scale=1):
@@ -193,7 +210,7 @@ with block:
193
  message.submit(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
194
 
195
  load_collections_button.click(merge_collections, inputs=[collections_viewer, vs_state], outputs=[vs_state])#.then(change_tab, None, tabs) #.then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state])
196
- make_collections_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get], outputs=[all_collections_state], show_progress=True).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
197
  delete_collections_button.click(delete_collection, inputs=[all_collections_state, collections_viewer], outputs=[all_collections_state, collections_viewer]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
198
  delete_all_collections_button.click(delete_all_collections, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
199
  get_all_collection_names_button.click(list_collections, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
 
172
  with gr.Column(scale=2):
173
  all_collections_to_get = gr.List(headers=['New Collections to make'],row_count=3, label='Collections_to_get', show_label=True, interactive=True, max_cols=1, max_rows=3)
174
  make_collections_button = gr.Button(value="Make new collection(s)", variant="secondary").style(full_width=False)
175
+ with gr.Row():
176
+ chunk_size_textbox = gr.Textbox(
177
+ placeholder="Chunk size",
178
+ label="Chunk size",
179
+ show_label=True,
180
+ lines=1,
181
+ )
182
+ chunk_overlap_textbox = gr.Textbox(
183
+ placeholder="Chunk overlap",
184
+ label="Chunk overlap",
185
+ show_label=True,
186
+ lines=1,
187
+ )
188
+ chunk_size_textbox.value = "1000"
189
+ chunk_overlap_textbox.value = "1000"
190
+ with gr.Row():
191
+ gr.HTML('<center>See the <a href=https://python.langchain.com/en/latest/reference/modules/text_splitter.html>Langchain textsplitter docs</a></center>')
192
  with gr.Column(scale=2):
193
  collections_viewer = gr.CheckboxGroup(choices=[], label='Collections_viewer', show_label=True)
194
  with gr.Column(scale=1):
 
210
  message.submit(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
211
 
212
  load_collections_button.click(merge_collections, inputs=[collections_viewer, vs_state], outputs=[vs_state])#.then(change_tab, None, tabs) #.then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state])
213
+ make_collections_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get, chunk_size_textbox, chunk_overlap_textbox], outputs=[all_collections_state], show_progress=True).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
214
  delete_collections_button.click(delete_collection, inputs=[all_collections_state, collections_viewer], outputs=[all_collections_state, collections_viewer]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
215
  delete_all_collections_button.click(delete_all_collections, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
216
  get_all_collection_names_button.click(list_collections, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
ingest.py CHANGED
@@ -69,7 +69,7 @@ def get_text(content):
69
  else:
70
  return ""
71
 
72
- def ingest_docs(all_collections_state, urls):
73
  """Get documents from web pages."""
74
  all_docs = []
75
 
@@ -77,9 +77,9 @@ def ingest_docs(all_collections_state, urls):
77
  documents = []
78
  shutil.rmtree('downloaded/', ignore_errors=True)
79
  known_exts = ["py", "md"]
80
- py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
81
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
82
- md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
83
  for url in urls:
84
  paths_by_ext = {}
85
  docs_by_ext = {}
@@ -131,6 +131,8 @@ def ingest_docs(all_collections_state, urls):
131
  res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
132
  folder = destination
133
  local_repo_path_1 = folder
 
 
134
  for root, dirs, files in os.walk(local_repo_path_1):
135
  for file in files:
136
  file_path = os.path.join(root, file)
 
69
  else:
70
  return ""
71
 
72
+ def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap):
73
  """Get documents from web pages."""
74
  all_docs = []
75
 
 
77
  documents = []
78
  shutil.rmtree('downloaded/', ignore_errors=True)
79
  known_exts = ["py", "md"]
80
+ py_splitter = PythonCodeTextSplitter(chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap))
81
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap))
82
+ md_splitter = MarkdownTextSplitter(chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap))
83
  for url in urls:
84
  paths_by_ext = {}
85
  docs_by_ext = {}
 
131
  res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
132
  folder = destination
133
  local_repo_path_1 = folder
134
+ if local_repo_path_1 == '.':
135
+ local_repo_path_1 = os.getcwd()
136
  for root, dirs, files in os.walk(local_repo_path_1):
137
  for file in files:
138
  file_path = os.path.join(root, file)