clementsan committed on
Commit
9bf736d
1 Parent(s): 5b34e60

Improve creation of collection name

Browse files
Files changed (1) hide show
  1. app.py +20 -13
app.py CHANGED
@@ -20,6 +20,7 @@ import transformers
20
  import torch
21
  import tqdm
22
  import accelerate
 
23
 
24
 
25
 
@@ -179,27 +180,33 @@ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, pr
179
  return qa_chain
180
 
181
 
182
- # Initialize database
183
- def initialize_database(list_file_obj, chunk_size, chunk_overlap, progress=gr.Progress()):
184
- # Create list of documents (when valid)
185
- list_file_path = [x.name for x in list_file_obj if x is not None]
186
- # Create collection_name for vector database
187
- progress(0.1, desc="Creating collection name...")
188
- collection_name = Path(list_file_path[0]).stem
189
  # Fix potential issues from naming convention
190
  ## Remove space
191
  collection_name = collection_name.replace(" ","-")
192
  ## ASCII transliterations of Unicode text
193
  collection_name = unidecode(collection_name)
 
 
 
194
  ## Limit length to 50 characters
195
  collection_name = collection_name[:50]
196
- ## Enforce start and end as alphanumeric character
197
- if not collection_name[0].isalnum():
198
- collection_name[0] = 'A'
199
- if not collection_name[-1].isalnum():
200
- collection_name[-1] = 'Z'
201
- # print('list_file_path: ', list_file_path)
202
  print('Collection name: ', collection_name)
 
 
 
 
 
 
 
 
 
 
203
  progress(0.25, desc="Loading document...")
204
  # Load document and create splits
205
  doc_splits = load_doc(list_file_path, chunk_size, chunk_overlap)
 
20
  import torch
21
  import tqdm
22
  import accelerate
23
+ import re
24
 
25
 
26
 
 
180
  return qa_chain
181
 
182
 
183
+ # Generate collection name for vector database
184
+ # - Use filepath as input, ensuring unicode text
185
+ def create_collection_name(filepath):
186
+ # Extract filename without extension
187
+ collection_name = Path(filepath).stem
 
 
188
  # Fix potential issues from naming convention
189
  ## Remove space
190
  collection_name = collection_name.replace(" ","-")
191
  ## ASCII transliterations of Unicode text
192
  collection_name = unidecode(collection_name)
193
+ ## Remove special characters
194
+ #collection_name = re.findall("[\dA-Za-z]*", collection_name)[0]
195
+ collection_name = re.sub('[^A-Za-z0-9]+', '-', collection_name)
196
  ## Limit length to 50 characters
197
  collection_name = collection_name[:50]
198
+ print('Filepath: ', filepath)
 
 
 
 
 
199
  print('Collection name: ', collection_name)
200
+ return collection_name
201
+
202
+
203
+ # Initialize database
204
+ def initialize_database(list_file_obj, chunk_size, chunk_overlap, progress=gr.Progress()):
205
+ # Create list of documents (when valid)
206
+ list_file_path = [x.name for x in list_file_obj if x is not None]
207
+ # Create collection_name for vector database
208
+ progress(0.1, desc="Creating collection name...")
209
+ collection_name = create_collection_name(list_file_path[0])
210
  progress(0.25, desc="Loading document...")
211
  # Load document and create splits
212
  doc_splits = load_doc(list_file_path, chunk_size, chunk_overlap)