ffreemt committed on
Commit
c9772a1
·
1 Parent(s): 50c6a2e

Update progressbar

Browse files
Files changed (1) hide show
  1. app.py +47 -18
app.py CHANGED
@@ -126,14 +126,18 @@ CHROMA_SETTINGS = Settings(
126
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
127
 
128
  ns_initial = SimpleNamespace(
129
- qa=None,
130
  ingest_done=None,
131
  files_info=None,
132
  files_uploaded=[],
133
  db_ready=None,
 
 
 
134
  )
135
  ns = deepcopy(ns_initial)
136
 
 
137
  def load_single_document(file_path: str | Path) -> List[Document]:
138
  """Loads a single document from a file path."""
139
  try:
@@ -295,30 +299,32 @@ def upload_files(files):
295
 
296
  def process_files(
297
  # file_paths,
298
- progress=gr.Progress()
299
  ):
300
  """Process uploaded files."""
301
  if not ns.files_uploaded:
302
  return f"No files uploaded: {ns.files_uploaded}"
303
 
 
 
 
304
  logger.debug(f"{ns.files_uploaded}")
305
 
306
  logger.info(f"ingest({ns.files_uploaded})...")
307
 
308
  # imgs = [None] * 24
309
  # for img in progress.tqdm(imgs, desc="Loading from list"):
310
- # time.sleep(0.1)
311
 
312
- imgs = [[None] * 8] * 3
313
- for img_set in progress.tqdm(imgs, desc="Nested list"):
314
- time.sleep(.2)
315
- for img in progress.tqdm(img_set, desc="inner list"):
316
- time.sleep(10.1)
317
 
318
- return f"done file(s): {ns.files_info}"
319
  # return f"done file(s)"
320
 
321
- _ = """
322
  documents = []
323
  for file_path in progress.tqdm(ns.files_uploaded, desc="Reading file(s)"):
324
  logger.debug(f"Doing {file_path}")
@@ -327,7 +333,35 @@ def process_files(
327
  logger.debug("Done reading files.")
328
  except Exception as exc:
329
  logger.error(f"{file_path}: {exc}")
330
- # """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
  ns.ingest_done = True
333
 
@@ -400,9 +434,7 @@ def ingest(
400
  # client_settings=CHROMA_SETTINGS,
401
  )
402
  # for text in progress.tqdm(
403
- for text in tqdm(
404
- mit.chunked_even(texts, 101), total=ceil(len(texts) / 101)
405
- ):
406
  db.add_documents(documents=text)
407
 
408
  _ = """
@@ -632,10 +664,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
632
  logger.info("Done loading qa, need to do just one time.")
633
  # """
634
  if ns.qa is None:
635
- bot_message = (
636
- "Looks like the bot is not ready. "
637
- "Try again later..."
638
- )
639
  chat_history.append((message, bot_message))
640
  return "", chat_history
641
 
 
126
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
127
 
128
  ns_initial = SimpleNamespace(
129
+ qa=None, # in effect Chroma db
130
  ingest_done=None,
131
  files_info=None,
132
  files_uploaded=[],
133
  db_ready=None,
134
+ chunk_size=250,
135
+ chunk_overlap=250,
136
+ model_name=MODEL_NAME,
137
  )
138
  ns = deepcopy(ns_initial)
139
 
140
+
141
  def load_single_document(file_path: str | Path) -> List[Document]:
142
  """Loads a single document from a file path."""
143
  try:
 
299
 
300
  def process_files(
301
  # file_paths,
302
+ progress=gr.Progress(),
303
  ):
304
  """Process uploaded files."""
305
  if not ns.files_uploaded:
306
  return f"No files uploaded: {ns.files_uploaded}"
307
 
308
+ # wait for update before querying new ns.qa
309
+ ns.ingest_done = False
310
+
311
  logger.debug(f"{ns.files_uploaded}")
312
 
313
  logger.info(f"ingest({ns.files_uploaded})...")
314
 
315
  # imgs = [None] * 24
316
  # for img in progress.tqdm(imgs, desc="Loading from list"):
317
+ # time.sleep(0.1)
318
 
319
+ # imgs = [[None] * 8] * 3
320
+ # for img_set in progress.tqdm(imgs, desc="Nested list"):
321
+ # time.sleep(.2)
322
+ # for img in progress.tqdm(img_set, desc="inner list"):
323
+ # time.sleep(10.1)
324
 
325
+ # return f"done file(s): {ns.files_info}"
326
  # return f"done file(s)"
327
 
 
328
  documents = []
329
  for file_path in progress.tqdm(ns.files_uploaded, desc="Reading file(s)"):
330
  logger.debug(f"Doing {file_path}")
 
333
  logger.debug("Done reading files.")
334
  except Exception as exc:
335
  logger.error(f"{file_path}: {exc}")
336
+
337
+ text_splitter = RecursiveCharacterTextSplitter(
338
+ chunk_size=ns.chunk_size, chunk_overlap=ns.chunk_overlap
339
+ )
340
+ texts = text_splitter.split_documents(documents)
341
+
342
+ logger.info(f"Loaded {len(ns.files_uploaded)} files ")
343
+ logger.info(f"Loaded {len(documents)} documents ")
344
+ logger.info(f"Split into {len(texts)} chunks of text")
345
+
346
+ # initialize if necessary
347
+ if ns.qa is None:
348
+ embeddings = SentenceTransformerEmbeddings(
349
+ model_name=ns.model_name, model_kwargs={"device": DEVICE}
350
+ )
351
+
352
+ ns.qa = Chroma(
353
+ # persist_directory=PERSIST_DIRECTORY,
354
+ embedding_function=embeddings,
355
+ # client_settings=CHROMA_SETTINGS,
356
+ )
357
+
358
+ # for text in progress.tqdm(
359
+ for text in tqdm(
360
+ mit.chunked_even(texts, 101),
361
+ total=ceil(len(texts) / 101),
362
+ desc="Processing docs",
363
+ ):
364
+ ns.qa.add_documents(documents=text)
365
 
366
  ns.ingest_done = True
367
 
 
434
  # client_settings=CHROMA_SETTINGS,
435
  )
436
  # for text in progress.tqdm(
437
+ for text in tqdm(mit.chunked_even(texts, 101), total=ceil(len(texts) / 101)):
 
 
438
  db.add_documents(documents=text)
439
 
440
  _ = """
 
664
  logger.info("Done loading qa, need to do just one time.")
665
  # """
666
  if ns.qa is None:
667
+ bot_message = "Looks like the bot is not ready. " "Try again later..."
 
 
 
668
  chat_history.append((message, bot_message))
669
  return "", chat_history
670