ffreemt committed
Commit e831b10
1 Parent(s): 7e0d59b

Update main.py
Files changed (7)
  1. README.md +1 -1
  2. app.py +151 -78
  3. docs/test2.txt +2 -0
  4. main.py +50 -0
  5. requirements-freeze.txt +179 -0
  6. requirements-win10-cpu.txt +33 -0
  7. requirements.txt +2 -2
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
 colorTo: red
 sdk: gradio
 sdk_version: 3.33.1
-app_file: app.py
+app_file: main.py
 pinned: false
 license: mit
 ---
app.py CHANGED
@@ -19,7 +19,7 @@ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20
 texts = text_splitter.split_documents(docs)
 
 model_name = "hkunlp/instructor-base"
-embeddings = HuggingFaceInstructEmbeddings(
+embedding = HuggingFaceInstructEmbeddings(
     model_name=model_name, model_kwargs={"device": device}
 )
 
@@ -28,11 +28,11 @@ embeddings = HuggingFaceInstructEmbeddings(
 # both 99 chunks, Wall time: 5min 4s CPU times: total: 13min 31s
 # chunks = len / 800
 
-db = Chroma.from_documents(texts, embeddings)
+db = Chroma.from_documents(texts, embedding)
 
 db = Chroma.from_documents(
     texts,
-    embeddings,
+    embedding,
     persist_directory=PERSIST_DIRECTORY,
     client_settings=CHROMA_SETTINGS,
 )
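For orientation, the flow this hunk renames (`embeddings` to `embedding`) is the usual LangChain split-embed-store pipeline. A minimal self-contained sketch of that pattern under the API pinned in requirements-freeze.txt; the sample document, model name, and persist path are placeholders, not values from this repo:

```python
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# a stand-in document; the app loads real files via load_single_document
docs = [Document(page_content="hello world " * 200, metadata={"source": "demo.txt"})]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(docs)

embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# in-memory variant, as in the first call above
db = Chroma.from_documents(texts, embedding)

# persisted variant, as in the second call (directory is a placeholder)
db = Chroma.from_documents(texts, embedding, persist_directory="db")
```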
@@ -126,7 +126,8 @@ CHROMA_SETTINGS = Settings(
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 ns_initial = SimpleNamespace(
-    qa=None,  # in effect Chroma db
+    db=None,
+    qa=None,
     ingest_done=None,
     files_info=None,
     files_uploaded=[],
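The new `db` slot now sits beside `qa` in the mutable app state; `ns_initial` acts as a pristine template that the reset handler copies. A small sketch of why `deepcopy` matters for the mutable `files_uploaded` list (illustrative only):

```python
from copy import deepcopy
from types import SimpleNamespace

ns_initial = SimpleNamespace(
    db=None, qa=None, ingest_done=None, files_info=None, files_uploaded=[]
)
ns = deepcopy(ns_initial)

ns.files_uploaded.append("doc1.pdf")
ns = deepcopy(ns_initial)        # "reset everything"
assert ns.files_uploaded == []   # deepcopy kept the template's list untouched
```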
@@ -229,17 +230,17 @@ def get_vectorstore(
     persist=True,
 ):
     """Gen vectorstore."""
-    # embeddings = OpenAIEmbeddings()
+    # embedding = OpenAIEmbeddings()
     # for HuggingFaceInstructEmbeddings
     model_name = "hkunlp/instructor-xl"
     model_name = "hkunlp/instructor-large"
     model_name = "hkunlp/instructor-base"
 
-    # embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
+    # embedding = HuggingFaceInstructEmbeddings(model_name=model_name)
 
     model_name = MODEL_NAME
     logger.info(f"Loading {model_name}")
-    embeddings = SentenceTransformerEmbeddings(model_name=model_name)
+    embedding = SentenceTransformerEmbeddings(model_name=model_name)
     logger.info(f"Done loading {model_name}")
 
     if vectorstore is None:
@@ -247,20 +248,20 @@ def get_vectorstore(
 
     if vectorstore.lower() in ["chroma"]:
         logger.info(
-            "Doing vectorstore Chroma.from_texts(texts=text_chunks, embedding=embeddings)"
+            "Doing vectorstore Chroma.from_texts(texts=text_chunks, embedding=embedding)"
         )
         if persist:
             vectorstore = Chroma.from_texts(
                 texts=text_chunks,
-                embedding=embeddings,
+                embedding=embedding,
                 persist_directory=PERSIST_DIRECTORY,
                 client_settings=CHROMA_SETTINGS,
             )
         else:
-            vectorstore = Chroma.from_texts(texts=text_chunks, embedding=embeddings)
+            vectorstore = Chroma.from_texts(texts=text_chunks, embedding=embedding)
 
         logger.info(
-            "Done vectorstore FAISS.from_texts(texts=text_chunks, embedding=embeddings)"
+            "Done vectorstore FAISS.from_texts(texts=text_chunks, embedding=embedding)"
         )
 
         return vectorstore
@@ -268,11 +269,11 @@ def get_vectorstore(
     # if vectorstore.lower() not in ['chroma']
     # TODO handle other cases
     logger.info(
-        "Doing vectorstore FAISS.from_texts(texts=text_chunks, embedding=embeddings)"
+        "Doing vectorstore FAISS.from_texts(texts=text_chunks, embedding=embedding)"
    )
-    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embedding)
     logger.info(
-        "Done vectorstore FAISS.from_texts(texts=text_chunks, embedding=embeddings)"
+        "Done vectorstore FAISS.from_texts(texts=text_chunks, embedding=embedding)"
     )
 
     return vectorstore
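The non-Chroma branch falls back to FAISS (faiss-cpu is pinned in requirements-freeze.txt). A hedged sketch of that call; the chunks and model name are placeholders:

```python
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

text_chunks = ["alpha chunk of text", "beta chunk of text"]
embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# in-memory FAISS index over the chunks, same signature as the branch above
vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embedding)
print(vectorstore.similarity_search("alpha", k=1)[0].page_content)
```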
@@ -308,9 +309,7 @@ def process_files(
     # wait for update before querying new ns.qa
     ns.ingest_done = False
 
-    logger.debug(f"{ns.files_uploaded}")
-
-    logger.info(f"ingest({ns.files_uploaded})...")
+    logger.debug(f"ns.files_uploaded: {ns.files_uploaded}")
 
     # imgs = [None] * 24
     # for img in progress.tqdm(imgs, desc="Loading from list"):
@@ -322,17 +321,25 @@ def process_files(
     # for img in progress.tqdm(img_set, desc="inner list"):
     # time.sleep(10.1)
 
-    # return f"done file(s): {ns.files_info}"
-    # return f"done file(s)"
+    # return "done..."
 
     documents = []
-    for file_path in progress.tqdm(ns.files_uploaded, desc="Reading file(s)"):
-        logger.debug(f"Doing {file_path}")
-        try:
-            documents.extend(load_single_document(f"{file_path}"))
-            logger.debug("Done reading files.")
-        except Exception as exc:
-            logger.error(f"{file_path}: {exc}")
+    if progress is None:
+        for file_path in ns.files_uploaded:
+            logger.debug(f"-Doing {file_path}")
+            try:
+                documents.extend(load_single_document(f"{file_path}"))
+                logger.debug("-Done reading files.")
+            except Exception as exc:
+                logger.error(f"-{file_path}: {exc}")
+    else:
+        for file_path in progress.tqdm(ns.files_uploaded, desc="Reading file(s)"):
+            logger.debug(f"Doing {file_path}")
+            try:
+                documents.extend(load_single_document(f"{file_path}"))
+                logger.debug("Done reading files.")
+            except Exception as exc:
+                logger.error(f"{file_path}: {exc}")
 
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=ns.chunk_size, chunk_overlap=ns.chunk_overlap
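The new `if progress is None` guard lets process_files run both from a Gradio event (where a gr.Progress tracker is injected) and from a plain Python call. A minimal sketch of the pattern; the function name and file list are hypothetical:

```python
import gradio as gr

def read_files(progress=gr.Progress()):
    """Show a progress bar under Gradio; pass progress=None to run headless."""
    files = ["a.pdf", "b.epub"]  # hypothetical uploads
    iterable = files if progress is None else progress.tqdm(files, desc="Reading file(s)")
    for file_path in iterable:
        pass  # load_single_document(file_path) in the real app
    return "done"

print(read_files(progress=None))  # plain Python call, no Gradio context needed
```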
@@ -340,30 +347,52 @@ def process_files(
     texts = text_splitter.split_documents(documents)
 
     logger.info(f"Loaded {len(ns.files_uploaded)} files ")
-    logger.info(f"Loaded {len(documents)} documents ")
-    logger.info(f"Split into {len(texts)} chunks of text")
-
-    # initilize if necessary
-    if ns.qa is None:
-        embeddings = SentenceTransformerEmbeddings(
-            model_name=ns.model_name, model_kwargs={"device": DEVICE}
-        )
+    logger.info(f"Loaded {len(documents)} document(s) ")
+    logger.info(f"Split into {len(texts)} chunk(s) of text")
+
+    # initialize if necessary
+    if ns.db is None:
+        logger.info(f"loading {ns.model_name:}")
+        for _ in progress.tqdm(range(1), desc="diggin..."):
+            embedding = SentenceTransformerEmbeddings(
+                model_name=ns.model_name, model_kwargs={"device": DEVICE}
+            )
 
-        ns.qa = Chroma(
-            # persist_directory=PERSIST_DIRECTORY,
-            embedding_function=embeddings,
-            # client_settings=CHROMA_SETTINGS,
-        )
+        logger.info("creating vectorstore")
+        ns.db = Chroma(
+            # persist_directory=PERSIST_DIRECTORY,
+            embedding_function=embedding,
+            # client_settings=CHROMA_SETTINGS,
+        )
+        logger.info("done creating vectorstore")
 
     total = ceil(len(texts) / 101)
-    # for text in progress.tqdm(
-    for idx, text in enumerate(progress.tqdm(
-        mit.chunked_even(texts, 101),
-        total=total,
-        desc="Processing docs",
-    )):
-        logger.debug(f"{idx + 1} of {total}")
-        ns.qa.add_documents(documents=text)
+    if progress is None:
+        # for text in progress.tqdm(
+        for idx, text in enumerate(mit.chunked_even(texts, 101)):
+            logger.debug(f"-{idx + 1} of {total}")
+            ns.db.add_documents(documents=text)
+    else:
+        # for text in progress.tqdm(
+        for idx, text in enumerate(progress.tqdm(
+            mit.chunked_even(texts, 101),
+            total=total,
+            desc="Processing docs",
+        )):
+            logger.debug(f"{idx + 1} of {total}")
+            ns.db.add_documents(documents=text)
+    logger.debug(f" done all {total}")
+
+    # ns.qa = load_qa()
+
+    llm = OpenAI(temperature=0, max_tokens=1024)  # type: ignore
+    retriever = ns.db.as_retriever()
+    ns.qa = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=retriever,
+        # return_source_documents=True,
+    )
 
     ns.ingest_done = True
     _ = [
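Documents are now inserted into Chroma in batches of 101 via more_itertools.chunked_even, which splits a sequence into runs whose lengths differ by at most one; the hunk then builds ns.qa eagerly with an OpenAI LLM instead of deferring to load_qa(). A quick check of the batching helper:

```python
from math import ceil

import more_itertools as mit

texts = list(range(10))
total = ceil(len(texts) / 4)
batches = list(mit.chunked_even(texts, 4))
print(total, batches)  # 3 [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
```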
@@ -372,9 +401,55 @@ def process_files(
     ]
     ns.files_info = _
 
-    # ns.qa = load_qa()
+    logger.debug(f"{ns.ingest_done=}, exit process_files")
+    return f"done file(s): {dict(ns.files_info)}"
+
+
+def respond(message, chat_history):
+    """Gen response."""
+    logger.debug(f"{ns.files_uploaded=}")
+    if not ns.files_uploaded:  # no files processed yet
+        bot_message = "Upload some file(s) for processing first."
+        chat_history.append((message, bot_message))
+        return "", chat_history
 
-    return f"done file(s): {ns.files_info}"
+    logger.debug(f"{ns.ingest_done=}")
+    if not ns.ingest_done:  # embedding database not done yet
+        bot_message = (
+            "Waiting for ingest (embedding) to finish, "
+            "be patient... You can switch the 'Upload files' "
+            "Tab to check"
+        )
+        chat_history.append((message, bot_message))
+        return "", chat_history
+
+    _ = """
+    if ns.qa is None:  # load qa one time
+        logger.info("Loading qa, need to do just one time.")
+        ns.qa = load_qa()
+        logger.info("Done loading qa, need to do just one time.")
+    # """
+    logger.debug(f"{ns.qa=}")
+    if ns.qa is None:
+        bot_message = "Looks like the bot is not ready. Try again later..."
+        chat_history.append((message, bot_message))
+        return "", chat_history
+
+    try:
+        res = ns.qa(message)
+        answer = res.get("result")
+        docs = res.get("source_documents")
+        if docs:
+            bot_message = f"{answer}\n({docs})"
+        else:
+            bot_message = f"{answer}"
+    except Exception as exc:
+        logger.error(exc)
+        bot_message = f"bummer! {exc}"
+
+    chat_history.append((message, bot_message))
+
+    return "", chat_history
 
 
 # pylint disable=unused-argument
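respond is promoted to module level here (so main.py can import it) and keeps the (message, chat_history) -> ("", chat_history) shape that Gradio's Textbox.submit expects. A self-contained echo sketch of that contract:

```python
import gradio as gr

def respond(message, chat_history):
    """Echo handler with the same signature as the app's respond."""
    chat_history.append((message, f"echo: {message}"))
    return "", chat_history  # clear the textbox, refresh the chatbot

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Query")
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
```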
@@ -424,9 +499,9 @@ def ingest(
     logger.info(f"Loaded {len(documents)} documents ")
     logger.info(f"Split into {len(texts)} chunks of text")
 
-    # Create embeddings
-    # embeddings = HuggingFaceInstructEmbeddings(
-    embeddings = SentenceTransformerEmbeddings(
+    # Create embedding
+    # embedding = HuggingFaceInstructEmbeddings(
+    embedding = SentenceTransformerEmbeddings(
         model_name=model_name, model_kwargs={"device": device}
     )
 
@@ -437,7 +512,7 @@ def ingest(
     # mit.chunked_even(texts, 100)
     db = Chroma(
         # persist_directory=PERSIST_DIRECTORY,
-        embedding_function=embeddings,
+        embedding_function=embedding,
         # client_settings=CHROMA_SETTINGS,
     )
     # for text in progress.tqdm(
@@ -448,7 +523,7 @@ def ingest(
     with about_time() as atime:  # type: ignore
         db = Chroma.from_documents(
             texts,
-            embeddings,
+            embedding,
             persist_directory=PERSIST_DIRECTORY,
             client_settings=CHROMA_SETTINGS,
         )
@@ -512,7 +587,14 @@ def gen_local_llm(model_id="TheBloke/vicuna-7B-1.1-HF"):
 
 
 def load_qa(device=None, model_name: str = MODEL_NAME):
-    """Gen qa."""
+    """Gen qa.
+
+    device = 'cpu'
+    model_name = "hkunlp/instructor-xl"
+    model_name = "hkunlp/instructor-large"
+    model_name = "hkunlp/instructor-base"
+    embedding = HuggingFaceInstructEmbeddings(
+    """
     logger.info("Doing qa")
     if device is None:
         if torch.cuda.is_available():
@@ -520,19 +602,14 @@ def load_qa(device=None, model_name: str = MODEL_NAME):
     else:
         device = "cpu"
 
-    # device = 'cpu'
-    # model_name = "hkunlp/instructor-xl"
-    # model_name = "hkunlp/instructor-large"
-    # model_name = "hkunlp/instructor-base"
-    # embeddings = HuggingFaceInstructEmbeddings(
-    embeddings = SentenceTransformerEmbeddings(
+    embedding = SentenceTransformerEmbeddings(
         model_name=model_name, model_kwargs={"device": device}
     )
     # xl 4.96G, large 3.5G,
 
     db = Chroma(
         persist_directory=PERSIST_DIRECTORY,
-        embedding_function=embeddings,
+        embedding_function=embedding,
         client_settings=CHROMA_SETTINGS,
     )
     retriever = db.as_retriever()
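For reference, load_qa wires a persisted Chroma store into a RetrievalQA chain. A minimal sketch under the same LangChain-era API; the model name, directory, and an OPENAI_API_KEY in the environment are assumptions:

```python
from langchain.chains import RetrievalQA
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma

embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma(persist_directory="db", embedding_function=embedding)

qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0, max_tokens=1024),  # needs OPENAI_API_KEY set
    chain_type="stuff",
    retriever=db.as_retriever(),
)
res = qa("What do the docs say?")  # dict with "result" (plus sources if enabled)
```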
@@ -552,8 +629,7 @@ def load_qa(device=None, model_name: str = MODEL_NAME):
 
     return qa
 
-    # """
-
+    # TODO: conversation_chain
     # pylint: disable=unreachable
 
     # model = 'gpt-3.5-turbo', default text-davinci-003
@@ -615,7 +691,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(dedent(_))
 
     with gr.Tab("Upload files"):
-        # Upload files and generate embeddings database
+        # Upload files and generate vectorstore
         with gr.Row():
             file_output = gr.File()
             # file_output = gr.Text()
@@ -626,9 +702,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             file_count="multiple",
         )
         with gr.Row():
-            text2 = gr.Textbox("Progress/Log")
-            process_btn = gr.Button("Click to process files")
-            reset_btn = gr.Button("Reset everything")
+            text2 = gr.Textbox("Gen embedding")
+            process_btn = gr.Button("Click to embed")
+
+        # reset_btn = gr.Button("Reset everything", visible=False)
 
     with gr.Tab("Query docs"):
         # interactive chat
@@ -643,21 +720,24 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         ns = deepcopy(ns_initial)
         return f"reset done: ns={ns}"
 
-    reset_btn.click(reset_all, [], text2)
+    # reset_btn.click(reset_all, [], text2)
 
     upload_button.upload(upload_files, upload_button, file_output)
     process_btn.click(process_files, [], text2)
 
     def respond(message, chat_history):
         """Gen response."""
+        logger.info(f"{ns.ingest_done=}")
         if ns.ingest_done is None:  # no files processed yet
             bot_message = "Upload some file(s) for processing first."
             chat_history.append((message, bot_message))
             return "", chat_history
 
+        logger.info(f"{ns.ingest_done=}")
         if not ns.ingest_done:  # embedding database not done yet
             bot_message = (
                 "Waiting for ingest (embedding) to finish, "
+                f"({ns.ingest_done=})"
                 "be patient... You can switch the 'Upload files' "
                 "Tab to check"
             )
@@ -695,13 +775,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     clear.click(lambda: None, None, chatbot, queue=False)
 
 if __name__ == "__main__":
-    # main()
-    try:
-        from google import colab  # noqa # type: ignore
-
-        share = True  # start share when in colab
-    except Exception:
-        share = False
     demo.queue(concurrency_count=20).launch(share=share)
 
 _ = """
@@ -710,12 +783,12 @@ device = 'cpu'
 model_name = "hkunlp/instructor-xl"
 model_name = "hkunlp/instructor-large"
 model_name = "hkunlp/instructor-base"
-embeddings = HuggingFaceInstructEmbeddings(
+embedding = HuggingFaceInstructEmbeddings(
     model_name=,
     model_kwargs={"device": device}
 )
 # xl 4.96G, large 3.5G,
-db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding, client_settings=CHROMA_SETTINGS)
 retriever = db.as_retriever()
 
 llm = gen_local_llm()  # "TheBloke/vicuna-7B-1.1-HF" 12G?
docs/test2.txt ADDED
@@ -0,0 +1,2 @@
+总 纲
+  中国共产党是中国工人阶级的先锋队,同时是中国人民和中华民族的先锋队,是中国特色社会主义事业的领导核心,代表中国先进生产力的发展要求,代表中国先进文化的前进方向,代表中国最广大人民的根本利益。党的最高理想和最终目标是实现共产主义。
main.py ADDED
@@ -0,0 +1,50 @@
+"""Test."""
+# pylint: disable=invalid-name, unused-import, broad-except,
+from copy import deepcopy
+
+import gradio as gr
+from app import ingest, ns, ns_initial, process_files, upload_files, respond
+from load_api_key import load_api_key, pk_base, sk_base
+from loguru import logger
+
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    with gr.Tab("Upload files"):
+        # Upload files and generate vectorstore
+        with gr.Row():
+            file_output = gr.File()
+            # file_output = gr.Text()
+            # file_output = gr.DataFrame()
+            upload_button = gr.UploadButton(
+                "Click to upload",
+                # file_types=["*.pdf", "*.epub", "*.docx"],
+                file_count="multiple",
+            )
+        with gr.Row():
+            text2 = gr.Textbox("Gen embedding")
+            process_btn = gr.Button("Click to embed")
+
+        reset_btn = gr.Button("Reset everything", visible=False)
+
+    with gr.Tab("Query docs"):
+        # interactive chat
+        chatbot = gr.Chatbot()
+        msg = gr.Textbox(label="Query")
+        clear = gr.Button("Clear")
+
+    # actions
+    def reset_all():
+        """Reset ns."""
+        # global ns
+        globals().update(**{"ns": deepcopy(ns_initial)})
+        return f"reset done: ns={ns}"
+
+    reset_btn.click(reset_all, [], text2)
+
+    upload_button.upload(upload_files, upload_button, file_output)
+    process_btn.click(process_files, [], text2)
+
+    msg.submit(respond, [msg, chatbot], [msg, chatbot])
+    clear.click(lambda: None, None, chatbot, queue=False)
+
+if __name__ == "__main__":
+    demo.queue(concurrency_count=20).launch()
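One subtlety in main.py's reset_all: a plain `ns = deepcopy(ns_initial)` inside the function would only bind a local name, which is presumably why the code goes through globals().update instead. A stripped-down demonstration of that pattern:

```python
from copy import deepcopy
from types import SimpleNamespace

ns_initial = SimpleNamespace(files_uploaded=[])
ns = deepcopy(ns_initial)

def reset_all():
    """Rebind the module-level ns; a bare `ns = ...` would only shadow it locally."""
    globals().update(**{"ns": deepcopy(ns_initial)})
    return f"reset done: ns={ns}"

ns.files_uploaded.append("x")
print(reset_all())  # ns is a fresh copy again
```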
requirements-freeze.txt ADDED
@@ -0,0 +1,179 @@
+about-time==4.2.1
+absl-py==0.11.0
+accelerate==0.19.0
+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==5.0.1
+analytics-python==1.4.post1
+anyio==3.7.0
+argilla==1.8.0
+astroid==2.15.5
+asttokens==2.2.1
+async-timeout==4.0.2
+attrs==23.1.0
+backcall==0.2.0
+backoff==1.10.0
+bcrypt==4.0.1
+bitsandbytes==0.39.0
+black==23.3.0
+certifi==2023.5.7
+cffi==1.15.1
+chardet==5.1.0
+charset-normalizer==3.1.0
+chromadb==0.3.22
+click==8.1.3
+clickhouse-connect==0.5.25
+colorama==0.4.6
+commonmark==0.9.1
+contourpy==1.0.7
+cryptography==41.0.1
+cycler==0.11.0
+dataclasses-json==0.5.7
+decorator==5.1.1
+Deprecated==1.2.14
+dill==0.3.6
+docx2txt==0.8
+duckdb==0.8.0
+EbookLib==0.17.1
+epub2txt==0.1.6
+et-xmlfile==1.1.0
+exceptiongroup==1.1.1
+executing==1.2.0
+faiss-cpu==1.7.4
+fastapi==0.96.0
+ffmpy==0.3.0
+filelock==3.12.0
+fonttools==4.39.4
+frozenlist==1.3.3
+fsspec==2023.5.0
+gradio==3.35.2
+gradio_client==0.2.7
+greenlet==2.0.2
+h11==0.12.0
+hnswlib==0.7.0
+httpcore==0.12.3
+httptools==0.5.0
+httpx==0.16.1
+huggingface-hub==0.15.1
+idna==3.4
+InstructorEmbedding==1.0.1
+ipython==8.14.0
+isort==5.12.0
+jedi==0.18.2
+Jinja2==3.1.2
+joblib==1.2.0
+jsonschema==4.17.3
+kiwisolver==1.4.4
+langchain==0.0.166
+lazy-object-proxy==1.9.0
+linkify-it-py==2.0.2
+llama-cpp-python==0.1.48
+llama-index==0.6.21.post1
+loguru==0.7.0
+logzero==1.7.0
+lxml==4.9.2
+lz4==4.3.2
+Markdown==3.4.3
+markdown-it-py==2.2.0
+MarkupSafe==2.1.3
+marshmallow==3.19.0
+marshmallow-enum==1.5.1
+matplotlib==3.7.1
+matplotlib-inline==0.1.6
+mccabe==0.7.0
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+monotonic==1.6
+more-itertools==9.1.0
+mpmath==1.3.0
+msg-parser==1.2.0
+multidict==6.0.4
+mypy-extensions==1.0.0
+networkx==3.1
+nltk==3.8.1
+numexpr==2.8.4
+numpy==1.23.5
+olefile==0.46
+openai==0.27.8
+openapi-schema-pydantic==1.2.4
+openpyxl==3.1.2
+orjson==3.9.0
+packaging==23.1
+pandas==1.5.3
+paramiko==3.2.0
+parso==0.8.3
+pathspec==0.11.1
+pdfminer.six==20221105
+pickleshare==0.7.5
+Pillow==9.5.0
+platformdirs==3.5.1
+posthog==3.0.1
+prompt-toolkit==3.0.38
+protobuf==3.20.0
+psutil==5.9.5
+pure-eval==0.2.2
+pycparser==2.21
+pycryptodome==3.18.0
+pydantic==1.10.8
+pydub==0.25.1
+Pygments==2.15.1
+pylint==2.17.4
+PyNaCl==1.5.0
+pypandoc==1.11
+pyparsing==3.0.9
+pypdf==3.9.1
+PyPDF2==3.0.1
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-docx==0.8.11
+python-dotenv==1.0.0
+python-magic==0.4.27
+python-multipart==0.0.6
+python-pptx==0.6.21
+pytz==2023.3
+PyYAML==6.0
+regex==2023.6.3
+requests==2.31.0
+rfc3986==1.5.0
+rich==13.0.1
+scikit-learn==1.2.2
+scipy==1.10.1
+semantic-version==2.10.0
+sentence-transformers==2.2.2
+sentencepiece==0.1.99
+six==1.16.0
+sniffio==1.3.0
+SQLAlchemy==2.0.15
+stack-data==0.6.2
+starlette==0.27.0
+sympy==1.12
+tabulate==0.9.0
+tenacity==8.2.2
+threadpoolctl==3.1.0
+tiktoken==0.4.0
+tokenizers==0.13.3
+tomli==2.0.1
+tomlkit==0.11.8
+toolz==0.12.0
+torch==2.0.1
+torchvision==0.15.2
+tqdm==4.65.0
+traitlets==5.9.0
+transformers==4.29.2
+typer==0.9.0
+typing-inspect==0.8.0
+typing_extensions==4.5.0
+tzdata==2023.3
+uc-micro-py==1.0.2
+urllib3==1.26.6
+uvicorn==0.22.0
+watchfiles==0.19.0
+wcwidth==0.2.6
+websockets==11.0.3
+win32-setctime==1.1.0
+wrapt==1.14.1
+xlrd==2.0.1
+XlsxWriter==3.1.2
+yarl==1.9.2
+zstandard==0.21.0
requirements-win10-cpu.txt ADDED
@@ -0,0 +1,33 @@
+langchain==0.0.166
+chromadb==0.3.22
+llama-cpp-python==0.1.48
+urllib3==1.26.6
+pdfminer.six==20221105
+InstructorEmbedding
+
+# required by sentence-transformers
+# do not use the following in windows. it will cause
+# "Throws a silent error if function takes more than 5 seconds #3078" issue https://github.com/gradio-app/gradio/issues/3078
+# --extra-index-url https://download.pytorch.org/whl/cpu
+torch
+torchvision
+sentence-transformers
+faiss-cpu
+huggingface_hub
+transformers
+protobuf==3.20.0
+accelerate
+bitsandbytes
+# click
+openpyxl
+loguru
+gradio
+charset-normalizer
+PyPDF2
+epub2txt
+docx2txt
+
+about-time
+openai
+more-itertools
+# tqdm
requirements.txt CHANGED
@@ -16,7 +16,7 @@ transformers
 protobuf==3.20.0
 accelerate
 bitsandbytes
-click
+# click
 openpyxl
 loguru
 gradio
@@ -28,4 +28,4 @@ docx2txt
 about-time
 openai
 more-itertools
-tqdm
+# tqdm