ffreemt committed
Commit 50c6a2e
1 Parent(s): ebbd809

Update progressbar

Files changed (9)
  1. .editorconfig +10 -0
  2. .gitignore +3 -0
  3. app-org.py +526 -0
  4. app.py +364 -159
  5. docs/test.sdlxliff +0 -0
  6. load_api_key.py +38 -0
  7. package.json +20 -0
  8. requirements.txt +3 -1
  9. yarn.lock +23 -0
.editorconfig ADDED
@@ -0,0 +1,10 @@
+ root = true
+
+ [*]
+ end_of_line = lf
+ insert_final_newline = true
+
+ [*.{js,json,yml}]
+ charset = utf-8
+ indent_style = space
+ indent_size = 2
.gitignore CHANGED
@@ -4,3 +4,6 @@ dummy
  .ENV
  .env
  __pycache__
+ .yarn
+ .chroma
+ .pnp.cjs
app-org.py ADDED
@@ -0,0 +1,526 @@
+ """Refer to
+ https://huggingface.co/spaces/mikeee/docs-chat/blob/main/app.py
+ and https://github.com/PromtEngineer/localGPT/blob/main/ingest.py
+
+ https://python.langchain.com/en/latest/getting_started/tutorials.html
+
+ unstructured: python-magic python-docx python-pptx
+ from langchain.document_loaders import UnstructuredHTMLLoader
+
+ docs = []
+ # for doc in Path('docs').glob("*.pdf"):
+ for doc in Path('docs').glob("*"):
+ # for doc in Path('docs').glob("*.txt"):
+     docs.append(load_single_document(f"{doc}"))
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+ texts = text_splitter.split_documents(docs)
+
+ model_name = "hkunlp/instructor-base"
+ embeddings = HuggingFaceInstructEmbeddings(
+     model_name=model_name, model_kwargs={"device": device}
+ )
+
+ # constitution.pdf 54344, 72 chunks Wall time: 3min 13s CPU times: total: 9min 4s @golay
+ # test.txt 21286, 27 chunks, Wall time: 47 s CPU times: total: 2min 30s @golay
+ # both 99 chunks, Wall time: 5min 4s CPU times: total: 13min 31s
+ # chunks = len / 800
+
+ db = Chroma.from_documents(texts, embeddings)
+
+ db = Chroma.from_documents(
+     texts,
+     embeddings,
+     persist_directory=PERSIST_DIRECTORY,
+     client_settings=CHROMA_SETTINGS,
+ )
+ db.persist()
+
+ # 中国共产党章程.txt (Constitution of the Communist Party of China) qa
+ https://github.com/xanderma/Assistant-Attop/blob/master/Release/%E6%96%87%E5%AD%97%E7%89%88%E9%A2%98%E5%BA%93/31.%E4%B8%AD%E5%9B%BD%E5%85%B1%E4%BA%A7%E5%85%9A%E7%AB%A0%E7%A8%8B.txt
+
+ colab CPU test.text constitution.pdf
+ CPU times: user 1min 27s, sys: 8.09 s, total: 1min 35s
+ Wall time: 1min 37s
+
+ """
+ # pylint: disable=broad-exception-caught, unused-import, invalid-name, line-too-long, too-many-return-statements, import-outside-toplevel, no-name-in-module, no-member
+ import os
+ import time
+ from pathlib import Path
+ from textwrap import dedent
+ from types import SimpleNamespace
+
+ import gradio as gr
+ import torch
+ from charset_normalizer import detect
+ from chromadb.config import Settings
+ from epub2txt import epub2txt
+ from langchain.chains import RetrievalQA
+ from langchain.docstore.document import Document
+ from langchain.document_loaders import (
+     CSVLoader,
+     Docx2txtLoader,
+     PDFMinerLoader,
+     TextLoader,
+ )
+
+ # from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
+ from langchain.llms import HuggingFacePipeline
+ from langchain.text_splitter import (
+     # CharacterTextSplitter,
+     RecursiveCharacterTextSplitter,
+ )
+
+ # FAISS instead of PineCone
+ from langchain.vectorstores import Chroma  # FAISS,
+ from loguru import logger
+ # from PyPDF2 import PdfReader  # localgpt
+ from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
+
+ # import click
+ # from typing import List
+
+ # from utils import xlxs_to_csv
+
+ # load possible env such as OPENAI_API_KEY
+ # from dotenv import load_dotenv
+
+ # load_dotenv()
+
+ # fix timezone
+ os.environ["TZ"] = "Asia/Shanghai"
+ try:
+     time.tzset()  # type: ignore # pylint: disable=no-member
+ except Exception:
+     # Windows
+     logger.warning("Windows, can't run time.tzset()")
+
+ ROOT_DIRECTORY = Path(__file__).parent
+ PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/db"
+
+ # Define the Chroma settings
+ CHROMA_SETTINGS = Settings(
+     chroma_db_impl="duckdb+parquet",
+     persist_directory=PERSIST_DIRECTORY,
+     anonymized_telemetry=False,
+ )
+ ns = SimpleNamespace(qa=None, ingest_done=None, files_info=None)
+
+
+ def load_single_document(file_path: str | Path) -> Document:
+     """ingest.py"""
+     # Loads a single document from a file path
+     # encoding = detect(open(file_path, "rb").read()).get("encoding", "utf-8")
+     encoding = detect(Path(file_path).read_bytes()).get("encoding", "utf-8")
+     if file_path.endswith(".txt"):
+         if encoding is None:
+             logger.warning(
+                 f" {file_path}'s encoding is None "
+                 "Something is fishy, return empty str "
+             )
+             return Document(page_content="", metadata={"source": file_path})
+
+         try:
+             loader = TextLoader(file_path, encoding=encoding)
+         except Exception as exc:
+             logger.warning(f" {exc}, return dummy ")
+             return Document(page_content="", metadata={"source": file_path})
+
+     elif file_path.endswith(".pdf"):
+         loader = PDFMinerLoader(file_path)
+     elif file_path.endswith(".csv"):
+         loader = CSVLoader(file_path)
+     elif Path(file_path).suffix in [".docx"]:
+         try:
+             loader = Docx2txtLoader(file_path)
+         except Exception as exc:
+             logger.error(f" {file_path} errors: {exc}")
+             return Document(page_content="", metadata={"source": file_path})
+     elif Path(file_path).suffix in [".epub"]:  # for epub? epub2txt unstructured
+         try:
+             _ = epub2txt(file_path)
+         except Exception as exc:
+             logger.error(f" {file_path} errors: {exc}")
+             return Document(page_content="", metadata={"source": file_path})
+         return Document(page_content=_, metadata={"source": file_path})
+     else:
+         if encoding is None:
+             logger.warning(
+                 f" {file_path}'s encoding is None "
+                 "Likely binary files, return empty str "
+             )
+             return Document(page_content="", metadata={"source": file_path})
+         try:
+             loader = TextLoader(file_path)
+         except Exception as exc:
+             logger.error(f" {exc}, returning empty string")
+             return Document(page_content="", metadata={"source": file_path})
+
+     return loader.load()[0]
+
+
+ def get_pdf_text(pdf_docs):
+     """docs-chat."""
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(f"{pdf}")  # taking care of Path
+         for page in pdf_reader.pages:
+             text += page.extract_text()
+     return text
+
+
+ def get_text_chunks(text):
+     """docs-chat."""
+     text_splitter = CharacterTextSplitter(
+         separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
+     )
+     chunks = text_splitter.split_text(text)
+     return chunks
+
+
+ def get_vectorstore(text_chunks):
+     """docs-chat."""
+     # embeddings = OpenAIEmbeddings()
+     model_name = "hkunlp/instructor-xl"
+     model_name = "hkunlp/instructor-large"
+     model_name = "hkunlp/instructor-base"
+     logger.info(f"Loading {model_name}")
+     embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
+     logger.info(f"Done loading {model_name}")
+
+     logger.info(
+         "Doing vectorstore FAISS.from_texts(texts=text_chunks, embedding=embeddings)"
+     )
+     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+     logger.info(
+         "Done vectorstore FAISS.from_texts(texts=text_chunks, embedding=embeddings)"
+     )
+
+     return vectorstore
+
+
+ def greet(name):
+     """Test."""
+     logger.debug(f" name: [{name}] ")
+     return "Hello " + name + "!!"
+
+
+ def upload_files(files):
+     """Upload files."""
+     file_paths = [file.name for file in files]
+     logger.info(file_paths)
+
+     ns.ingest_done = False
+     res = ingest(file_paths)
+     logger.info(f"Processed:\n{res}")
+
+     # flag ns.qadone
+     ns.ingest_done = True
+     ns.files_info = res
+
+     # ns.qa = load_qa()
+
+     # return [str(elm) for elm in res]
+     return file_paths
+
+     # return ingest(file_paths)
+
+
+ def ingest(
+     file_paths: list[str | Path], model_name="hkunlp/instructor-base", device_type=None
+ ):
+     """Gen Chroma db.
+
+     torch.cuda.is_available()
+
+     file_paths =
+     ['C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\41b53dd5f203b423f2dced44eaf56e72508b7bbe\\app.py',
+     'C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\9390755bb391abc530e71a3946a7b50d463ba0ef\\README.md',
+     'C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\3341f9a410a60ffa57bf4342f3018a3de689f729\\requirements.txt']
+     """
+     logger.info("\n\t Doing ingest...")
+
+     if device_type is None:
+         if torch.cuda.is_available():
+             device_type = "cuda"
+         else:
+             device_type = "cpu"
+
+     if device_type in ["cpu", "CPU"]:
+         device = "cpu"
+     elif device_type in ["mps", "MPS"]:
+         device = "mps"
+     else:
+         device = "cuda"
+
+     # Load documents and split in chunks
+     # logger.info(f"Loading documents from {SOURCE_DIRECTORY}")
+     # documents = load_documents(SOURCE_DIRECTORY)
+
+     documents = []
+     for file_path in file_paths:
+         documents.append(load_single_document(f"{file_path}"))
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     texts = text_splitter.split_documents(documents)
+
+     logger.info(f"Loaded {len(documents)} documents ")
+     logger.info(f"Split into {len(texts)} chunks of text")
+
+     # Create embeddings
+     embeddings = HuggingFaceInstructEmbeddings(
+         model_name=model_name, model_kwargs={"device": device}
+     )
+
+     db = Chroma.from_documents(
+         texts,
+         embeddings,
+         persist_directory=PERSIST_DIRECTORY,
+         client_settings=CHROMA_SETTINGS,
+     )
+     db.persist()
+     db = None
+     logger.info("Done ingest")
+
+     return [
+         [Path(doc.metadata.get("source")).name, len(doc.page_content)]
+         for doc in documents
+     ]
+
+
+ # TheBloke/Wizard-Vicuna-7B-Uncensored-HF
+ # https://huggingface.co/TheBloke/vicuna-7B-1.1-HF
+ def gen_local_llm(model_id="TheBloke/vicuna-7B-1.1-HF"):
+     """Gen a local llm.
+
+     localgpt run_localgpt
+     https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2
+     with torch.device("cuda"):
+         model = AutoModelForCausalLM.from_pretrained("gpt2-large", torch_dtype=torch.float16)
+
+     model = BetterTransformer.transform(model)
+     """
+     tokenizer = LlamaTokenizer.from_pretrained(model_id)
+     if torch.cuda.is_available():
+         model = LlamaForCausalLM.from_pretrained(
+             model_id,
+             # load_in_8bit=True,  # set these options if your GPU supports them!
+             # device_map=1,  # "auto",
+             torch_dtype=torch.float16,
+             low_cpu_mem_usage=True,
+         )
+     else:
+         model = LlamaForCausalLM.from_pretrained(model_id)
+
+     pipe = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         max_length=2048,
+         temperature=0,
+         top_p=0.95,
+         repetition_penalty=1.15,
+     )
+
+     local_llm = HuggingFacePipeline(pipeline=pipe)
+     return local_llm
+
+
+ def load_qa(device=None, model_name: str = "hkunlp/instructor-base"):
+     """Gen qa."""
+     logger.info("Doing qa")
+     if device is None:
+         if torch.cuda.is_available():
+             device = "cuda"
+         else:
+             device = "cpu"
+
+     # device = 'cpu'
+     # model_name = "hkunlp/instructor-xl"
+     # model_name = "hkunlp/instructor-large"
+     # model_name = "hkunlp/instructor-base"
+     embeddings = HuggingFaceInstructEmbeddings(
+         model_name=model_name, model_kwargs={"device": device}
+     )
+     # xl 4.96G, large 3.5G,
+     db = Chroma(
+         persist_directory=PERSIST_DIRECTORY,
+         embedding_function=embeddings,
+         client_settings=CHROMA_SETTINGS,
+     )
+     retriever = db.as_retriever()
+
+     llm = gen_local_llm()  # "TheBloke/vicuna-7B-1.1-HF" 12G?
+
+     qa = RetrievalQA.from_chain_type(
+         llm=llm,
+         chain_type="stuff",
+         retriever=retriever,
+         return_source_documents=True,
+     )
+
+     logger.info("Done qa")
+
+     return qa
+
+
+ def main1():
+     """Lump codes."""
+     with gr.Blocks() as demo:
+         iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+         iface.launch()
+
+     demo.launch()
+
+
+ def main():
+     """Do blocks."""
+     logger.info(f"ROOT_DIRECTORY: {ROOT_DIRECTORY}")
+
+     openai_api_key = os.getenv("OPENAI_API_KEY")
+     logger.info(f"openai_api_key (env var/hf space SECRETS): {openai_api_key}")
+
+     with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         # name = gr.Textbox(label="Name")
+         # greet_btn = gr.Button("Submit")
+         # output = gr.Textbox(label="Output Box")
+         # greet_btn.click(fn=greet, inputs=name, outputs=output, api_name="greet")
+         with gr.Accordion("Info", open=False):
+             _ = """
+             # localgpt
+             Talk to your docs (.pdf, .docx, .epub, .txt, .md and
+             other text docs). It
+             takes quite a while to ingest docs (10-30 min. depending
+             on net, RAM, CPU etc.).
+
+             Send an empty query (hit Enter) to check embedding status and files info ([filename, number of chars])
+
+             Homepage: https://huggingface.co/spaces/mikeee/localgpt
+             """
+             gr.Markdown(dedent(_))
+
+         # with gr.Accordion("Upload files", open=True):
+         with gr.Tab("Upload files"):
+             # Upload files and generate embeddings database
+             file_output = gr.File()
+             upload_button = gr.UploadButton(
+                 "Click to upload files",
+                 # file_types=["*.pdf", "*.epub", "*.docx"],
+                 file_count="multiple",
+             )
+             upload_button.upload(upload_files, upload_button, file_output)
+
+         with gr.Tab("Query docs"):
+             # interactive chat
+             chatbot = gr.Chatbot()
+             msg = gr.Textbox(label="Query")
+             clear = gr.Button("Clear")
+
+             def respond(message, chat_history):
+                 # bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
+                 if ns.ingest_done is None:  # no files processed yet
+                     bot_message = "Upload some file(s) for processing first."
+                     chat_history.append((message, bot_message))
+                     return "", chat_history
+
+                 if not ns.ingest_done:  # embedding database not done yet
+                     bot_message = (
+                         "Waiting for ingest (embedding) to finish, "
+                         "be patient... You can switch to the 'Upload files' "
+                         "Tab to check"
+                     )
+                     chat_history.append((message, bot_message))
+                     return "", chat_history
+
+                 if ns.qa is None:  # load qa one time
+                     logger.info("Loading qa, need to do just one time.")
+                     ns.qa = load_qa()
+
+                 try:
+                     res = ns.qa(message)
+                     answer, docs = res["result"], res["source_documents"]
+                     bot_message = f"{answer} ({docs})"
+                 except Exception as exc:
+                     logger.error(exc)
+                     bot_message = f"bummer! {exc}"
+
+                 chat_history.append((message, bot_message))
+
+                 return "", chat_history
+
+             msg.submit(respond, [msg, chatbot], [msg, chatbot])
+             clear.click(lambda: None, None, chatbot, queue=False)
+
+     try:
+         from google import colab  # noqa
+
+         share = True  # start share when in colab
+     except Exception:
+         share = False
+     demo.launch(share=share)
+
+
+ if __name__ == "__main__":
+     main()
+
+ _ = """
+ run_localgpt
+ device = 'cpu'
+ model_name = "hkunlp/instructor-xl"
+ model_name = "hkunlp/instructor-large"
+ model_name = "hkunlp/instructor-base"
+ embeddings = HuggingFaceInstructEmbeddings(
+     model_name=model_name,
+     model_kwargs={"device": device}
+ )
+ # xl 4.96G, large 3.5G,
+ db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+ retriever = db.as_retriever()
+
+ llm = gen_local_llm()  # "TheBloke/vicuna-7B-1.1-HF" 12G?
+
+ qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
+
+ query = 'a'
+ res = qa(query)
+
+ ---
+ https://www.linkedin.com/pulse/build-qa-bot-over-private-data-openai-langchain-leo-wang
+
+ history = []
+
+ def user(user_message, history):
+     # Get response from QA chain
+     response = qa({"question": user_message, "chat_history": history})
+     # Append user message and response to chat history
+     history.append((user_message, response["answer"]))
+
+ ---
+ https://llamahub.ai/l/file-unstructured
+
+ from pathlib import Path
+ from llama_index import download_loader
+
+ UnstructuredReader = download_loader("UnstructuredReader")
+
+ loader = UnstructuredReader()
+ documents = loader.load_data(file=Path('./10k_filing.html'))
+
+ # --
+ from pathlib import Path
+ from llama_index import download_loader
+
+ # SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+ # FileNotFoundError: [Errno 2] No such file or directory
+
+ documents = SimpleDirectoryReader('./data').load_data()
+
+ loader = SimpleDirectoryReader('./data', file_extractor={
+     ".pdf": "UnstructuredReader",
+     ".html": "UnstructuredReader",
+     ".eml": "UnstructuredReader",
+     ".pptx": "PptxReader"
+ })
+ documents = loader.load_data()
+ """
app.py CHANGED
@@ -1,9 +1,12 @@
- """Refer to
- https://huggingface.co/spaces/mikeee/docs-chat/blob/main/app.py
+ """Refer to https://huggingface.co/spaces/mikeee/docs-chat/blob/main/app.py.
+
  and https://github.com/PromtEngineer/localGPT/blob/main/ingest.py

  https://python.langchain.com/en/latest/getting_started/tutorials.html

+ gradio.Progress example:
+ https://colab.research.google.com/github/gradio-app/gradio/blob/main/demo/progress/run.ipynb#scrollTo=2.8891853944186117e%2B38
+
  unstructured: python-magic python-docx python-pptx
  from langchain.document_loaders import UnstructuredHTMLLoader

@@ -34,6 +37,7 @@ db = Chroma.from_documents(
      client_settings=CHROMA_SETTINGS,
  )
  db.persist()
+ est. 1min/100 text1

  # 中国共产党章程.txt (Constitution of the Communist Party of China) qa
  https://github.com/xanderma/Assistant-Attop/blob/master/Release/%E6%96%87%E5%AD%97%E7%89%88%E9%A2%98%E5%BA%93/31.%E4%B8%AD%E5%9B%BD%E5%85%B1%E4%BA%A7%E5%85%9A%E7%AB%A0%E7%A8%8B.txt

@@ -43,19 +47,28 @@ CPU times: user 1min 27s, sys: 8.09 s, total: 1min 35s
  Wall time: 1min 37s

  """
- # pylint: disable=broad-exception-caught, unused-import, invalid-name, line-too-long, too-many-return-statements, import-outside-toplevel, no-name-in-module, no-member
+ # pylint: disable=broad-exception-caught, unused-import, invalid-name, line-too-long, too-many-return-statements, import-outside-toplevel, no-name-in-module, no-member, too-many-branches, unused-variable, too-many-arguments, global-statement
  import os
  import time
+ from copy import deepcopy
+ from math import ceil
  from pathlib import Path
+ from tempfile import _TemporaryFileWrapper
  from textwrap import dedent
  from types import SimpleNamespace
+ from typing import List

  import gradio as gr
+ import more_itertools as mit
  import torch
+ from about_time import about_time
  from charset_normalizer import detect
  from chromadb.config import Settings
- from epub2txt import epub2txt
- from langchain.chains import RetrievalQA
+
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings
+ # from langchain.llms import HuggingFacePipeline
+ # from epub2txt import epub2txt
+ from langchain.chains import ConversationalRetrievalChain, RetrievalQA
  from langchain.docstore.document import Document
  from langchain.document_loaders import (
      CSVLoader,

@@ -63,30 +76,26 @@ from langchain.document_loaders import (
      PDFMinerLoader,
      TextLoader,
  )
-
- # from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY
- from langchain.embeddings import HuggingFaceInstructEmbeddings
- from langchain.llms import HuggingFacePipeline
+ from langchain.embeddings import (
+     HuggingFaceInstructEmbeddings,
+     SentenceTransformerEmbeddings,
+ )
+ from langchain.llms import HuggingFacePipeline, OpenAI
+ from langchain.memory import ConversationBufferMemory
  from langchain.text_splitter import (
      CharacterTextSplitter,
      RecursiveCharacterTextSplitter,
  )
-
- # FAISS instead of PineCone
  from langchain.vectorstores import FAISS, Chroma
  from loguru import logger
- from PyPDF2 import PdfReader  # localgpt
+ from PyPDF2 import PdfReader
+ from tqdm import tqdm
  from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline

- # import click
- # from typing import List
+ from epub_loader import EpubLoader
+ from load_api_key import load_api_key, pk_base, sk_base

- # from utils import xlxs_to_csv
-
- # load possible env such as OPENAI_API_KEY
- # from dotenv import load_dotenv
-
- # load_dotenv()
+ MODEL_NAME = "paraphrase-multilingual-mpnet-base-v2"  # 1.11G

  # fix timezone
  os.environ["TZ"] = "Asia/Shanghai"

@@ -96,6 +105,14 @@ except Exception:
      # Windows
      logger.warning("Windows, can't run time.tzset()")

+ api_key = load_api_key()
+ if api_key is not None:
+     os.environ.setdefault("OPENAI_API_KEY", api_key)
+     if api_key.startswith("sk-"):
+         os.environ.setdefault("OPENAI_API_BASE", sk_base)
+     elif api_key.startswith("pk-"):
+         os.environ.setdefault("OPENAI_API_BASE", pk_base)
+
  ROOT_DIRECTORY = Path(__file__).parent
  PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/db"

@@ -105,59 +122,82 @@ CHROMA_SETTINGS = Settings(
      persist_directory=PERSIST_DIRECTORY,
      anonymized_telemetry=False,
  )
- ns = SimpleNamespace(qa=None, ingest_done=None, files_info=None)

+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

- def load_single_document(file_path: str | Path) -> Document:
-     """ingest.py"""
-     # Loads a single document from a file path
-     # encoding = detect(open(file_path, "rb").read()).get("encoding", "utf-8")
-     encoding = detect(Path(file_path).read_bytes()).get("encoding", "utf-8")
-     if file_path.endswith(".txt"):
+ ns_initial = SimpleNamespace(
+     qa=None,
+     ingest_done=None,
+     files_info=None,
+     files_uploaded=[],
+     db_ready=None,
+ )
+ ns = deepcopy(ns_initial)
+
+ def load_single_document(file_path: str | Path) -> List[Document]:
+     """Loads a single document from a file path."""
+     try:
+         _ = Path(file_path).read_bytes()
+         encoding = detect(_).get("encoding")
+         if encoding is not None:
+             encoding = str(encoding)
+     except Exception as exc:
+         logger.error(f"{file_path}: {exc}")
+         encoding = None
+
+     file_path = Path(file_path).as_posix()
+
+     if Path(file_path).suffix in [".txt"]:
          if encoding is None:
              logger.warning(
                  f" {file_path}'s encoding is None "
                  "Something is fishy, return empty str "
              )
-             return Document(page_content="", metadata={"source": file_path})
-
+             return [Document(page_content="", metadata={"source": file_path})]
          try:
              loader = TextLoader(file_path, encoding=encoding)
          except Exception as exc:
              logger.warning(f" {exc}, return dummy ")
-             return Document(page_content="", metadata={"source": file_path})
-
-     elif file_path.endswith(".pdf"):
-         loader = PDFMinerLoader(file_path)
+             return [Document(page_content="", metadata={"source": file_path})]
+     elif Path(file_path).suffix in [".pdf"]:
+         try:
+             loader = PDFMinerLoader(file_path)
+         except Exception as exc:
+             logger.error(exc)
+             return [Document(page_content="", metadata={"source": file_path})]
      elif file_path.endswith(".csv"):
-         loader = CSVLoader(file_path)
+         try:
+             loader = CSVLoader(file_path)
+         except Exception as exc:
+             logger.error(exc)
+             return [Document(page_content="", metadata={"source": file_path})]
      elif Path(file_path).suffix in [".docx"]:
          try:
              loader = Docx2txtLoader(file_path)
          except Exception as exc:
              logger.error(f" {file_path} errors: {exc}")
-             return Document(page_content="", metadata={"source": file_path})
-     elif Path(file_path).suffix in [".epub"]:  # for epub? epub2txt unstructured
+             return [Document(page_content="", metadata={"source": file_path})]
+     elif Path(file_path).suffix in [".epub"]:
          try:
-             _ = epub2txt(file_path)
+             # _ = epub2txt(file_path)
+             loader = EpubLoader(file_path)
          except Exception as exc:
              logger.error(f" {file_path} errors: {exc}")
-             return Document(page_content="", metadata={"source": file_path})
-         return Document(page_content=_, metadata={"source": file_path})
+             return [Document(page_content="", metadata={"source": file_path})]
      else:
          if encoding is None:
              logger.warning(
                  f" {file_path}'s encoding is None "
                  "Likely binary files, return empty str "
              )
-             return Document(page_content="", metadata={"source": file_path})
+             return [Document(page_content="", metadata={"source": file_path})]
          try:
              loader = TextLoader(file_path)
          except Exception as exc:
              logger.error(f" {exc}, returning empty string")
-             return Document(page_content="", metadata={"source": file_path})
+             return [Document(page_content="", metadata={"source": file_path})]

-     return loader.load()[0]
+     return loader.load()  # use extend when combining


  def get_pdf_text(pdf_docs):

@@ -170,25 +210,59 @@ def get_pdf_text(pdf_docs):
      return text


- def get_text_chunks(text):
+ def get_text_chunks(text, chunk_size=1000):
      """docs-chat."""
      text_splitter = CharacterTextSplitter(
-         separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
+         separator="\n", chunk_size=chunk_size, chunk_overlap=200, length_function=len
      )
      chunks = text_splitter.split_text(text)
      return chunks


- def get_vectorstore(text_chunks):
-     """docs-chat."""
+ def get_vectorstore(
+     text_chunks,
+     vectorstore=None,
+     persist=True,
+ ):
+     """Gen vectorstore."""
      # embeddings = OpenAIEmbeddings()
+     # for HuggingFaceInstructEmbeddings
      model_name = "hkunlp/instructor-xl"
      model_name = "hkunlp/instructor-large"
      model_name = "hkunlp/instructor-base"
+
+     # embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
+
+     model_name = MODEL_NAME
      logger.info(f"Loading {model_name}")
-     embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
+     embeddings = SentenceTransformerEmbeddings(model_name=model_name)
      logger.info(f"Done loading {model_name}")

+     if vectorstore is None:
+         vectorstore = "chroma"
+
+     if vectorstore.lower() in ["chroma"]:
+         logger.info(
+             "Doing vectorstore Chroma.from_texts(texts=text_chunks, embedding=embeddings)"
+         )
+         if persist:
+             vectorstore = Chroma.from_texts(
+                 texts=text_chunks,
+                 embedding=embeddings,
+                 persist_directory=PERSIST_DIRECTORY,
+                 client_settings=CHROMA_SETTINGS,
+             )
+         else:
+             vectorstore = Chroma.from_texts(texts=text_chunks, embedding=embeddings)
+
+         logger.info(
+             "Done vectorstore Chroma.from_texts(texts=text_chunks, embedding=embeddings)"
+         )
+
+         return vectorstore
+
+     # if vectorstore.lower() not in ['chroma']
+     # TODO handle other cases
      logger.info(
          "Doing vectorstore FAISS.from_texts(texts=text_chunks, embedding=embeddings)"
      )

@@ -211,15 +285,7 @@ def upload_files(files):
      file_paths = [file.name for file in files]
      logger.info(file_paths)

-     ns.ingest_done = False
-     res = ingest(file_paths)
-     logger.info(f"Processed:\n{res}")
-
-     # flag ns.qadone
-     ns.ingest_done = True
-     ns.files_info = res
-
-     # ns.qa = load_qa()
+     ns.files_uploaded = file_paths

      # return [str(elm) for elm in res]
      return file_paths

@@ -227,19 +293,63 @@ def upload_files(files):
      # return ingest(file_paths)


- def ingest(
-     file_paths: list[str | Path], model_name="hkunlp/instructor-base", device_type=None
+ def process_files(
+     # file_paths,
+     progress=gr.Progress()
  ):
-     """Gen Chroma db.
-
-     torch.cuda.is_available()
-
-     file_paths =
-     ['C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\41b53dd5f203b423f2dced44eaf56e72508b7bbe\\app.py',
-     'C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\9390755bb391abc530e71a3946a7b50d463ba0ef\\README.md',
-     'C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\3341f9a410a60ffa57bf4342f3018a3de689f729\\requirements.txt']
-     """
+     """Process uploaded files."""
+     if not ns.files_uploaded:
+         return f"No files uploaded: {ns.files_uploaded}"
+
+     logger.debug(f"{ns.files_uploaded}")
+
+     logger.info(f"ingest({ns.files_uploaded})...")
+
+     # imgs = [None] * 24
+     # for img in progress.tqdm(imgs, desc="Loading from list"):
+     #     time.sleep(0.1)
+
+     imgs = [[None] * 8] * 3
+     for img_set in progress.tqdm(imgs, desc="Nested list"):
+         time.sleep(.2)
+         for img in progress.tqdm(img_set, desc="inner list"):
+             time.sleep(10.1)
+
+     return f"done file(s): {ns.files_info}"
+     # return f"done file(s)"
+
+     _ = """
+     documents = []
+     for file_path in progress.tqdm(ns.files_uploaded, desc="Reading file(s)"):
+         logger.debug(f"Doing {file_path}")
+         try:
+             documents.extend(load_single_document(f"{file_path}"))
+             logger.debug("Done reading files.")
+         except Exception as exc:
+             logger.error(f"{file_path}: {exc}")
+     # """
+
+     ns.ingest_done = True
+
+     # ns.qa = load_qa()
+
+     return f"done file(s): {ns.files_info}"
+
+
+ # pylint disable=unused-argument
+ def ingest(
+     file_paths: list[str | Path],
+     model_name: str = MODEL_NAME,
+     device_type=None,
+     chunk_size: int = 256,
+     chunk_overlap: int = 50,
+ ):
+     """Gen Chroma db."""
      logger.info("\n\t Doing ingest...")
+     logger.debug(f" file_paths: {file_paths}")
+     logger.debug(f"type of file_paths: {type(file_paths)}")
+
+     # raise SystemExit(0)

      if device_type is None:
          if torch.cuda.is_available():

@@ -260,33 +370,68 @@ def ingest(

      documents = []
      for file_path in file_paths:
-         documents.append(load_single_document(f"{file_path}"))
+         # documents.append(load_single_document(f"{file_path}"))
+         logger.debug(f"Doing {file_path}")
+         documents.extend(load_single_document(f"{file_path}"))

-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size, chunk_overlap=chunk_overlap
+     )
      texts = text_splitter.split_documents(documents)

+     logger.info(f"Loaded {len(file_paths)} files ")
      logger.info(f"Loaded {len(documents)} documents ")
      logger.info(f"Split into {len(texts)} chunks of text")

      # Create embeddings
-     embeddings = HuggingFaceInstructEmbeddings(
+     # embeddings = HuggingFaceInstructEmbeddings(
+     embeddings = SentenceTransformerEmbeddings(
          model_name=model_name, model_kwargs={"device": device}
      )

-     db = Chroma.from_documents(
-         texts,
-         embeddings,
-         persist_directory=PERSIST_DIRECTORY,
-         client_settings=CHROMA_SETTINGS,
+     # https://stackoverflow.com/questions/76048941/how-to-combine-two-chroma-databases
+     # db = Chroma(persist_directory=chroma_directory, embedding_function=embedding)
+     # db.add_documents(documents=texts1)
+
+     # mit.chunked_even(texts, 100)
+     db = Chroma(
+         # persist_directory=PERSIST_DIRECTORY,
+         embedding_function=embeddings,
+         # client_settings=CHROMA_SETTINGS,
      )
-     db.persist()
-     db = None
+     # for text in progress.tqdm(
+     for text in tqdm(
+         mit.chunked_even(texts, 101), total=ceil(len(texts) / 101)
+     ):
+         db.add_documents(documents=text)
+
+     _ = """
+     with about_time() as atime:  # type: ignore
+         db = Chroma.from_documents(
+             texts,
+             embeddings,
+             persist_directory=PERSIST_DIRECTORY,
+             client_settings=CHROMA_SETTINGS,
+         )
+     logger.info(f"Time spent: {atime.duration_human}")  # type: ignore
+     """
+
+     logger.info(f"persist_directory: {PERSIST_DIRECTORY}")
+
+     # db.persist()
+     # db = None
+     # ns.db = db
+     ns.qa = db
+
      logger.info("Done ingest")

-     return [
+     _ = [
          [Path(doc.metadata.get("source")).name, len(doc.page_content)]
          for doc in documents
      ]
+     ns.files_info = _
+
+     return _


  # TheBloke/Wizard-Vicuna-7B-Uncensored-HF

@@ -327,7 +472,7 @@ def gen_local_llm(model_id="TheBloke/vicuna-7B-1.1-HF"):
      return local_llm


- def load_qa(device=None, model_name: str = "hkunlp/instructor-base"):
+ def load_qa(device=None, model_name: str = MODEL_NAME):
      """Gen qa."""
      logger.info("Doing qa")
      if device is None:

@@ -340,10 +485,12 @@ def load_qa(device=None, model_name: str = "hkunlp/instructor-base"):
      # model_name = "hkunlp/instructor-xl"
      # model_name = "hkunlp/instructor-large"
      # model_name = "hkunlp/instructor-base"
-     embeddings = HuggingFaceInstructEmbeddings(
+     # embeddings = HuggingFaceInstructEmbeddings(
+     embeddings = SentenceTransformerEmbeddings(
          model_name=model_name, model_kwargs={"device": device}
      )
      # xl 4.96G, large 3.5G,
+
      db = Chroma(
          persist_directory=PERSIST_DIRECTORY,
          embedding_function=embeddings,

@@ -351,117 +498,175 @@ def load_qa(device=None, model_name: str = "hkunlp/instructor-base"):
      )
      retriever = db.as_retriever()

-     llm = gen_local_llm()  # "TheBloke/vicuna-7B-1.1-HF" 12G?
+     # _ = """
+     # llm = gen_local_llm()  # "TheBloke/vicuna-7B-1.1-HF" 12G?

+     llm = OpenAI(temperature=0, max_tokens=1024)  # type: ignore
      qa = RetrievalQA.from_chain_type(
-         llm=llm, chain_type="stuff",
-         retriever=retriever,
-         return_source_documents=True,
+         llm=llm,
+         chain_type="stuff",
+         retriever=retriever,
+         # return_source_documents=True,
      )

-     logger.info("Done qa")
+     # {"query": ..., "result": ..., "source_documents": ...}

      return qa

+     # """
+
+     # pylint: disable=unreachable
+
+     # model = 'gpt-3.5-turbo', default text-davinci-003
+     # max_tokens: int = 256 max_retries: int = 6
+     # openai_api_key: Optional[str] = None,
+     # openai_api_base: Optional[str] = None,
+
+     # llm = OpenAI(temperature=0, max_tokens=0)
+     llm = OpenAI(temperature=0, max_tokens=1024)  # type: ignore
+     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+     conversation_chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         # retriever=vectorstore.as_retriever(),
+         retriever=db.as_retriever(),
+         memory=memory,
+     )
+
+     logger.info("Done qa")
+
+     return conversation_chain
+     # memory.clear()
+     # response = conversation_chain({'question': user_question})
+     # response['question'], response['answer']
+

  def main1():
-     """Lump codes"""
-     with gr.Blocks() as demo:
+     """Lump codes."""
+     with gr.Blocks() as demo1:
          iface = gr.Interface(fn=greet, inputs="text", outputs="text")
          iface.launch()

-     demo.launch()
+     demo1.launch()


- def main():
-     """Do blocks."""
-     logger.info(f"ROOT_DIRECTORY: {ROOT_DIRECTORY}")
+ logger.info(f"ROOT_DIRECTORY: {ROOT_DIRECTORY}")

-     openai_api_key = os.getenv("OPENAI_API_KEY")
-     logger.info(f"openai_api_key (env var/hf space SECRETS): {openai_api_key}")
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ logger.info(f"openai_api_key (env var/hf space SECRETS): {openai_api_key}")

-     with gr.Blocks(theme=gr.themes.Soft()) as demo:
-         # name = gr.Textbox(label="Name")
-         # greet_btn = gr.Button("Submit")
-         # output = gr.Textbox(label="Output Box")
-         # greet_btn.click(fn=greet, inputs=name, outputs=output, api_name="greet")
-         with gr.Accordion("Info", open=False):
-             _ = """
-             # localgpt
-             Talk to your docs (.pdf, .docx, .epub, .txt, .md and
-             other text docs). It
-             takes quite a while to ingest docs (10-30 min. depending
-             on net, RAM, CPU etc.).
-
-             Send an empty query (hit Enter) to check embedding status and files info ([filename, number of chars])
-
-             Homepage: https://huggingface.co/spaces/mikeee/localgpt
-             """
-             gr.Markdown(dedent(_))
-
-         # with gr.Accordion("Upload files", open=True):
-         with gr.Tab("Upload files"):
-             # Upload files and generate embeddings database
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     # name = gr.Textbox(label="Name")
+     # greet_btn = gr.Button("Submit")
+     # output = gr.Textbox(label="Output Box")
+     # greet_btn.click(fn=greet, inputs=name, outputs=output, api_name="greet")
+     #
+     # ### layout ###
+     with gr.Accordion("Info", open=False):
+         _ = """
+         # localgpt
+         Talk to your docs (.pdf, .docx, .epub, .txt, .md and
+         other text docs). It
+         takes quite a while to ingest docs (10-30 min. depending
+         on net, RAM, CPU etc.).
+
+         Send an empty query (hit Enter) to check embedding status and files info ([filename, number of chars])
+
+         Homepage: https://huggingface.co/spaces/mikeee/localgpt
+         """
+         gr.Markdown(dedent(_))
+
+     with gr.Tab("Upload files"):
+         # Upload files and generate embeddings database
+         with gr.Row():
              file_output = gr.File()
+             # file_output = gr.Text()
+             # file_output = gr.DataFrame()
              upload_button = gr.UploadButton(
-                 "Click to upload files",
+                 "Click to upload",
                  # file_types=["*.pdf", "*.epub", "*.docx"],
                  file_count="multiple",
              )
-             upload_button.upload(upload_files, upload_button, file_output)
-
-         with gr.Tab("Query docs"):
-             # interactive chat
-             chatbot = gr.Chatbot()
-             msg = gr.Textbox(label="Query")
-             clear = gr.Button("Clear")
-
-             def respond(message, chat_history):
-                 # bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
-                 if ns.ingest_done is None:  # no files processed yet
-                     bot_message = "Upload some file(s) for processing first."
-                     chat_history.append((message, bot_message))
-                     return "", chat_history
-
-                 if not ns.ingest_done:  # embedding database not done yet
-                     bot_message = (
-                         "Waiting for ingest (embedding) to finish, "
-                         "be patient... You can switch to the 'Upload files' "
-                         "Tab to check"
-                     )
-                     chat_history.append((message, bot_message))
-                     return "", chat_history
-
-                 if ns.qa is None:  # load qa one time
-                     logger.info("Loading qa, need to do just one time.")
-                     ns.qa = load_qa()
-
-                 try:
-                     res = ns.qa(message)
-                     answer, docs = res["result"], res["source_documents"]
-                     bot_message = f"{answer} ({docs})"
-                 except Exception as exc:
-                     logger.error(exc)
-                     bot_message = f"bummer! {exc}"
-
-                 chat_history.append((message, bot_message))
-
-                 return "", chat_history
-
-             msg.submit(respond, [msg, chatbot], [msg, chatbot])
-             clear.click(lambda: None, None, chatbot, queue=False)
+         with gr.Row():
+             text2 = gr.Textbox("Progress/Log")
+             process_btn = gr.Button("Click to process files")
+             reset_btn = gr.Button("Reset everything")
+
+     with gr.Tab("Query docs"):
+         # interactive chat
+         chatbot = gr.Chatbot()
+         msg = gr.Textbox(label="Query")
+         clear = gr.Button("Clear")
+
+     # actions
+     def reset_all():
+         """Reset ns."""
+         global ns
+         ns = deepcopy(ns_initial)
+         return f"reset done: ns={ns}"
+
+     reset_btn.click(reset_all, [], text2)
+
+     upload_button.upload(upload_files, upload_button, file_output)
+     process_btn.click(process_files, [], text2)
+
+     def respond(message, chat_history):
+         """Gen response."""
+         if ns.ingest_done is None:  # no files processed yet
+             bot_message = "Upload some file(s) for processing first."
+             chat_history.append((message, bot_message))
+             return "", chat_history
+
+         if not ns.ingest_done:  # embedding database not done yet
+             bot_message = (
+                 "Waiting for ingest (embedding) to finish, "
+                 "be patient... You can switch to the 'Upload files' "
+                 "Tab to check"
+             )
+             chat_history.append((message, bot_message))
+             return "", chat_history
+
+         _ = """
+         if ns.qa is None:  # load qa one time
+             logger.info("Loading qa, need to do just one time.")
+             ns.qa = load_qa()
+             logger.info("Done loading qa, need to do just one time.")
+         # """
+         if ns.qa is None:
+             bot_message = (
+                 "Looks like the bot is not ready. "
+                 "Try again later..."
+             )
+             chat_history.append((message, bot_message))
+             return "", chat_history
+
+         try:
+             res = ns.qa(message)
+             answer = res.get("result")
+             docs = res.get("source_documents")
+             if docs:
+                 bot_message = f"{answer}\n({docs})"
+             else:
+                 bot_message = f"{answer}"
+         except Exception as exc:
+             logger.error(exc)
+             bot_message = f"bummer! {exc}"
+
+         chat_history.append((message, bot_message))
+
+         return "", chat_history

+     msg.submit(respond, [msg, chatbot], [msg, chatbot])
+     clear.click(lambda: None, None, chatbot, queue=False)
+
+ if __name__ == "__main__":
+     # main()
      try:
-         from google import colab  # noqa
+         from google import colab  # noqa # type: ignore

          share = True  # start share when in colab
      except Exception:
          share = False
-     demo.launch(share=share)
-
-
- if __name__ == "__main__":
-     main()
+     demo.queue(concurrency_count=20).launch(share=share)

  _ = """
  run_localgpt
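
The heart of this commit's progress-bar change is replacing the one-shot Chroma.from_documents(...) call with batched db.add_documents(...) calls over more_itertools.chunked_even batches, so tqdm (or gr.Progress inside a Gradio handler) can report embedding progress per batch. A minimal standalone sketch of the pattern (the helper name is illustrative; batch size 101 matches the diff):

from math import ceil

import more_itertools as mit
from tqdm import tqdm

def add_in_batches(db, texts, batch_size=101):
    """Embed texts into an existing Chroma db batch by batch, with progress."""
    for batch in tqdm(mit.chunked_even(texts, batch_size), total=ceil(len(texts) / batch_size)):
        db.add_documents(documents=batch)

Inside a Gradio event handler, the same loop would iterate progress.tqdm(...) instead of tqdm(...), which is what process_files() demonstrates above.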
docs/test.sdlxliff DELETED
The diff for this file is too large to render. See raw diff
 
load_api_key.py ADDED
@@ -0,0 +1,38 @@
+ """Load sk-/pk- key."""
+ # pylint: disable=invalid-name
+ from os import getenv
+ from typing import Optional
+
+ from dotenv import load_dotenv
+
+ sk_base = "https://api.openai.com/v1"
+ pk_base = "https://api.pawan.krd/v1"
+
+
+ def load_api_key(env_var: Optional[str] = None):
+     """Load OPENAI_API_KEY/SK-/PK- key.
+
+     if env_var is None, load from .env
+     order: "OPENAI_API_KEY", SK_KEY, PK_KEY
+     else:
+     dotenv_values("env_var") | os.getenv("env_var")
+     """
+     # with override=True .env has higher priority
+     # than os.get(...)
+     load_dotenv(override=True)
+
+     if env_var is not None:
+         return getenv(str(env_var))
+
+     _ = [
+         "OPENAI_API_KEY",
+         "SK_KEY",
+         "PK_KEY",
+     ]
+
+     api_key = None
+     for api_key in map(getenv, _):
+         if api_key:
+             break
+
+     return api_key
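
A minimal usage sketch for load_api_key, mirroring the hunk near the top of the new app.py above:

import os

from load_api_key import load_api_key, pk_base, sk_base

api_key = load_api_key()
if api_key is not None:
    os.environ.setdefault("OPENAI_API_KEY", api_key)
    # sk- keys use the official OpenAI endpoint, pk- keys the pawan.krd mirror
    if api_key.startswith("sk-"):
        os.environ.setdefault("OPENAI_API_BASE", sk_base)
    elif api_key.startswith("pk-"):
        os.environ.setdefault("OPENAI_API_BASE", pk_base)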
package.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "name": "localgpt",
+   "scripts": {
+     "start": "nodemon -w app.py -x run-s check run:app",
+     "run:app": "python app.py",
+     "run:app-w": "nodemon -w app.py -x python app.py",
+     "check-w": "nodemon -w app.py -x run-s isort format flake8 docstyle lint type:check",
+     "check": "run-s isort format flake8 docstyle lint type:check",
+     "isort": "isort --profile=black app.py",
+     "format": "black app.py",
+     "flake8": "flake8 --exit-zero app.py",
+     "docstyle": "pydocstyle --convention=google app.py",
+     "lint": "pylint app.py --disable=fixme",
+     "type:check": "pyright app.py"
+   },
+   "packageManager": "yarn@3.5.0",
+   "devDependencies": {
+     "run-all": "^1.0.1"
+   }
+ }
requirements.txt CHANGED
@@ -26,4 +26,6 @@ epub2txt
  docx2txt

  about-time
- openai
+ openai
+ more-itertools
+ tqdm
yarn.lock ADDED
@@ -0,0 +1,23 @@
+ # This file is generated by running "yarn install" inside your project.
+ # Manual changes might be lost - proceed with caution!
+
+ __metadata:
+   version: 6
+   cacheKey: 8
+
+ "localgpt@workspace:.":
+   version: 0.0.0-use.local
+   resolution: "localgpt@workspace:."
+   dependencies:
+     run-all: ^1.0.1
+   languageName: unknown
+   linkType: soft
+
+ "run-all@npm:^1.0.1":
+   version: 1.0.1
+   resolution: "run-all@npm:1.0.1"
+   bin:
+     run-all: lib/command.js
+   checksum: 3b38424af8b3637f5c4e8cf1d6421481c2fc15ec9d14899979ec2278c2bf6d5c27c9c58468bcbb1537acaf62868a3c80b34bb83093899625363d90339884f2e7
+   languageName: node
+   linkType: hard