ffreemt committed
Commit 4f331cc
1 Parent(s): 89dc142

Update chatbox

Files changed (6)
  1. .gitignore +0 -1
  2. app.py +130 -32
  3. install-sw.sh +3 -3
  4. install-sw1.sh +3 -3
  5. requirements.txt +1 -0
  6. start-sshd.sh +4 -0
.gitignore CHANGED
@@ -1,4 +1,3 @@
 .venv
 db
 dummy
-start-sshd.sh
app.py CHANGED
@@ -4,18 +4,19 @@ and https://github.com/PromtEngineer/localGPT/blob/main/ingest.py
 
 https://python.langchain.com/en/latest/getting_started/tutorials.html
 """
-# pylint: disable=broad-exception-caught, unused-import
+# pylint: disable=broad-exception-caught, unused-import, invalid-name, line-too-long
 import os
 import time
 from pathlib import Path
-
-# import click
-# from typing import List
+from types import SimpleNamespace
 
 import gradio as gr
 from charset_normalizer import detect
+from chromadb.config import Settings
 from langchain.chains import RetrievalQA
 from langchain.docstore.document import Document
+
+# Docx2txtLoader
 from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader
 
 # from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY
@@ -25,12 +26,16 @@ from langchain.text_splitter import (
     CharacterTextSplitter,
     RecursiveCharacterTextSplitter,
 )
+
 # FAISS instead of PineCone
 from langchain.vectorstores import FAISS, Chroma
 from loguru import logger
 from PyPDF2 import PdfReader  # localgpt
-from chromadb.config import Settings
-from transformers import LlamaTokenizer, LlamaForCausalLM, pipeline
+from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
+
+# import click
+# from typing import List
+
 
 # from utils import xlxs_to_csv
 
@@ -52,12 +57,14 @@ PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/db"
 
 # Define the Chroma settings
 CHROMA_SETTINGS = Settings(
-    chroma_db_impl='duckdb+parquet',
-    persist_directory=PERSIST_DIRECTORY,
-    anonymized_telemetry=False
+    chroma_db_impl="duckdb+parquet",
+    persist_directory=PERSIST_DIRECTORY,
+    anonymized_telemetry=False,
 )
+ns = SimpleNamespace(qa=None)
+
 
-def load_single_document(file_path: str|Path) -> Document:
+def load_single_document(file_path: str | Path) -> Document:
     """ingest.py"""
     # Loads a single document from a file path
     # encoding = detect(open(file_path, "rb").read()).get("encoding", "utf-8")
@@ -68,13 +75,13 @@ def load_single_document(file_path: str|Path) -> Document:
                 f" {file_path}'s encoding is None "
                 "Something is fishy, return empty str "
             )
-            return Document(page_content='', metadata={'source': file_path})
+            return Document(page_content="", metadata={"source": file_path})
 
         try:
             loader = TextLoader(file_path, encoding=encoding)
         except Exception as exc:
             logger.warning(f" {exc}, return dummy ")
-            return Document(page_content='', metadata={'source': file_path})
+            return Document(page_content="", metadata={"source": file_path})
 
     elif file_path.endswith(".pdf"):
         loader = PDFMinerLoader(file_path)
@@ -93,7 +100,7 @@ def load_single_document(file_path: str|Path) -> Document:
             loader = TextLoader(file_path)
         except Exception as exc:
             logger.error(f" {exc}, returning empty string")
-            return Document(page_content='', metadata={'source': file_path})
+            return Document(page_content="", metadata={"source": file_path})
 
     return loader.load()[0]
 
@@ -150,6 +157,10 @@ def upload_files(files):
     logger.info(file_paths)
 
     res = ingest(file_paths)
+    logger.info(f"Processed:\n{res}")
+    del res
+
+    ns.qa = load_qa()
 
     # return [str(elm) for elm in res]
     return file_paths
@@ -157,16 +168,25 @@ def upload_files(files):
     # return ingest(file_paths)
 
 
-def ingest(file_paths: list[str | Path], model_name="hkunlp/instructor-base", device_type="cpu"):
+def ingest(
+    file_paths: list[str | Path], model_name="hkunlp/instructor-base", device_type="cpu"
+):
     """Gen Chroma db.
-    file_paths = ['C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\41b53dd5f203b423f2dced44eaf56e72508b7bbe\\app.py', 'C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\9390755bb391abc530e71a3946a7b50d463ba0ef\\README.md', 'C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\3341f9a410a60ffa57bf4342f3018a3de689f729\\requirements.txt']
+
+    torch.cuda.is_available()
+
+    file_paths =
+    ['C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\41b53dd5f203b423f2dced44eaf56e72508b7bbe\\app.py',
+    'C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\9390755bb391abc530e71a3946a7b50d463ba0ef\\README.md',
+    'C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\3341f9a410a60ffa57bf4342f3018a3de689f729\\requirements.txt']
     """
-    if device_type in ['cpu', 'CPU']:
-        device='cpu'
-    elif device_type in ['mps', 'MPS']:
-        device='mps'
+    logger.info("Doing ingest...")
+    if device_type in ["cpu", "CPU"]:
+        device = "cpu"
+    elif device_type in ["mps", "MPS"]:
+        device = "mps"
     else:
-        device='cuda'
+        device = "cuda"
 
     #  Load documents and split in chunks
     # logger.info(f"Loading documents from {SOURCE_DIRECTORY}")
@@ -184,24 +204,32 @@ def ingest(file_paths: list[str | Path], model_name="hkunlp/instructor-base", de
 
     # Create embeddings
    embeddings = HuggingFaceInstructEmbeddings(
-        model_name=model_name,
-        model_kwargs={"device": device}
+        model_name=model_name, model_kwargs={"device": device}
    )
 
     db = Chroma.from_documents(
-        texts, embeddings,
+        texts,
+        embeddings,
         persist_directory=PERSIST_DIRECTORY,
-        client_settings=CHROMA_SETTINGS
+        client_settings=CHROMA_SETTINGS,
     )
     db.persist()
     db = None
     logger.info("Done ingest")
 
-    return [[Path(doc.metadata.get("source")).name, len(doc.page_content)] for doc in documents]
+    return [
+        [Path(doc.metadata.get("source")).name, len(doc.page_content)]
+        for doc in documents
+    ]
 
 
+# TheBloke/vicuna-7B-1.1-GPTQ-4bit-128g
 def gen_local_llm(model_id="TheBloke/vicuna-7B-1.1-HF"):
-    """Gen a local llm."""
+    """Gen a local llm.
+
+    localgpt run_localgpt
+    """
+    tokenizer = LlamaTokenizer.from_pretrained(model_id)
     model = LlamaForCausalLM.from_pretrained(
         model_id,
         # load_in_8bit=True, # set these options if your GPU supports them!
@@ -217,13 +245,42 @@ def gen_local_llm(model_id="TheBloke/vicuna-7B-1.1-HF"):
         max_length=2048,
         temperature=0,
         top_p=0.95,
-        repetition_penalty=1.15
+        repetition_penalty=1.15,
     )
 
     local_llm = HuggingFacePipeline(pipeline=pipe)
     return local_llm
 
 
+def load_qa(device: str = "cpu", model_name: str = "hkunlp/instructor-base"):
+    """Gen qa."""
+    logger.info("Doing qa")
+    # device = 'cpu'
+    # model_name = "hkunlp/instructor-xl"
+    # model_name = "hkunlp/instructor-large"
+    # model_name = "hkunlp/instructor-base"
+    embeddings = HuggingFaceInstructEmbeddings(
+        model_name=model_name, model_kwargs={"device": device}
+    )
+    # xl 4.96G, large 3.5G,
+    db = Chroma(
+        persist_directory=PERSIST_DIRECTORY,
+        embedding_function=embeddings,
+        client_settings=CHROMA_SETTINGS,
+    )
+    retriever = db.as_retriever()
+
+    llm = gen_local_llm()  # "TheBloke/vicuna-7B-1.1-HF" 12G?
+
+    qa = RetrievalQA.from_chain_type(
+        llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
+    )
+
+    logger.info("Done qa")
+
+    return qa
+
+
 def main1():
     """Lump codes"""
     with gr.Blocks() as demo:
@@ -241,21 +298,62 @@ def main():
     logger.info(f"openai_api_key (hf space SECRETS/env): {openai_api_key}")
 
     with gr.Blocks() as demo:
-        name = gr.Textbox(label="Name")
-        greet_btn = gr.Button("Submit")
-        output = gr.Textbox(label="Output Box")
-        greet_btn.click(fn=greet, inputs=name, outputs=output, api_name="greet")
+        # name = gr.Textbox(label="Name")
+        # greet_btn = gr.Button("Submit")
+        # output = gr.Textbox(label="Output Box")
+        # greet_btn.click(fn=greet, inputs=name, outputs=output, api_name="greet")
 
+        # Upload files and generate embeddings database
         file_output = gr.File()
        upload_button = gr.UploadButton(
             "Click to upload files",
             # file_types=["*.pdf", "*.epub", "*.docx"],
-            file_count="multiple"
+            file_count="multiple",
         )
         upload_button.upload(upload_files, upload_button, file_output)
 
+        # interactive chat
+        chatbot = gr.Chatbot()
+        msg = gr.Textbox()
+        clear = gr.Button("Clear")
+
+        def respond(message, chat_history):
+            # bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
+            res = ns.qa(message)
+            answer, docs = res["result"], res["source_documents"]
+            bot_message = f"{answer} ({docs})"
+            chat_history.append((message, bot_message))
+            time.sleep(0.21)
+            return "", chat_history
+
+        msg.submit(respond, [msg, chatbot], [msg, chatbot])
+        clear.click(lambda: None, None, chatbot, queue=False)
+
     demo.launch()
 
 
 if __name__ == "__main__":
     main()
+
+_ = """
+run_localgpt
+device = 'cpu'
+model_name = "hkunlp/instructor-xl"
+model_name = "hkunlp/instructor-large"
+model_name = "hkunlp/instructor-base"
+embeddings = HuggingFaceInstructEmbeddings(
+    model_name=,
+    model_kwargs={"device": device}
+)
+# xl 4.96G, large 3.5G,
+db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+retriever = db.as_retriever()
+
+llm = gen_local_llm()  # "TheBloke/vicuna-7B-1.1-HF" 12G?
+
+qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
+
+query = 'a'
+res = qa(query)
+
+"""
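Editor's note: for orientation, the flow this commit wires up (an upload triggers ingest() to build the Chroma db, load_qa() then builds the RetrievalQA chain that respond() queries) can be exercised directly. A minimal sketch, assuming app.py's module scope; "docs/sample.txt" is a hypothetical path standing in for what gr.UploadButton normally supplies:

    # Sketch only: exercises the commit's ingest -> load_qa -> query pipeline.
    file_paths = ["docs/sample.txt"]  # hypothetical upload
    ingest(file_paths)                # split, embed, persist to PERSIST_DIRECTORY
    ns.qa = load_qa()                 # build RetrievalQA once; loads the local LLM (slow)

    res = ns.qa("What is this document about?")
    print(res["result"])
    for doc in res["source_documents"]:
        print(doc.metadata.get("source"))

As in respond(), calling the chain with a plain string works because RetrievalQA has the single input key "query", and return_source_documents=True adds "source_documents" to the result dict.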
install-sw.sh CHANGED
@@ -12,12 +12,12 @@ echo export PATH=~/.local/bin:$PATH > ~/.bashrc
 source ~/.bashrc
 # ~/.local/bin/poetry install
 
-wget -c https://deb.nodesource.com/setup_14.x
-bash setup_14.x
+wget -c https://deb.nodesource.com/setup_18.x
+bash setup_18.x
 apt-get install -y nodejs
 npm install -g npm@latest
 npm install -g nodemon
-rm setup_14.x
+rm setup_18.x
 
 # apt update # already done in apt-get install -y nodejs
 apt install byobu -y > /dev/null 2>&1
install-sw1.sh CHANGED
@@ -12,12 +12,12 @@ echo export PATH=~/.local/bin:$PATH > ~/.bashrc
 source ~/.bashrc
 # ~/.local/bin/poetry install
 
-wget -qO- https://deb.nodesource.com/setup_14.x | bash
-# bash setup_14.x
+wget -qO- https://deb.nodesource.com/setup_18.x | bash
+# bash setup_18.x
 apt-get install -y nodejs
 npm install -g npm@latest
 npm install -g nodemon
-# rm setup_14.x
+# rm setup_18.x
 
 # apt update # already done in apt-get install -y nodejs
 apt install byobu -y > /dev/null 2>&1
requirements.txt CHANGED
@@ -23,3 +23,4 @@ gradio
 charset-normalizer
 PyPDF2
 epub2txt
+docx2txt
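Editor's note: the new docx2txt dependency presumably backs langchain's Docx2txtLoader, which the `# Docx2txtLoader` comment added to app.py's import block hints at. A minimal sketch of loading a .docx the same way load_single_document handles the other file types; "report.docx" is a placeholder path:

    from langchain.document_loaders import Docx2txtLoader

    loader = Docx2txtLoader("report.docx")  # placeholder path
    docs = loader.load()  # list of Document objects with page_content and metadata
    print(docs[0].page_content[:200])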
start-sshd.sh ADDED
@@ -0,0 +1,4 @@
+apt update && apt-get install openssh-server -y
+/etc/init.d/ssh restart && mkdir -p ~/.ssh && echo ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOl+SiDFL1ZUh1QJ0454eYKtamkMCVs2hhuv3cWN1LU7 id_ed25519_colab > ~/.ssh/authorized_keys
+echo ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCizaBJkWzdC/pvwFzBx8/fNWhvDDcSjp3B8pqgS7nF/+CXstK/k5vbN+PlZTupnOrOd0jQ7KdDUqsx/GFGTub8n1RDOF8nCHjvKScQii3M53i6OVH3m5+9eyhag5J8vLugnbbT57tUaVnFe7z0vomxsmVUfyXex3EZhW+zM1+kfGH9rvQxoh5OMiZLPqcyNRQHsJV8JDD2IRxHid0mMXcPFbws1CcjZiEWRLV4878KFt2vWwp+9xjwgSzcoKtFnxMrDKFfyKoEAYnyO7SrEVvm8T2rMpXCApDEMFnV0g2bUDu67iD1xAGHSvTgjEtSG3mLJGrnKBnzzO2ksCZf68/z GOLAY\User@golay >> ~/.ssh/authorized_keys
+echo cd /usr/src/app >> ~/.bashrc