ffreemt committed
Commit
b140cfb
1 Parent(s): cee68d1

Update gen_doc_chunks

app.py CHANGED
@@ -85,7 +85,7 @@ from langchain.embeddings import (
 from langchain.llms import HuggingFacePipeline, OpenAI
 from langchain.memory import ConversationBufferMemory
 from langchain.text_splitter import (
-    CharacterTextSplitter,
+    # CharacterTextSplitter,
     RecursiveCharacterTextSplitter,
 )
 from langchain.vectorstores import FAISS, Chroma
@@ -97,8 +97,6 @@ from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
 from epub_loader import EpubLoader
 from load_api_key import load_api_key, pk_base, sk_base
 
-MODEL_NAME = "paraphrase-multilingual-mpnet-base-v2"  # 1.11G
-
 # fix timezone
 os.environ["TZ"] = "Asia/Shanghai"
 try:
@@ -135,6 +133,10 @@ CHROMA_SETTINGS = Settings(
 
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
+MODEL_NAME = "paraphrase-multilingual-mpnet-base-v2"  # 1.11G
+CHUNK_SIZE = 1000  # 250
+CHUNK_OVERLAP = 100  # 50
+
 ns_initial = SimpleNamespace(
     db=None,
     qa=None,
@@ -143,8 +145,8 @@ ns_initial = SimpleNamespace(
     files_info=None,
     files_uploaded=[],
     db_ready=None,
-    chunk_size=250,
-    chunk_overlap=250,
+    chunk_size=CHUNK_SIZE,
+    chunk_overlap=CHUNK_OVERLAP,
     model_name=MODEL_NAME,
 )
 ns = deepcopy(ns_initial)
@@ -226,65 +228,94 @@ def get_pdf_text(pdf_docs):
     return text
 
 
-def get_text_chunks(text, chunk_size=1000):
-    """docs-chat."""
-    text_splitter = CharacterTextSplitter(
-        separator="\n", chunk_size=chunk_size, chunk_overlap=200, length_function=len
+# def get_text_chunks(text, chunk_size=None, chunk_overlap=None):
+def get_doc_chunks(doc: Document, chunk_size=None, chunk_overlap=None) -> List[Document]:
+    """Generate doc chunks."""
+    if chunk_size is None:
+        chunk_size = ns.chunk_size
+    if chunk_overlap is None:
+        chunk_overlap = ns.chunk_overlap
+
+    # text_splitter = CharacterTextSplitter(
+    text_splitter = RecursiveCharacterTextSplitter(
+        # separator="\n",
+        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len
     )
-    chunks = text_splitter.split_text(text)
+    # chunks = text_splitter.split_text(text)
+    chunks = text_splitter.split_documents(doc)
+
     return chunks
 
 
 def get_vectorstore(
-    text_chunks,
+    # text_chunks: List[Document],
+    doc_chunks: List[Document],
     vectorstore=None,
+    model_name=None,
     persist=True,
+    persist_directory=None
 ):
     """Generate vectorstore."""
     # embedding = OpenAIEmbeddings()
     # for HuggingFaceInstructEmbeddings
-    model_name = "hkunlp/instructor-xl"
-    model_name = "hkunlp/instructor-large"
-    model_name = "hkunlp/instructor-base"
+    # model_name = "hkunlp/instructor-xl"
+    # model_name = "hkunlp/instructor-large"
+    # model_name = "hkunlp/instructor-base"
 
     # embedding = HuggingFaceInstructEmbeddings(model_name=model_name)
 
-    model_name = MODEL_NAME
+    if vectorstore is None:
+        vectorstore = "chroma"
+
+    if model_name is None:
+        model_name = MODEL_NAME
+
+    if persist_directory is None:
+        persist_directory = PERSIST_DIRECTORY
+
     logger.info(f"Loading {model_name}")
     embedding = SentenceTransformerEmbeddings(model_name=model_name)
     logger.info(f"Done loading {model_name}")
 
-    if vectorstore is None:
-        vectorstore = "chroma"
-
     if vectorstore.lower() in ["chroma"]:
         logger.info(
-            "Doing vectorstore Chroma.from_texts(texts=text_chunks, embedding=embedding)"
+            # "Doing vectorstore Chroma.from_texts(texts=text_chunks, embedding=embedding)"
+            "Doing vectorstore Chroma.from_documents(documents=doc_chunks, embedding=embedding)"
         )
         if persist:
-            vectorstore = Chroma.from_texts(
-                texts=text_chunks,
+            # vectorstore = Chroma.from_texts(
+            vectorstore = Chroma.from_documents(
+                # texts=text_chunks,
+                documents=doc_chunks,
                 embedding=embedding,
                 persist_directory=PERSIST_DIRECTORY,
                 client_settings=CHROMA_SETTINGS,
             )
         else:
-            vectorstore = Chroma.from_texts(texts=text_chunks, embedding=embedding)
+            # vectorstore = Chroma.from_texts(texts=text_chunks, embedding=embedding)
+            vectorstore = Chroma.from_documents(documents=doc_chunks, embedding=embedding)
 
         logger.info(
-            "Done vectorstore FAISS.from_texts(texts=text_chunks, embedding=embedding)"
+            # "Done vectorstore Chroma.from_texts(texts=text_chunks, embedding=embedding)"
+            "Done vectorstore Chroma.from_documents(documents=doc_chunks, embedding=embedding)"
        )
 
         return vectorstore
 
     # if vectorstore.lower() not in ['chroma']
+
     # TODO handle other cases
+
     logger.info(
-        "Doing vectorstore FAISS.from_texts(texts=text_chunks, embedding=embedding)"
+        "Doing vectorstore FAISS.from_documents(documents=doc_chunks, embedding=embedding)"
     )
-    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embedding)
+    # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embedding)
+    vectorstore = FAISS.from_documents(documents=doc_chunks, embedding=embedding)
     logger.info(
-        "Done vectorstore FAISS.from_texts(texts=text_chunks, embedding=embedding)"
+        "Done vectorstore FAISS.from_documents(documents=doc_chunks, embedding=embedding)"
     )
 
     return vectorstore
@@ -386,11 +417,10 @@ def embed_files(progress=gr.Progress()):
     # initialize if necessary
     if ns.db is None:
         logger.info(f"loading {ns.model_name:}")
+        embedding = SentenceTransformerEmbeddings(
+            model_name=ns.model_name, model_kwargs={"device": DEVICE}
+        )
         for _ in progress.tqdm(range(1), desc="diggin..."):
-            embedding = SentenceTransformerEmbeddings(
-                model_name=ns.model_name, model_kwargs={"device": DEVICE}
-            )
-
             logger.info("creating vectorstore")
             ns.db = Chroma(
                 # persist_directory=PERSIST_DIRECTORY,
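Note on the rework: the commit switches the app from splitting raw text to splitting langchain Document objects, so metadata such as the source file survives into the vectorstore. A minimal sketch of how the reworked helpers chain together, assuming EpubLoader follows the usual langchain loader convention (constructor takes a path, .load() returns a list of Documents); the file name below is hypothetical and not part of the commit. Despite the `doc: Document` annotation, `split_documents()` takes a list of Documents, so the loader output is passed whole.

# Illustrative sketch only; "book.epub" is a hypothetical file name.
docs = EpubLoader("book.epub").load()

# chunk_size / chunk_overlap default to ns.chunk_size / ns.chunk_overlap
# (1000 / 100 after this commit).
doc_chunks = get_doc_chunks(docs)

# The chroma branch persists to PERSIST_DIRECTORY when persist=True.
db = get_vectorstore(doc_chunks, vectorstore="chroma", persist=True)
retriever = db.as_retriever()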
docs/{340-脂砚斋重批红楼梦.txt → hlm.txt} RENAMED
File without changes
ggml-try.py CHANGED
@@ -2,8 +2,9 @@
 
 https://raw.githubusercontent.com/imartinez/privateGPT/main/requirements.txt
 
-  -c https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin
-"""
+from pathlib import Path
+Path("models").mkdir(exist_ok=True)
+!time wget -c https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin -O models/ggml-gpt4all-j-v1.3-groovy.bin"""
 from dotenv import load_dotenv, dotenv_values
 from langchain.chains import RetrievalQA
 from langchain.embeddings import HuggingFaceEmbeddings
@@ -37,16 +38,14 @@ settings = dict([('PERSIST_DIRECTORY', 'db1'),
 
 # models/ggml-gpt4all-j-v1.3-groovy.bin ~5G
 
-# all-MiniLM-L6-v2 () or
-embeddings_model_name = settings.get("EMBEDDINGS_MODEL_NAME")
-
-# embeddings_model_name = 'all-MiniLM-L6-v2'
-embeddings_model_name = 'paraphrase-multilingual-mpnet-base-v2'
-
 persist_directory = settings.get('PERSIST_DIRECTORY')
 
 model_type = settings.get('MODEL_TYPE')
 model_path = settings.get('MODEL_PATH')
+embeddings_model_name = settings.get("EMBEDDINGS_MODEL_NAME")
+# embeddings_model_name = 'all-MiniLM-L6-v2'
+# embeddings_model_name = 'paraphrase-multilingual-mpnet-base-v2'
+
 model_n_ctx = settings.get('MODEL_N_CTX')
 model_n_batch = int(settings.get('MODEL_N_BATCH', 8))
 target_source_chunks = int(settings.get('TARGET_SOURCE_CHUNKS', 4))
@@ -60,6 +59,7 @@ CHROMA_SETTINGS = Settings(
 
 args = SimpleNamespace(hide_source=False, mute_stream=False)
 
+# load chroma database from db1
 embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
 db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
 
@@ -78,4 +78,19 @@ match model_type:
 
 # need about 5G RAM
 
-qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=not args.hide_source)
+qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=not args.hide_source)
+
+# Get the answer from the chain
+
+query = "共产党是什么"  # "What is the Communist Party?"
+
+start = time.time()
+res = qa(query)
+answer, docs = res['result'], [] if args.hide_source else res['source_documents']
+end = time.time()
+
+# Print the result
+print("\n\n> Question:")
+print(query)
+print(f"\n> Answer (took {round(end - start, 2)} s.):")
+print(answer)
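The appended block hard-codes a single query. For repeated queries, the same flow can be wrapped in a small helper; a minimal sketch, assuming `time` is imported near the top of the script (the import sits outside the hunks shown) and `qa` is the chain built above:

import time

def ask(qa, query, hide_source=False):
    """Run one query through the RetrievalQA chain and report timing."""
    start = time.time()
    res = qa(query)  # same call style as in the diff above
    answer = res["result"]
    sources = [] if hide_source else res["source_documents"]
    print("\n\n> Question:")
    print(query)
    print(f"\n> Answer (took {round(time.time() - start, 2)} s.):")
    print(answer)
    return answer, sources

# e.g. answer, sources = ask(qa, "共产党是什么")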