My Duong committed on
Commit
a39d9ba
·
1 Parent(s): a3507d8

upload demo

Browse files
Files changed (2) hide show
  1. app_official.py +43 -36
  2. app.py → vector_embedding.py +0 -0
app_official.py CHANGED
@@ -1,59 +1,66 @@
 
 
 
1
  import gradio as gr
2
  from sentence_transformers import SentenceTransformer
3
- from langchain.vectorstores import Chroma
4
- from langchain.utils import DataLoader
5
- from accelerate import Accelerator
6
- import numpy as np
7
- from tqdm import tqdm
8
 
9
- # Wrapper for embedding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
class SentenceTransformerWrapper:
    """Embedding adapter exposing the LangChain embedding interface
    (embed_documents / embed_query) on top of a SentenceTransformer model.
    """

    def __init__(self, model_name, batch_size=32):
        # Batch size used when encoding document collections.
        self.batch_size = batch_size
        self.accelerator = Accelerator()  # Create an accelerator instance
        self.model = SentenceTransformer(model_name)
        # Move the model to the appropriate device (GPU when available).
        self.model.to(self.accelerator.device)

    def embed_documents(self, texts):
        """Encode a list of texts; return one list of floats per text.

        Fix: the original imported DataLoader from langchain.utils, which
        does not exist in langchain. SentenceTransformer.encode batches
        internally, so we pass batch_size directly and drop the DataLoader,
        accelerator.prepare, and the manual np.concatenate step entirely.
        """
        embeddings = self.model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=True,
        )
        return embeddings.tolist()

    def embed_query(self, text):
        """Encode a single query string to a plain list of floats."""
        return self.model.encode(text).tolist()
34
 
35
# Instantiate wrapper with model.
# NOTE(review): the model name suggests a Vietnamese bi-encoder — confirm it
# matches the model used when the chroma_db store was built.
embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')

# Load vector store from the local persisted directory.
vector_db = Chroma(
    persist_directory="chroma_db",
    embedding_function=embedding_model  # Use your SentenceTransformerWrapper instance
)
43
 
44
# Display results
def retrieve_info(query, k=5):
    """Search the vector store and return all k results as one string.

    Args:
        query: free-text search string.
        k: number of chunks to retrieve (default 5).

    Returns:
        A string with metadata and a 200-char content preview per result.

    Fix: the original `return` sat inside the for loop, so the function
    returned after the first document and the remaining k-1 results were
    never shown. Results are now accumulated and returned together.
    """
    results = vector_db.similarity_search(query, k)
    output = ""
    for i, doc in enumerate(results):
        print(f"Result {i+1}:")
        print(f"Metadata: {doc.metadata}")
        print(f"Content: {doc.page_content[:200]}...")  # Display a preview of the chunk
        output += f"Result {i+1}:\nMetadata: {doc.metadata}\nContent: {doc.page_content[:200]}...\n\n"
    return output
53
# Build the Gradio UI: a text query plus a numeric k, feeding retrieve_info.
demo = gr.Interface(
    fn=retrieve_info,
    inputs=["text", gr.Number(label="k (Number of chunks to retrieve)")],
    # lines=500 renders an extremely tall textbox; value kept as written.
    outputs=[gr.Textbox(label="Output chunk(s)", lines=500)],
)

# Start the web app (blocks until the server stops).
demo.launch()
 
1
+ import os
2
+ import zipfile
3
+ from huggingface_hub import hf_hub_download
4
  import gradio as gr
5
  from sentence_transformers import SentenceTransformer
6
+ from langchain_chroma import Chroma
 
 
 
 
7
 
8
# Step 1: Download and Extract the Chroma Vector Store
def prepare_chroma_db(hf_token=None):
    """Ensure the Chroma vector store exists locally; return its directory.

    Downloads chroma_db.zip from the Hugging Face dataset repository on the
    first run and extracts it into ./chroma_db. Later runs reuse the
    existing directory without touching the network.

    Args:
        hf_token: optional Hugging Face token for private repositories.

    Returns:
        The persist directory path ("chroma_db").
    """
    persist_directory = "chroma_db"
    if not os.path.exists(persist_directory):
        print("Downloading chroma_db.zip from the dataset repository...")
        # Fix: hf_hub_download rejects repo_ids containing more than one
        # slash, so "datasets/camiellia/phapdien_demo" raises an
        # HFValidationError. Dataset repos are selected via repo_type.
        zip_path = hf_hub_download(
            repo_id="camiellia/phapdien_demo",
            repo_type="dataset",
            filename="chroma_db.zip",
            token=hf_token,
        )
        print(f"Downloaded to {zip_path}")

        # Extract the zip file into the persist_directory.
        # NOTE(review): assumes the archive stores files at its root; if it
        # holds a top-level chroma_db/ folder the store ends up nested one
        # level deep — verify against the actual zip layout.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(persist_directory)
        print(f"Extracted chroma_db to ./{persist_directory}")
    else:
        print(f"{persist_directory} directory already exists.")
    return persist_directory
+
28
# Resolve (and, on first run, fetch) the Chroma persistence directory.
persist_directory = prepare_chroma_db()
29
+
30
# Step 2: wrapper
class SentenceTransformerWrapper:
    """Thin embedding adapter so Chroma can drive a SentenceTransformer
    through the LangChain embedding interface."""

    def __init__(self, model_name):
        # Load the encoder once at construction time.
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Embed a batch of documents; one list of floats per input text."""
        vectors = self.model.encode(texts, show_progress_bar=True)
        return vectors.tolist()

    def embed_query(self, text):
        """Embed a single query string into a list of floats."""
        vector = self.model.encode(text)
        return vector.tolist()
42
 
 
43
# Vietnamese bi-encoder used for both document and query embeddings.
embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')

# Step 3: Load the vector store from the directory
vector_db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_model  # Use your SentenceTransformerWrapper instance
)
50
 
51
# Step 4: Gradio function
def retrieve_info(query, k):
    """Retrieve the top-k chunks for *query* and format them for display.

    Args:
        query: free-text search string from the Gradio textbox.
        k: number of chunks to retrieve. Arrives from gr.Number as a float
           (or None when the field is left empty).

    Returns:
        One string with metadata and a 1000-char content preview per result.
    """
    # Fix: Chroma's similarity_search expects an integer k, but gr.Number
    # delivers a float; coerce it, falling back to 5 when empty/zero.
    k = int(k) if k else 5
    results = vector_db.similarity_search(query, k)
    output = ""
    for i, doc in enumerate(results):
        output += f"Result {i+1}:\nMetadata: {doc.metadata}\nContent: {doc.page_content[:1000]}\n\n"
    return output
58
+
59
# Step 5: Launch the Gradio interface
demo = gr.Interface(
    fn=retrieve_info,
    inputs=["text", gr.Number(label="k (Number of chunks to retrieve)")],
    outputs=[gr.Textbox(label="Output chunk(s)", lines=25)],
)

# Start the web app (blocks until the server stops).
demo.launch()
app.py → vector_embedding.py RENAMED
File without changes