Spaces:
Running
Running
My Duong
committed on
Commit
·
a39d9ba
1
Parent(s):
a3507d8
upload demo
Browse files- app_official.py +43 -36
- app.py → vector_embedding.py +0 -0
app_official.py
CHANGED
@@ -1,59 +1,66 @@
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
from sentence_transformers import SentenceTransformer
|
3 |
-
from
|
4 |
-
from langchain.utils import DataLoader
|
5 |
-
from accelerate import Accelerator
|
6 |
-
import numpy as np
|
7 |
-
from tqdm import tqdm
|
8 |
|
9 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
class SentenceTransformerWrapper:
|
11 |
-
def __init__(self, model_name
|
12 |
-
self.batch_size = batch_size
|
13 |
-
self.accelerator = Accelerator() # Create an accelerator instance
|
14 |
self.model = SentenceTransformer(model_name)
|
15 |
-
# Move the model to the appropriate device
|
16 |
-
self.model.to(self.accelerator.device)
|
17 |
-
|
18 |
-
def embed_documents(self, texts):
|
19 |
-
# Create a DataLoader for the texts
|
20 |
-
dataloader = DataLoader(texts, batch_size=self.batch_size)
|
21 |
-
all_embeddings = []
|
22 |
-
# Optionally, prepare the DataLoader with accelerator if needed
|
23 |
-
dataloader = self.accelerator.prepare(dataloader)
|
24 |
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
embeddings = np.concatenate(all_embeddings, axis=0)
|
30 |
-
return embeddings.tolist()
|
31 |
-
|
32 |
def embed_query(self, text):
|
|
|
33 |
return self.model.encode(text).tolist()
|
34 |
|
35 |
-
# Instantiate wrapper with model
|
36 |
embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')
|
37 |
|
38 |
-
# Load vector store
|
39 |
vector_db = Chroma(
|
40 |
-
persist_directory=
|
41 |
embedding_function=embedding_model # Use your SentenceTransformerWrapper instance
|
42 |
)
|
43 |
|
44 |
-
#
|
45 |
-
def retrieve_info(query, k
|
46 |
results = vector_db.similarity_search(query, k)
|
|
|
47 |
for i, doc in enumerate(results):
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
demo = gr.Interface(
|
54 |
fn=retrieve_info,
|
55 |
inputs=["text", gr.Number(label="k (Number of chunks to retrieve)")],
|
56 |
-
outputs=[gr.Textbox(label="Output chunk(s)", lines=
|
57 |
)
|
58 |
|
59 |
demo.launch()
|
|
|
1 |
+
import os
|
2 |
+
import zipfile
|
3 |
+
from huggingface_hub import hf_hub_download
|
4 |
import gradio as gr
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
+
from langchain_chroma import Chroma
|
|
|
|
|
|
|
|
|
7 |
|
8 |
+
# Step 1: Download and Extract the Chroma Vector Store
def prepare_chroma_db(hf_token=None):
    """Ensure the Chroma vector store exists locally, downloading it if needed.

    On first run, downloads ``chroma_db.zip`` from the Hugging Face dataset
    repo ``camiellia/phapdien_demo`` and extracts it into ``./chroma_db``.
    Subsequent runs reuse the existing directory without any network access.

    Args:
        hf_token: Optional Hugging Face token for private-repo access.

    Returns:
        The local persist directory path ("chroma_db").
    """
    persist_directory = "chroma_db"
    if not os.path.exists(persist_directory):
        print("Downloading chroma_db.zip from the dataset repository...")
        # BUGFIX: dataset repos must be addressed with repo_type="dataset".
        # A "datasets/owner/name" repo_id (two slashes) is rejected by
        # hf_hub_download's repo-id validation.
        zip_path = hf_hub_download(
            repo_id="camiellia/phapdien_demo",  # dataset repository
            filename="chroma_db.zip",
            repo_type="dataset",
            token=hf_token,
        )
        print(f"Downloaded to {zip_path}")

        # Extract the zip file into the persist_directory
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(persist_directory)
        print(f"Extracted chroma_db to ./{persist_directory}")
    else:
        print(f"{persist_directory} directory already exists.")
    return persist_directory
|
27 |
+
|
28 |
+
# Materialize the vector store on startup; the path is reused by Chroma below.
persist_directory = prepare_chroma_db()
|
29 |
+
|
30 |
+
# Step 2: wrapper
|
31 |
# Step 2: wrapper
class SentenceTransformerWrapper:
    """Adapter exposing the LangChain embedding interface
    (``embed_documents`` / ``embed_query``) on top of a SentenceTransformer
    model, so it can be plugged into a Chroma vector store."""

    def __init__(self, model_name):
        # Load the underlying sentence-transformers model by its hub name.
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Embed a batch of texts; returns a list of embedding lists."""
        vectors = self.model.encode(texts, show_progress_bar=True)
        return vectors.tolist()

    def embed_query(self, text):
        """Embed a single query string; returns one embedding list."""
        vector = self.model.encode(text)
        return vector.tolist()
|
42 |
|
|
|
43 |
# Embedding model shared by indexing-time and query-time vector computation.
# NOTE(review): instantiation downloads the model on first run.
embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')

# Step 3: Load the vector store from the directory
vector_db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_model # Use your SentenceTransformerWrapper instance
)
50 |
|
51 |
+
# Step 4: Gradio function
def retrieve_info(query, k):
    """Retrieve the top-k most similar chunks from the vector store.

    Args:
        query: Free-text search query.
        k: Number of chunks to retrieve. ``gr.Number`` delivers a float,
           so it is coerced to ``int`` before the search.

    Returns:
        A formatted string with metadata and (truncated) content per result;
        empty string when nothing is found.
    """
    # BUGFIX: gr.Number yields a float; Chroma's k must be an integer.
    results = vector_db.similarity_search(query, int(k))
    # Build with a list + join instead of quadratic string `+=`.
    sections = [
        f"Result {i+1}:\nMetadata: {doc.metadata}\nContent: {doc.page_content[:1000]}\n\n"
        for i, doc in enumerate(results)
    ]
    return "".join(sections)
|
58 |
+
|
59 |
+
# Step 5: Launch the Gradio interface
# Single text query plus a numeric k; output is one large read-only textbox.
demo = gr.Interface(
    fn=retrieve_info,
    inputs=["text", gr.Number(label="k (Number of chunks to retrieve)")],
    outputs=[gr.Textbox(label="Output chunk(s)", lines=25)],
)

# Blocks here serving HTTP until the process is stopped.
demo.launch()
|
app.py → vector_embedding.py
RENAMED
File without changes
|