My Duong committed
Commit 7d6c1f1 · 1 Parent(s): cd6cc82

update files

Files changed (6)
  1. .gitignore +3 -1
  2. app.py +34 -5
  3. requirements.txt +4 -0
  4. semantic_search.ipynb +93 -0
  5. vector_create.ipynb +199 -0
  6. vectorize_text.py +84 -0
.gitignore CHANGED
@@ -1 +1,3 @@
- /demovv
+ \demovv
+ BoPhapDienDienTu
+ vbpl_links.txt
app.py CHANGED
@@ -1,13 +1,42 @@
  import gradio as gr
  from sentence_transformers import SentenceTransformer
+ from langchain.vectorstores import Chroma
 
- def greet(name, k):
-     return ("Hello, " + name + "!") * k
+ # Load model
+ class SentenceTransformerWrapper:
+     def __init__(self, model_name):
+         self.model = SentenceTransformer(model_name)
+
+     def embed_documents(self, texts):
+         # Convert the list of texts to embeddings
+         return self.model.encode(texts, show_progress_bar=True).tolist()
+
+     def embed_query(self, text):
+         # Convert a single query to its embedding
+         return self.model.encode(text).tolist()
+
+ # Instantiate wrapper with model
+ embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')
+
+ # Load vector store
+ vector_db = Chroma(
+     persist_directory="chroma_db_new",
+     embedding=embedding_model  # Use your SentenceTransformerWrapper instance
+ )
+
+ # Display results
+ def retrieve_info(query, k=5):
+     results = vector_db.similarity_search(query, k)
+     for i, doc in enumerate(results):
+         print(f"Result {i+1}:")
+         print(f"Metadata: {doc.metadata}")
+         print(f"Content: {doc.page_content[:200]}...")  # Display a preview of the chunk
+     return f"Result {i+1}:\nMetadata: {doc.metadata}\nContent: {doc.page_content[:200]}..."
 
  demo = gr.Interface(
-     fn=greet,
-     inputs=["text", gr.Slider(value=5, minimum=1, maximum=100, step=1)],
-     outputs=[gr.Textbox(label="greeting", lines=500)],
+     fn=retrieve_info,
+     inputs=["text", gr.Number(default=1, label="k (Number of chunks to retrieve)")],
+     outputs=[gr.Textbox(label="Output chunk(s)", lines=500)],
  )
 
  demo.launch()
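
Note on the wrapper used in app.py above: LangChain's Chroma store only relies on the embed_documents/embed_query pair returning plain Python lists of floats. A minimal sketch of that contract (not part of this commit; the Vietnamese strings are placeholder inputs, and running it downloads the model):

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("bkai-foundation-models/vietnamese-bi-encoder")

    # embed_query path: one string in, one vector out
    query_vec = model.encode("an toàn giao thông").tolist()
    # embed_documents path: list of strings in, list of vectors out
    doc_vecs = model.encode(["văn bản thứ nhất", "văn bản thứ hai"]).tolist()

    print(len(query_vec))                   # embedding dimension of the bi-encoder
    print(len(doc_vecs), len(doc_vecs[0]))  # one vector per input text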
requirements.txt CHANGED
@@ -1,5 +1,9 @@
  torch
+ matplotlib
+ numpy
+ pandas
  langchain
+ scikit-learn
  tensorflow
  tqdm
  accelerate
semantic_search.ipynb ADDED
@@ -0,0 +1,93 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "- Write a Python notebook that does semantic search on the vector database and return top k results (use LangChain). Comment on what you observe."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from sentence_transformers import SentenceTransformer\n",
+     "import numpy as np\n",
+     "from tqdm import tqdm\n",
+     "import os\n",
+     "from langchain.vectorstores import Chroma"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Wrapper with embed_documents and embed_query\n",
+     "class SentenceTransformerWrapper:\n",
+     "    def __init__(self, model_name):\n",
+     "        self.model = SentenceTransformer(model_name)\n",
+     "\n",
+     "    def embed_documents(self, texts):\n",
+     "        # Convert the list of texts to embeddings\n",
+     "        return self.model.encode(texts, show_progress_bar=True).tolist()\n",
+     "\n",
+     "    def embed_query(self, text):\n",
+     "        # Convert a single query to its embedding\n",
+     "        return self.model.encode(text).tolist()\n",
+     "\n",
+     "# Instantiate wrapper with model\n",
+     "embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Chroma database\n",
+     "vector_db = Chroma(\n",
+     "    persist_directory=\"chroma_db_new\",\n",
+     "    embedding=embedding_model  # Use your SentenceTransformerWrapper instance\n",
+     ")\n",
+     "\n",
+     "# Test by running a similarity search\n",
+     "query = input(\"Enter your query: \")\n",
+     "results = vector_db.similarity_search(query, k=5)\n",
+     "\n",
+     "# Display the results\n",
+     "print(f\"\\nTop 5 results for query: '{query}'\\n\")\n",
+     "for i, doc in enumerate(results):\n",
+     "    print(f\"Result {i+1}:\")\n",
+     "    print(f\"Metadata: {doc.metadata}\")\n",
+     "    print(f\"Content: {doc.page_content[:50]}...\")  # Display a preview of the chunk\n",
+     "    print(\"-\" * 50)\n"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "phapdienvv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.4"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
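
For the "comment on what you observe" part of the task, it can help to look at the raw distances alongside the retrieved documents. The following is a minimal sketch (not part of this commit) that re-opens the persisted store and uses Chroma's similarity_search_with_score; note it passes embedding_function=, which is the constructor keyword recent LangChain releases accept (the notebook above passes embedding=, so adjust to the installed version), and the query string is only a placeholder:

    from langchain.vectorstores import Chroma
    from sentence_transformers import SentenceTransformer

    class SentenceTransformerWrapper:
        def __init__(self, model_name):
            self.model = SentenceTransformer(model_name)
        def embed_documents(self, texts):
            return self.model.encode(texts).tolist()
        def embed_query(self, text):
            return self.model.encode(text).tolist()

    db = Chroma(
        persist_directory="chroma_db_new",
        embedding_function=SentenceTransformerWrapper("bkai-foundation-models/vietnamese-bi-encoder"),
    )

    # With Chroma's default L2 distance, lower scores mean closer matches.
    for doc, score in db.similarity_search_with_score("an toàn giao thông", k=5):
        print(f"{score:.4f}  {doc.metadata.get('file_path')}")
        print(doc.page_content[:100], "...")

Seeing the scores makes it easier to judge whether the top-k hits are genuinely close or merely the least-bad matches.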
vector_create.ipynb ADDED
@@ -0,0 +1,199 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "#### Write a Python notebook that creates a vector database using ChromaDB (use LangChain)\n",
+     "- ingest the document files only (full_ItemID.html files)\n",
+     "- it is required to save the file path in the metadata"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 83,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "from tqdm import tqdm\n",
+     "from langchain_text_splitters import CharacterTextSplitter\n",
+     "from langchain.vectorstores import Chroma\n",
+     "from bs4 import BeautifulSoup\n",
+     "from sentence_transformers import SentenceTransformer"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Loading documents: 100%|██████████| 5101/5101 [52:41<00:00, 1.61it/s] \n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Loaded 5101 documents\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Created a chunk of size 3623, which is longer than the specified 2000\n",
+       "Created a chunk of size 10118, which is longer than the specified 2000\n",
+       "Created a chunk of size 10168, which is longer than the specified 2000\n",
+       "Created a chunk of size 3836, which is longer than the specified 2000\n",
+       "Created a chunk of size 8935, which is longer than the specified 2000\n",
+       "Created a chunk of size 5101, which is longer than the specified 2000\n",
+       "Created a chunk of size 16204, which is longer than the specified 2000\n",
+       "Created a chunk of size 8374, which is longer than the specified 2000\n",
+       "Created a chunk of size 3134, which is longer than the specified 2000\n"
+      ]
+     }
+    ],
+    "source": [
+     "# Step 1: HTML dir\n",
+     "input_dir = rf\"D:\\PhapDien_semantic_search\\BoPhapDienDienTu\\vbpl\"\n",
+     "model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')\n",
+     "\n",
+     "# Step 2: Clean the HTML files\n",
+     "def load_and_clean_html(file_path):\n",
+     "    with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
+     "        html_content = f.read()\n",
+     "    soup = BeautifulSoup(html_content, \"html.parser\")\n",
+     "    text = soup.get_text()  # Extract plain text from the HTML\n",
+     "    return text\n",
+     "\n",
+     "# Step 3: Process all files in the directory\n",
+     "documents = []\n",
+     "metadata = []\n",
+     "for file_name in tqdm(os.listdir(input_dir), desc=\"Loading documents\"):\n",
+     "    if file_name.startswith(\"full_\") and file_name.endswith(\".html\"):\n",
+     "        file_path = os.path.join(input_dir, file_name)\n",
+     "        text = load_and_clean_html(file_path)\n",
+     "        documents.append(text)\n",
+     "        metadata.append({\"file_path\": file_path})\n",
+     "\n",
+     "print(f\"Loaded {len(documents)} documents\")\n",
+     "# Step 4: Split text into chunks\n",
+     "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n",
+     "    encoding_name=\"cl100k_base\", chunk_size=2000, chunk_overlap=20, separator=\"\\n\"\n",
+     ")\n",
+     "splitted_docs = []\n",
+     "splitted_metadata = []\n",
+     "\n",
+     "for doc, meta in zip(documents, metadata):\n",
+     "    chunks = text_splitter.split_text(doc)\n",
+     "    for chunk in chunks:\n",
+     "        splitted_docs.append(chunk)\n",
+     "        splitted_metadata.append(meta)\n",
+     "# Step 5: Naive text cleaning: for each chunk, remove extra whitespaces and newlines, remove text components less than 50 characters.\n",
+     "# Notice that headers, menu text items, html tags, warnings in English contain a lot of\n",
+     "# whitespaces when splitted with \\n. Thus, I removed those instances since almost all of\n",
+     "# the information for retrieval is conveniently formatted well.\n",
+     "print(splitted_docs)\n",
+     "print(splitted_metadata)\n",
+     "processed_splitted_docs = []\n",
+     "processed_metadata = []\n",
+     "for i, doc in enumerate(splitted_docs):\n",
+     "    processed = doc.split(\"\\n\")\n",
+     "    for phrase in processed:\n",
+     "        if len(phrase) > 50 and \" \" not in phrase:\n",
+     "            processed_splitted_docs.append(phrase)\n",
+     "            processed_metadata.append(splitted_metadata[i])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Wrapper with embed_documents and embed_query\n",
+     "class SentenceTransformerWrapper:\n",
+     "    def __init__(self, model_name):\n",
+     "        self.model = SentenceTransformer(model_name)\n",
+     "\n",
+     "    def embed_documents(self, texts):\n",
+     "        # Convert the list of texts to embeddings\n",
+     "        return self.model.encode(texts, show_progress_bar=True).tolist()\n",
+     "\n",
+     "    def embed_query(self, text):\n",
+     "        # Convert a single query to its embedding\n",
+     "        return self.model.encode(text).tolist()\n",
+     "\n",
+     "# Instantiate wrapper with model\n",
+     "embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Batches:   0%|          | 0/7 [00:00<?, ?it/s]"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Batches: 100%|██████████| 7/7 [00:16<00:00,  2.36s/it]\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Database saved successfully!\n"
+      ]
+     }
+    ],
+    "source": [
+     "# Step 6: Generate embeddings using BKAI model\n",
+     "\n",
+     "# Step 7: Save the vectors to ChromaDB\n",
+     "vector_db = Chroma.from_texts(\n",
+     "    texts=processed_splitted_docs,\n",
+     "    embedding=embedding_model,\n",
+     "    metadatas=processed_metadata,\n",
+     "    persist_directory=\"chroma_db_new\"  # Directory where the database will be saved\n",
+     ")\n",
+     "\n",
+     "print(\"Database saved successfully!\")\n"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "phapdienvv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.4"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
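
The "Created a chunk of size N, which is longer than the specified 2000" messages in the cell output above are expected behaviour of CharacterTextSplitter: it only cuts at the chosen separator ("\n" here), so any single segment between separators that already exceeds chunk_size comes back as one oversized chunk. A small sketch with toy sizes (not part of this commit) reproduces the warning:

    from langchain_text_splitters import CharacterTextSplitter

    # chunk_size is measured in cl100k_base tokens because of from_tiktoken_encoder
    splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=10, chunk_overlap=0, separator="\n"
    )

    text = "short line\n" + "long segment with no newline inside " * 20 + "\nanother short line"
    chunks = splitter.split_text(text)

    # The long middle segment contains no "\n", so it cannot be cut below
    # chunk_size and is returned as a single oversized chunk (with a warning).
    for chunk in chunks:
        print(len(chunk), "characters")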
vectorize_text.py ADDED
@@ -0,0 +1,84 @@
+ import os
+ from tqdm import tqdm
+ from langchain_text_splitters import CharacterTextSplitter
+ from langchain.vectorstores import Chroma
+ from bs4 import BeautifulSoup
+ from sentence_transformers import SentenceTransformer
+
+ # Step 1: HTML dir & set up model
+ input_dir = rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl"
+ model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')
+
+ # Wrapper with embed_documents and embed_query
+ class SentenceTransformerWrapper:
+     def __init__(self, model_name):
+         self.model = SentenceTransformer(model_name)
+
+     def embed_documents(self, texts):
+         # Convert the list of texts to embeddings
+         return self.model.encode(texts, show_progress_bar=True).tolist()
+
+     def embed_query(self, text):
+         # Convert a single query to its embedding
+         return self.model.encode(text).tolist()
+
+ # Instantiate wrapper with model
+ embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')
+
+ # Step 2: Clean the HTML files
+ def load_and_clean_html(file_path):
+     with open(file_path, "r", encoding="utf-8") as f:
+         html_content = f.read()
+     soup = BeautifulSoup(html_content, "html.parser")
+     text = soup.get_text()  # Extract plain text from the HTML
+     return text
+
+ # Step 3: Process all files in the directory
+ documents = []
+ metadata = []
+ for file_name in tqdm(os.listdir(input_dir), desc="Loading documents"):
+     if file_name.startswith("full_") and file_name.endswith(".html"):
+         file_path = os.path.join(input_dir, file_name)
+         text = load_and_clean_html(file_path)
+         documents.append(text)
+         metadata.append({"file_path": file_path})
+
+ print(f"Loaded {len(documents)} documents")
+ # Step 4: Split text into chunks
+ text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+     encoding_name="cl100k_base", chunk_size=2000, chunk_overlap=20, separator="\n"
+ )
+ splitted_docs = []
+ splitted_metadata = []
+
+ for doc, meta in zip(documents, metadata):
+     chunks = text_splitter.split_text(doc)
+     for chunk in chunks:
+         splitted_docs.append(chunk)
+         splitted_metadata.append(meta)
+ # Step 5: Naive text cleaning: for each chunk, remove extra whitespaces and newlines, remove text components less than 50 characters.
+ # Notice that headers, menu text items, html tags, warnings in English contain a lot of
+ # whitespaces when splitted with \n. Thus, I removed those instances since almost all of
+ # the information for retrieval is conveniently formatted well.
+ processed_splitted_docs = []
+ processed_metadata = []
+ for i, doc in tqdm(enumerate(splitted_docs), desc="Cleaning text"):
+     processed = doc.split("\n")
+     for phrase in processed:
+         if len(phrase) > 50 and " " not in phrase:
+             processed_splitted_docs.append(phrase)
+             processed_metadata.append(splitted_metadata[i])
+
+ print(f"Processed {len(processed_splitted_docs)} chunks")
+
+ # Step 6: Generate embeddings using BKAI model
+
+ # Step 7: Save the vectors to ChromaDB
+ vector_db = Chroma.from_texts(
+     texts=processed_splitted_docs,
+     embedding=embedding_model,
+     metadatas=processed_metadata,
+     persist_directory="chroma_db_new"  # Directory where the database will be saved
+ )
+
+ print("Database saved successfully!")
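
One caveat worth noting about Step 7: whether from_texts actually flushes everything to persist_directory on its own depends on the installed chromadb version; releases before 0.4 required an explicit persist call, while newer ones persist automatically. A defensive line that could be appended at the end of the script (a sketch, assuming the vector_db object created above is in scope):

    # Older chromadb backends only write to persist_directory when asked;
    # on newer versions persistence is automatic and the method may be a
    # no-op or absent entirely, hence the guard.
    if hasattr(vector_db, "persist"):
        vector_db.persist()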