Commit 7d6c1f1 · My Duong committed
1 Parent(s): cd6cc82

update files
Files changed:
- .gitignore +3 -1
- app.py +34 -5
- requirements.txt +4 -0
- semantic_search.ipynb +93 -0
- vector_create.ipynb +199 -0
- vectorize_text.py +84 -0
.gitignore
CHANGED
@@ -1 +1,3 @@
-
+\demovv
+BoPhapDienDienTu
+vbpl_links.txt
app.py
CHANGED
@@ -1,13 +1,42 @@
 import gradio as gr
 from sentence_transformers import SentenceTransformer
+from langchain.vectorstores import Chroma
 
-
-
+# Load model
+class SentenceTransformerWrapper:
+    def __init__(self, model_name):
+        self.model = SentenceTransformer(model_name)
+
+    def embed_documents(self, texts):
+        # Convert the list of texts to embeddings
+        return self.model.encode(texts, show_progress_bar=True).tolist()
+
+    def embed_query(self, text):
+        # Convert a single query to its embedding
+        return self.model.encode(text).tolist()
 
+# Instantiate wrapper with model
+embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')
+
+# Load vector store
+vector_db = Chroma(
+    persist_directory="chroma_db_new",
+    embedding=embedding_model # Use your SentenceTransformerWrapper instance
+)
+
+# Display results
+def retrieve_info(query, k=5):
+    results = vector_db.similarity_search(query, k)
+    for i, doc in enumerate(results):
+        print(f"Result {i+1}:")
+        print(f"Metadata: {doc.metadata}")
+        print(f"Content: {doc.page_content[:200]}...") # Display a preview of the chunk
+    return f"Result {i+1}:\nMetadata: {doc.metadata}\nContent: {doc.page_content[:200]}..."
+
 demo = gr.Interface(
-    fn=
-    inputs=["text", gr.
-    outputs=[gr.Textbox(label="
+    fn=retrieve_info,
+    inputs=["text", gr.Number(default=1, label="k (Number of chunks to retrieve)")],
+    outputs=[gr.Textbox(label="Output chunk(s)", lines=500)],
 )
 
 demo.launch()
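Note on the new retrieve_info: as written, its return statement builds the output string from only the last of the k retrieved documents, and recent Gradio releases name a component's initial value `value` rather than `default`. Below is a minimal sketch of a variant that aggregates all k chunks; it assumes the same vector_db and imports as app.py, and retrieve_info_all is a hypothetical name, not part of the commit.

# Sketch only, not part of the commit: aggregate every retrieved chunk into the output.
def retrieve_info_all(query, k=5):
    results = vector_db.similarity_search(query, k=int(k))  # gr.Number may pass a float
    formatted = []
    for i, doc in enumerate(results):
        formatted.append(
            f"Result {i+1}:\nMetadata: {doc.metadata}\nContent: {doc.page_content[:200]}..."
        )
    return "\n\n".join(formatted)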
requirements.txt
CHANGED
@@ -1,5 +1,9 @@
 torch
+matplotlib
+numpy
+pandas
 langchain
+scikit-learn
 tensorflow
 tqdm
 accelerate
semantic_search.ipynb
ADDED
@@ -0,0 +1,93 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- Write a Python notebook that does semantic search on the vector database and return top k results (use LangChain). Comment on what you observe."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sentence_transformers import SentenceTransformer\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm\n",
+    "import os\n",
+    "from langchain.vectorstores import Chroma"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Wrapper with embed_documents and embed_query\n",
+    "class SentenceTransformerWrapper:\n",
+    "    def __init__(self, model_name):\n",
+    "        self.model = SentenceTransformer(model_name)\n",
+    "    \n",
+    "    def embed_documents(self, texts):\n",
+    "        # Convert the list of texts to embeddings\n",
+    "        return self.model.encode(texts, show_progress_bar=True).tolist()\n",
+    "    \n",
+    "    def embed_query(self, text):\n",
+    "        # Convert a single query to its embedding\n",
+    "        return self.model.encode(text).tolist()\n",
+    "\n",
+    "# Instantiate wrapper with model\n",
+    "embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Chroma database\n",
+    "vector_db = Chroma(\n",
+    "    persist_directory=\"chroma_db_new\",\n",
+    "    embedding=embedding_model # Use your SentenceTransformerWrapper instance\n",
+    ")\n",
+    "\n",
+    "# Test by running a similarity search\n",
+    "query = input(\"Enter your query: \")\n",
+    "results = vector_db.similarity_search(query, k=5)\n",
+    "\n",
+    "# Display the results\n",
+    "print(f\"\\nTop 5 results for query: '{query}'\\n\")\n",
+    "for i, doc in enumerate(results):\n",
+    "    print(f\"Result {i+1}:\")\n",
+    "    print(f\"Metadata: {doc.metadata}\")\n",
+    "    print(f\"Content: {doc.page_content[:50]}...\") # Display a preview of the chunk\n",
+    "    print(\"-\" * 50)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "phapdienvv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
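The final cell prints a 50-character preview per hit; when commenting on what you observe, the distance returned by Chroma's similarity_search_with_score can help judge how close each chunk really is. A minimal sketch, assuming the same vector_db built in the notebook (the query string is a placeholder):

# Sketch only: show each hit together with its distance (lower is closer for Chroma).
query = "your query here"  # placeholder
for i, (doc, score) in enumerate(vector_db.similarity_search_with_score(query, k=5)):
    print(f"Result {i+1} (distance={score:.4f})")
    print(f"Metadata: {doc.metadata}")
    print(f"Content: {doc.page_content[:50]}...")
    print("-" * 50)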
vector_create.ipynb
ADDED
@@ -0,0 +1,199 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Write a Python notebook that creates a vector database using ChromaDB (use LangChain)\n",
+    "- ingest the document files only (full_ItemID.html files)\n",
+    "- it is required to save the file path in the metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from tqdm import tqdm\n",
+    "from langchain_text_splitters import CharacterTextSplitter\n",
+    "from langchain.vectorstores import Chroma\n",
+    "from bs4 import BeautifulSoup\n",
+    "from sentence_transformers import SentenceTransformer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading documents: 100%|██████████| 5101/5101 [52:41<00:00, 1.61it/s] \n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded 5101 documents\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Created a chunk of size 3623, which is longer than the specified 2000\n",
+      "Created a chunk of size 10118, which is longer than the specified 2000\n",
+      "Created a chunk of size 10168, which is longer than the specified 2000\n",
+      "Created a chunk of size 3836, which is longer than the specified 2000\n",
+      "Created a chunk of size 8935, which is longer than the specified 2000\n",
+      "Created a chunk of size 5101, which is longer than the specified 2000\n",
+      "Created a chunk of size 16204, which is longer than the specified 2000\n",
+      "Created a chunk of size 8374, which is longer than the specified 2000\n",
+      "Created a chunk of size 3134, which is longer than the specified 2000\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Step 1: HTML dir\n",
+    "input_dir = rf\"D:\\PhapDien_semantic_search\\BoPhapDienDienTu\\vbpl\"\n",
+    "model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')\n",
+    "\n",
+    "# Step 2: Clean the HTML files\n",
+    "def load_and_clean_html(file_path):\n",
+    "    with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "        html_content = f.read()\n",
+    "    soup = BeautifulSoup(html_content, \"html.parser\")\n",
+    "    text = soup.get_text() # Extract plain text from the HTML\n",
+    "    return text\n",
+    "\n",
+    "# Step 3: Process all files in the directory\n",
+    "documents = []\n",
+    "metadata = []\n",
+    "for file_name in tqdm(os.listdir(input_dir), desc=\"Loading documents\"):\n",
+    "    if file_name.startswith(\"full_\") and file_name.endswith(\".html\"):\n",
+    "        file_path = os.path.join(input_dir, file_name)\n",
+    "        text = load_and_clean_html(file_path)\n",
+    "        documents.append(text)\n",
+    "        metadata.append({\"file_path\": file_path})\n",
+    "\n",
+    "print(f\"Loaded {len(documents)} documents\")\n",
+    "# Step 4: Split text into chunks\n",
+    "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n",
+    "    encoding_name=\"cl100k_base\", chunk_size=2000, chunk_overlap=20, separator=\"\\n\"\n",
+    ")\n",
+    "splitted_docs = []\n",
+    "splitted_metadata = []\n",
+    "\n",
+    "for doc, meta in zip(documents, metadata):\n",
+    "    chunks = text_splitter.split_text(doc)\n",
+    "    for chunk in chunks:\n",
+    "        splitted_docs.append(chunk)\n",
+    "        splitted_metadata.append(meta)\n",
+    "# Step 5: Naive text cleaning: for each chunk, remove extra whitespaces and newlines, remove text components less than 50 characters.\n",
+    "# Notice that headers, menu text items, html tags, warnings in English contain a lot of\n",
+    "# whitespaces when split with \\n. Thus, I removed those instances since almost all of\n",
+    "# the information for retrieval is conveniently formatted well.\n",
+    "print(splitted_docs)\n",
+    "print(splitted_metadata)\n",
+    "processed_splitted_docs = []\n",
+    "processed_metadata = []\n",
+    "for i, doc in enumerate(splitted_docs):\n",
+    "    processed = doc.split(\"\\n\")\n",
+    "    for phrase in processed:\n",
+    "        if len(phrase) > 50 and \" \" not in phrase:\n",
+    "            processed_splitted_docs.append(phrase)\n",
+    "            processed_metadata.append(splitted_metadata[i])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Wrapper with embed_documents and embed_query\n",
+    "class SentenceTransformerWrapper:\n",
+    "    def __init__(self, model_name):\n",
+    "        self.model = SentenceTransformer(model_name)\n",
+    "    \n",
+    "    def embed_documents(self, texts):\n",
+    "        # Convert the list of texts to embeddings\n",
+    "        return self.model.encode(texts, show_progress_bar=True).tolist()\n",
+    "    \n",
+    "    def embed_query(self, text):\n",
+    "        # Convert a single query to its embedding\n",
+    "        return self.model.encode(text).tolist()\n",
+    "\n",
+    "# Instantiate wrapper with model\n",
+    "embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Batches: 0%| | 0/7 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Batches: 100%|██████████| 7/7 [00:16<00:00, 2.36s/it]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Database saved successfully!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Step 6: Generate embeddings using BKAI model\n",
+    "\n",
+    "# Step 7: Save the vectors to ChromaDB\n",
+    "vector_db = Chroma.from_texts(\n",
+    "    texts=processed_splitted_docs,\n",
+    "    embedding=embedding_model,\n",
+    "    metadatas=processed_metadata,\n",
+    "    persist_directory=\"chroma_db_new\" # Directory where the database will be saved\n",
+    ")\n",
+    "\n",
+    "print(\"Database saved successfully!\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "phapdienvv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
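In Step 5 the filter `" " not in phrase` (as rendered here) would drop every line containing a space, while the comment describes removing lines with runs of whitespace; repeated spaces may simply have been collapsed by the page renderer. A hedged sketch of a filter matching the stated intent (keep_phrase is a hypothetical helper, not in the commit):

# Sketch only: keep chunks over 50 characters with no run of 2+ whitespace characters.
import re

def keep_phrase(phrase):
    return len(phrase) > 50 and not re.search(r"\s{2,}", phrase)

processed_splitted_docs, processed_metadata = [], []
for i, doc in enumerate(splitted_docs):
    for phrase in doc.split("\n"):
        if keep_phrase(phrase):
            processed_splitted_docs.append(phrase)
            processed_metadata.append(splitted_metadata[i])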
vectorize_text.py
ADDED
@@ -0,0 +1,84 @@
+import os
+from tqdm import tqdm
+from langchain_text_splitters import CharacterTextSplitter
+from langchain.vectorstores import Chroma
+from bs4 import BeautifulSoup
+from sentence_transformers import SentenceTransformer
+
+# Step 1: HTML dir & set up model
+input_dir = rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl"
+model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')
+
+# Wrapper with embed_documents and embed_query
+class SentenceTransformerWrapper:
+    def __init__(self, model_name):
+        self.model = SentenceTransformer(model_name)
+
+    def embed_documents(self, texts):
+        # Convert the list of texts to embeddings
+        return self.model.encode(texts, show_progress_bar=True).tolist()
+
+    def embed_query(self, text):
+        # Convert a single query to its embedding
+        return self.model.encode(text).tolist()
+
+# Instantiate wrapper with model
+embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')
+
+# Step 2: Clean the HTML files
+def load_and_clean_html(file_path):
+    with open(file_path, "r", encoding="utf-8") as f:
+        html_content = f.read()
+    soup = BeautifulSoup(html_content, "html.parser")
+    text = soup.get_text() # Extract plain text from the HTML
+    return text
+
+# Step 3: Process all files in the directory
+documents = []
+metadata = []
+for file_name in tqdm(os.listdir(input_dir), desc="Loading documents"):
+    if file_name.startswith("full_") and file_name.endswith(".html"):
+        file_path = os.path.join(input_dir, file_name)
+        text = load_and_clean_html(file_path)
+        documents.append(text)
+        metadata.append({"file_path": file_path})
+
+print(f"Loaded {len(documents)} documents")
+# Step 4: Split text into chunks
+text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+    encoding_name="cl100k_base", chunk_size=2000, chunk_overlap=20, separator="\n"
+)
+splitted_docs = []
+splitted_metadata = []
+
+for doc, meta in zip(documents, metadata):
+    chunks = text_splitter.split_text(doc)
+    for chunk in chunks:
+        splitted_docs.append(chunk)
+        splitted_metadata.append(meta)
+# Step 5: Naive text cleaning: for each chunk, remove extra whitespaces and newlines, remove text components less than 50 characters.
+# Notice that headers, menu text items, html tags, warnings in English contain a lot of
+# whitespaces when split with \n. Thus, I removed those instances since almost all of
+# the information for retrieval is conveniently formatted well.
+processed_splitted_docs = []
+processed_metadata = []
+for i, doc in tqdm(enumerate(splitted_docs), desc="Cleaning text"):
+    processed = doc.split("\n")
+    for phrase in processed:
+        if len(phrase) > 50 and " " not in phrase:
+            processed_splitted_docs.append(phrase)
+            processed_metadata.append(splitted_metadata[i])
+
+print(f"Processed {len(processed_splitted_docs)} chunks")
+
+# Step 6: Generate embeddings using BKAI model
+
+# Step 7: Save the vectors to ChromaDB
+vector_db = Chroma.from_texts(
+    texts=processed_splitted_docs,
+    embedding=embedding_model,
+    metadatas=processed_metadata,
+    persist_directory="chroma_db_new" # Directory where the database will be saved
+)
+
+print("Database saved successfully!")
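After Chroma.from_texts, it can be worth confirming that the store round-trips from disk with the file_path metadata intact; keyword names differ across langchain versions (the constructor usually takes embedding_function, while from_texts takes embedding), and older langchain/chromadb combinations also need an explicit vector_db.persist(). A minimal sketch (the query string is a placeholder):

# Sketch only, not part of the commit: reload the persisted store and spot-check metadata.
reloaded = Chroma(
    persist_directory="chroma_db_new",
    embedding_function=embedding_model,
)
for doc in reloaded.similarity_search("your query here", k=3):  # placeholder query
    print(doc.metadata.get("file_path"), "->", doc.page_content[:80])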