{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Write a Python notebook that creates a vector database using ChromaDB (use LangChain)\n", "- ingest the document files only (full_ItemID.html files)\n", "- it is required to save the file path in the metadata" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "import os\n", "from tqdm import tqdm\n", "from langchain_text_splitters import CharacterTextSplitter\n", "from langchain.vectorstores import Chroma\n", "from bs4 import BeautifulSoup\n", "from sentence_transformers import SentenceTransformer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading documents: 100%|██████████| 5101/5101 [52:41<00:00, 1.61it/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loaded 5101 documents\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Created a chunk of size 3623, which is longer than the specified 2000\n", "Created a chunk of size 10118, which is longer than the specified 2000\n", "Created a chunk of size 10168, which is longer than the specified 2000\n", "Created a chunk of size 3836, which is longer than the specified 2000\n", "Created a chunk of size 8935, which is longer than the specified 2000\n", "Created a chunk of size 5101, which is longer than the specified 2000\n", "Created a chunk of size 16204, which is longer than the specified 2000\n", "Created a chunk of size 8374, which is longer than the specified 2000\n", "Created a chunk of size 3134, which is longer than the specified 2000\n" ] } ], "source": [ "# Step 1: HTML dir\n", "input_dir = rf\"D:\\PhapDien_semantic_search\\BoPhapDienDienTu\\vbpl\"\n", "model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')\n", "\n", "# Step 2: Clean the HTML files\n", "def load_and_clean_html(file_path):\n", " with open(file_path, \"r\", encoding=\"utf-8\") as f:\n", " html_content = f.read()\n", " soup = BeautifulSoup(html_content, \"html.parser\")\n", " text = soup.get_text() # Extract plain text from the HTML\n", " return text\n", "\n", "# Step 3: Process all files in the directory\n", "documents = []\n", "metadata = []\n", "for file_name in tqdm(os.listdir(input_dir), desc=\"Loading documents\"):\n", " if file_name.startswith(\"full_\") and file_name.endswith(\".html\"):\n", " file_path = os.path.join(input_dir, file_name)\n", " text = load_and_clean_html(file_path)\n", " documents.append(text)\n", " metadata.append({\"file_path\": file_path})\n", "\n", "print(f\"Loaded {len(documents)} documents\")\n", "# Step 4: Split text into chunks\n", "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n", " encoding_name=\"cl100k_base\", chunk_size=2000, chunk_overlap=20, separator=\"\\n\"\n", ")\n", "splitted_docs = []\n", "splitted_metadata = []\n", "\n", "for doc, meta in zip(documents, metadata):\n", " chunks = text_splitter.split_text(doc)\n", " for chunk in chunks:\n", " splitted_docs.append(chunk)\n", " splitted_metadata.append(meta)\n", "# Step 5: Naive text cleaning: for each chunk, remove extra whitespaces and newlines, remove text components less than 50 characters.\n", "# Notice that headers , menu text items, html tags, warnings in English contain a lot of \n", "# whitespaces when splitted with \\n. 
"# boilerplate, and English-language warnings contain many consecutive spaces once a chunk is split on \"\\n\",\n", "# so those fragments are dropped; almost all of the text needed for retrieval is already cleanly formatted.\n", "print(f\"Split into {len(splitted_docs)} chunks\")\n", "\n", "processed_splitted_docs = []\n", "processed_metadata = []\n", "for i, doc in enumerate(splitted_docs):\n", "    for phrase in doc.split(\"\\n\"):\n", "        # Keep substantive fragments: longer than 50 characters and free of runs of spaces\n", "        if len(phrase) > 50 and \"  \" not in phrase:\n", "            processed_splitted_docs.append(phrase)\n", "            processed_metadata.append(splitted_metadata[i])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Wrapper exposing the embed_documents/embed_query interface that LangChain vector stores expect\n", "class SentenceTransformerWrapper:\n", "    def __init__(self, model_name):\n", "        self.model = SentenceTransformer(model_name)\n", "\n", "    def embed_documents(self, texts):\n", "        # Convert a list of texts to embeddings\n", "        return self.model.encode(texts, show_progress_bar=True).tolist()\n", "\n", "    def embed_query(self, text):\n", "        # Convert a single query to its embedding\n", "        return self.model.encode(text).tolist()\n", "\n", "# Instantiate the wrapper with the Vietnamese bi-encoder\n", "embedding_model = SentenceTransformerWrapper('bkai-foundation-models/vietnamese-bi-encoder')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Batches: 0%| | 0/7 [00:00
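# --- Hedged sketch (an assumption, not one of the recorded cells above): building the Chroma store ---
# The task stated at the top of the notebook is to index the cleaned chunks in ChromaDB through
# LangChain while keeping each chunk's source file path in its metadata. A minimal way to do that
# with the objects defined above is sketched here; the collection name "phapdien_vbpl", the persist
# directory "chroma_db", and the example query are illustrative placeholders, not values taken from
# the notebook.
from langchain_community.vectorstores import Chroma

vectordb = Chroma.from_texts(
    texts=processed_splitted_docs,      # cleaned text fragments
    embedding=embedding_model,          # SentenceTransformerWrapper defined above
    metadatas=processed_metadata,       # each entry carries {"file_path": ...}
    collection_name="phapdien_vbpl",    # placeholder collection name
    persist_directory="chroma_db",      # placeholder path; the collection is persisted here on disk
)

# Sanity check: run a similarity search and read the stored file paths back from the metadata.
for doc in vectordb.similarity_search("điều kiện cấp giấy phép lao động", k=3):
    print(doc.metadata["file_path"])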