{ "cells": [
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Notebook for processing the text data (chunking, cleaning, embeddings)" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
"import json\n",
"from typing import Iterable, List\n",
"from langchain.docstore.document import Document\n",
"\n",
"# Helper methods for storing and loading already generated documents as JSON Lines files\n",
"def store_documents(documents: Iterable[Document], file_path: str) -> None:\n",
"    with open(file_path, \"w\") as jsonl_file:\n",
"        for doc in documents:\n",
"            jsonl_file.write(doc.json() + \"\\n\")\n",
"\n",
"\n",
"def load_documents(file_path: str) -> List[Document]:\n",
"    documents = []\n",
"    with open(file_path, \"r\") as jsonl_file:\n",
"        for line in jsonl_file:\n",
"            data = json.loads(line)\n",
"            obj = Document(**data)\n",
"            documents.append(obj)\n",
"    return documents"
] },
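{ "cell_type": "markdown", "metadata": {}, "source": [ "The next cell is a minimal, hypothetical round-trip sketch (not part of the original pipeline) showing how `store_documents` and `load_documents` are meant to be used together. The file path is only an illustrative placeholder." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Hypothetical round-trip example for the JSONL helpers above.\n",
"# The path is a placeholder; any writable location works.\n",
"example_docs = [Document(page_content=\"ISO 27001 defines requirements for an ISMS.\", metadata={\"source\": \"example\"})]\n",
"\n",
"store_documents(example_docs, \"./example_documents.jsonl\")\n",
"restored_docs = load_documents(\"./example_documents.jsonl\")\n",
"assert restored_docs[0].page_content == example_docs[0].page_content"
] },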
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
"def get_pdf_documents(all_docs: bool) -> List[Document]:\n",
"    \"\"\"\n",
"    Method for returning the documents of the PDFs. Processing and updating take place in update_pdf_documents.\n",
"    The all_docs parameter defines whether to load all documents or only new ones. Loading only the new ones is useful if the index is already built and new documents should be added.\n",
"    \"\"\"\n",
"    if all_docs:\n",
"        pdf_documents = load_documents(\"./../input_data/PDF/documents/all_documents\")\n",
"    else:\n",
"        pdf_documents = load_documents(\"./../input_data/PDF/documents/new_documents\")\n",
"\n",
"    return pdf_documents\n",
"\n",
"def get_web_documents(all_docs: bool) -> List[Document]:\n",
"    \"\"\"\n",
"    Method for returning the already processed web documents. get_web_docs_for_cleaning FIRST needs to be called and its output cleaned manually. As the cleaning is a manual step, the two methods need to be called separately.\n",
"    \"\"\"\n",
"    if all_docs:\n",
"        web_documents = load_documents(\"./../input_data/Web/documents/all_documents\")\n",
"    else:\n",
"        web_documents = load_documents(\"./../input_data/Web/documents/new_documents\")\n",
"\n",
"    return web_documents\n",
"\n",
"def get_template_documents(all_docs: bool) -> List[Document]:\n",
"    \"\"\"\n",
"    Method for returning the documents of the templates.\n",
"    \"\"\"\n",
"    if all_docs:\n",
"        template_documents = load_documents(\"./../input_data/Templates/documents/all_documents\")\n",
"    else:\n",
"        template_documents = load_documents(\"./../input_data/Templates/documents/new_documents\")\n",
"\n",
"    return template_documents\n",
"\n",
"def get_dataset_documents() -> List[Document]:\n",
"    \"\"\"\n",
"    Method for returning the documents of the QA dataset.\n",
"    \"\"\"\n",
"    dataset_documents = load_documents(\"./../input_data/QA_dataset/all_documents\")\n",
"\n",
"    return dataset_documents"
] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
"def get_documents_from_files(all_docs: bool) -> List[Document]:\n",
"    \"\"\"\n",
"    Gets documents from all document types.\n",
"    \"\"\"\n",
"    documents_all = []\n",
"    documents_all.extend(get_pdf_documents(all_docs))\n",
"    documents_all.extend(get_web_documents(all_docs))\n",
"    documents_all.extend(get_template_documents(all_docs))\n",
"    documents_all.extend(get_dataset_documents())\n",
"\n",
"    print(f\"Number of documents: {len(documents_all)}\\n\")\n",
"    return documents_all"
] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"\n",
"def split_docs(documents: List[Document], chunk_size: int, chunk_overlap: int) -> List[Document]:\n",
"    # Splits the documents into overlapping chunks, splitting only at spaces\n",
"    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=[\" \"])\n",
"    chunked_documents = text_splitter.split_documents(documents)\n",
"    return chunked_documents"
] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
"import re\n",
"\n",
"def clean_text(text: str) -> str:\n",
"    # Replace multiple whitespaces (except newlines) with a single space\n",
"    text = re.sub(r\"(?!\\n)\\s+\", \" \", text)\n",
"    # Replace multiple newlines with a single newline\n",
"    text = re.sub(r\"\\n+\", \"\\n\", text)\n",
"    # Remove leading and trailing whitespace\n",
"    text = text.strip()\n",
"    return text\n",
"\n",
"def clean_and_process_chunked_documents(chunked_documents: List[Document]) -> List[Document]:\n",
"    for doc_id, doc in enumerate(chunked_documents, start=1):\n",
"        doc.page_content = clean_text(doc.page_content)\n",
"        # Keep the cleaned (but not lowercased) text and a unique ID in the metadata\n",
"        doc.metadata[\"original_text\"] = doc.page_content\n",
"        doc.metadata[\"doc_ID\"] = doc_id\n",
"\n",
"        # Lowercase the content that will be embedded\n",
"        doc.page_content = doc.page_content.lower()\n",
"\n",
"    return chunked_documents"
] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"\n",
"def get_embedding_model():\n",
"    # Loads the bge-large-en-v1.5 based embedding model from the Hugging Face Hub\n",
"    path = \"Basti8499/bge-large-en-v1.5-ISO-27001\"\n",
"    model = HuggingFaceEmbeddings(model_name=path)\n",
"    return model"
] },
{ "cell_type": "code",
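"execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Quick sanity-check sketch (not part of the original pipeline): load the embedding model\n",
"# defined above and embed a sample query. The query text is only an illustrative placeholder;\n",
"# bge-large-en-v1.5 models produce 1024-dimensional vectors.\n",
"sample_model = get_embedding_model()\n",
"sample_vector = sample_model.embed_query(\"What are the requirements for an ISMS under ISO 27001?\")\n",
"print(len(sample_vector))"
] },
{ "cell_type": "code",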
"execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def create_embedding_vectors(embedding_model, documents: List[Document]):\n", " texts = []\n", " for document in documents:\n", " texts.append(document.page_content)\n", "\n", " embeddings = embedding_model.embed_documents(texts)\n", "\n", " return embeddings" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "def preprocess_data(chunk_size: int, chunk_overlap: int, all_docs: bool):\n", " documents = get_documents_from_files(all_docs)\n", " chunked_documents = split_docs(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", " chunked_cleaned_documents = clean_and_process_chunked_documents(chunked_documents)\n", " embedding_model = get_embedding_model()\n", " embeddings = create_embedding_vectors(embedding_model, chunked_cleaned_documents)\n", "\n", " return chunked_cleaned_documents, embedding_model, embeddings" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 2 }