### Notebook for processing the text data (chunking, cleaning, embeddings)

In [2]:
import json
from typing import Iterable
from langchain.docstore.document import Document
from typing import List

# Helper methods for storing and loading already generated documents
def store_documents(documents, file_path: str) -> None:
    """Write documents to ``file_path`` in JSON Lines format.

    Each document is serialized with its own ``json()`` method and written on
    a separate line. An existing file at ``file_path`` is overwritten.

    Args:
        documents: Iterable of objects exposing a ``json()`` method that
            returns the serialized document as a string.
        file_path: Path of the file to write.
    """
    # Explicit UTF-8 so the file content does not depend on the platform's
    # default encoding; load_documents reads it back with the same codec.
    with open(file_path, "w", encoding="utf-8") as jsonl_file:
        jsonl_file.writelines(doc.json() + "\n" for doc in documents)


def load_documents(file_path: str) -> List[Document]:
    """Read documents back from a JSON Lines file written by store_documents.

    Every non-blank line is parsed as a JSON object whose keys are passed as
    keyword arguments to ``Document``.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        The documents in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    documents = []
    with open(file_path, "r", encoding="utf-8") as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines (e.g. left behind by manual editing)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            documents.append(Document(**json.loads(line)))
    return documents

In [3]:
def get_pdf_documents(all_docs: bool) -> List[Document]:
    """
    Return the stored documents of the PDFs.

    According to the original note, processing and updating of these documents
    takes place in update_pdf_documents (not shown in this section).

    Args:
        all_docs: If True, load the file containing all documents; otherwise
            load only the file containing the new documents. Loading only the
            new ones is useful when the index is already built and new
            documents should be added to it.

    Returns:
        The documents read by load_documents from the selected file.
    """
    if all_docs:
        return load_documents("./../input_data/PDF/documents/all_documents")
    return load_documents("./../input_data/PDF/documents/new_documents")

def get_web_documents(all_docs: bool) -> List[Document]:
    """
    Return the already processed web documents.

    According to the original note, get_web_docs_for_cleaning (not shown in
    this section) has to be run first and its output cleaned by hand. Because
    that cleaning is manual, the two steps have to be run separately.

    Args:
        all_docs: If True, load the file with all web documents; otherwise
            load the file with only the new ones.
    """
    if all_docs:
        return load_documents("./../input_data/Web/documents/all_documents")
    return load_documents("./../input_data/Web/documents/new_documents")

def get_template_documents(all_docs: bool) -> List[Document]:
    """
    Return the stored documents of the templates.

    Args:
        all_docs: If True, load the file with all template documents;
            otherwise load the file with only the new ones.
    """
    source_path = (
        "./../input_data/Templates/documents/all_documents"
        if all_docs
        else "./../input_data/Templates/documents/new_documents"
    )
    return load_documents(source_path)

def get_dataset_documents() -> List[Document]:
    """
    Return the documents stored in the QA dataset file.

    Unlike the PDF, web and template loaders there is no all/new switch:
    the single file under QA_dataset is always loaded.
    """
    return load_documents("./../input_data/QA_dataset/all_documents")

In [4]:
def get_documents_from_files(all_docs: bool):
    """
    Collect the documents of every document type into one list.

    The result contains, in this order, the PDF, web, template and QA dataset
    documents. all_docs is forwarded to the PDF, web and template loaders; the
    dataset loader takes no argument. The total count is printed.
    """
    # Unpacking evaluates the loaders left to right, i.e. in the listed order.
    combined = [
        *get_pdf_documents(all_docs),
        *get_web_documents(all_docs),
        *get_template_documents(all_docs),
        *get_dataset_documents(),
    ]

    print(f"Number of documents: {len(combined)}\n")
    return combined

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents: List[Document], chunk_size: int, chunk_overlap: int):
    """
    Split the documents into chunks with a RecursiveCharacterTextSplitter.

    The splitter is configured with a single space as its only separator.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as chunk_size.
        chunk_overlap: Forwarded to the splitter as chunk_overlap.

    Returns:
        The result of the splitter's split_documents call.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=[" "],
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    return splitter.split_documents(documents)

In [6]:
import re

def clean_text(text: str) -> str:
    """Normalize the whitespace of ``text``.

    Runs of whitespace other than newlines are collapsed to a single space,
    runs of consecutive newlines are collapsed to a single newline, and
    leading/trailing whitespace is removed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Collapse horizontal whitespace only. "[^\S\n]" matches any whitespace
    # character except "\n"; the previous pattern "(?!\n)\s+" only checked the
    # first character of a run and therefore swallowed newlines that followed
    # a space or tab.
    text = re.sub(r"[^\S\n]+", " ", text)
    # Replace multiple newlines with a single newline
    text = re.sub(r"\n+", "\n", text)
    # Remove leading and trailing whitespace
    return text.strip()

def clean_and_process_chunked_documents(chunkedDocuments: List[Document]) -> List[Document]:
    """
    Clean every chunk and enrich its metadata in place.

    For each chunk the page content is cleaned with clean_text. The cleaned,
    case-preserving text is stored under metadata["original_text"], a running
    ID starting at 1 is stored under metadata["doc_ID"], and the page content
    itself is replaced by the lower-cased cleaned text.

    Returns:
        The same list that was passed in; its documents are modified in place.
    """
    for doc_id, chunk in enumerate(chunkedDocuments, start=1):
        cleaned = clean_text(chunk.page_content)
        chunk.metadata["original_text"] = cleaned
        chunk.metadata["doc_ID"] = doc_id
        # The text used downstream is lower-cased; the original casing stays
        # available in the metadata.
        chunk.page_content = cleaned.lower()

    return chunkedDocuments

In [7]:
from langchain.embeddings import HuggingFaceEmbeddings

def get_embedding_model():
    """
    Create the HuggingFaceEmbeddings instance used for embedding the documents.

    The model name is fixed to "Basti8499/bge-large-en-v1.5-ISO-27001".
    """
    return HuggingFaceEmbeddings(model_name="Basti8499/bge-large-en-v1.5-ISO-27001")

In [8]:
def create_embedding_vectors(embedding_model, documents: List[Document]):
    """
    Embed the page content of every document.

    Args:
        embedding_model: Object providing an embed_documents method that
            accepts a list of texts.
        documents: Documents whose page_content is embedded, in order.

    Returns:
        Whatever embedding_model.embed_documents returns for the list of
        page contents.
    """
    page_texts = [doc.page_content for doc in documents]
    return embedding_model.embed_documents(page_texts)

In [1]:
def preprocess_data(chunk_size: int, chunk_overlap: int, all_docs: bool):
    """
    Run the whole preprocessing pipeline: load, chunk, clean and embed.

    Args:
        chunk_size: Chunk size passed on to split_docs.
        chunk_overlap: Chunk overlap passed on to split_docs.
        all_docs: Passed on to get_documents_from_files to choose between
            all documents and only the new ones.

    Returns:
        A tuple of (chunked and cleaned documents, embedding model,
        embeddings of those documents).
    """
    raw_docs = get_documents_from_files(all_docs)
    chunks = clean_and_process_chunked_documents(
        split_docs(raw_docs, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    )
    model = get_embedding_model()
    vectors = create_embedding_vectors(model, chunks)

    return chunks, model, vectors