from langchain_community.vectorstores import Chroma
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, UnstructuredEPubLoader
from typing import List

import matplotlib.pyplot as plt
import tiktoken

from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from ragas import metrics
from finalthesis import constants
from datasets import load_from_disk, concatenate_datasets

EVAL_METRICS = [
    metrics.context_recall,         # measures the extent to which the retrieved context aligns with the annotated answer.
    metrics.context_entity_recall,  # measures the recall of the retrieved context based on the entities in the ground truth.
    metrics.context_precision,      # evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not.
    metrics.answer_similarity,      # measures the semantic similarity between the generated answer and the ground truth.
    metrics.answer_relevancy,       # assesses how pertinent the generated answer is to the given prompt.
    metrics.answer_correctness,     # measures the correctness of the generated answer based on the ground truth.
    metrics.faithfulness,           # measures the factual consistency of the generated answer against the given context.
]


def load_test_set():
    """Load the test set from disk and adapt it so that it is fit for ragas."""
    testset_simple = load_from_disk(constants.TEST_SET_PATH_SIMPLE)
    testset_multi_context = load_from_disk(constants.TEST_SET_PATH_MULTI)
    testset_reasoning = load_from_disk(constants.TEST_SET_PATH_REASONING)
    testset = concatenate_datasets([testset_simple, testset_multi_context, testset_reasoning])
    # Keep the contexts used during test-set generation under a separate name,
    # so the retrieved contexts can later be stored under "contexts" for ragas.
    testset = testset.rename_column("contexts", "origin_contexts")
    return testset


# Helper function for printing docs
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i + 1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )


def count_tokens(text: str, model: str = 'openai') -> int:
    """Return the number of tokens in `text` for the given model family."""
    if model == 'openai':
        # cl100k_base is the tokenizer used by the GPT-3.5 and GPT-4 models.
        encoding = tiktoken.get_encoding("cl100k_base")
        # Encode the text to get the tokens.
        tokens = encoding.encode(text)
    else:
        raise NotImplementedError(f"Model {model} not implemented, currently only 'openai' is supported")
    # Return the number of tokens.
    return len(tokens)


def load_chroma_db_from_disk(persist_directory: str, embedding_function: Embeddings = OpenAIEmbeddings()):
    """Load a persisted Chroma vector store from disk."""
    return Chroma(persist_directory=persist_directory, embedding_function=embedding_function)


def load_epub_documents(directory: str) -> List[Document]:
    """Load all EPUB files found (recursively) in `directory`."""
    book_dir_loader = DirectoryLoader(
        directory,
        glob="**/*.epub",
        show_progress=True,
        loader_cls=UnstructuredEPubLoader,
    )
    docs = book_dir_loader.load()
    return docs


def plot_histogram(data: list[int], title: str, xlabel: str, ylabel: str, bins: int = 30):
    plt.hist(data, bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


def plot_chunk_dist(chunk_lengths: list[int], chunks_name: str, bins: int = 30, count_of: str = 'characters'):
    plt.hist(chunk_lengths, bins=bins)
    plt.title(f"Distribution of the number of {count_of} in the {chunks_name}")
    plt.xlabel(f"number of {count_of}")
    plt.ylabel(f"{chunks_name}")
    plt.show()


def get_pagecontents(documents: list) -> list[str]:
    return [x.page_content for x in documents]
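
# --- Usage sketch (assumption, not part of the original helpers) --------------
# Shows how load_test_set() and EVAL_METRICS could be wired into a ragas run.
# The `dataset_with_answers` argument is assumed to already contain the
# "answer" and retrieved "contexts" columns produced by the RAG pipeline under
# evaluation; the function name itself is hypothetical.
def evaluate_rag_outputs(dataset_with_answers):
    """Run the configured ragas metrics over an answered test set (sketch)."""
    from ragas import evaluate  # ragas exposes `evaluate` at the top level

    # ragas expects the columns: question, answer, contexts, ground_truth.
    result = evaluate(dataset_with_answers, metrics=EVAL_METRICS)
    # The result is dict-like (metric name -> score), so it can be passed
    # directly to plot_result() defined below.
    return result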
def plot_result(result):
    # Get the metric names and values; a local name is used instead of
    # `metrics` to avoid shadowing the imported ragas.metrics module.
    metric_names = list(result.keys())
    values = list(result.values())
    # Create the horizontal bar plot.
    plt.barh(metric_names, values)
    plt.xlim(0, 1)
    plt.xlabel("metric values")
    plt.ylabel("metrics")
    plt.title("Results of the evaluation")
    plt.show()


def get_fixed_size_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    """Split documents into fixed-size character chunks."""
    text_splitter = CharacterTextSplitter(separator="", chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(docs)


def get_recursive_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    """Split documents recursively on paragraph, sentence and word boundaries."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)


def get_semantic_chunks(docs: list[Document], embeddings, breakpoint_threshold_amount: float,
                        breakpoint_threshold_type: str = 'percentile'):
    """Split documents at semantic breakpoints based on embedding distances."""
    text_splitter = SemanticChunker(
        embeddings,
        add_start_index=True,
        breakpoint_threshold_type=breakpoint_threshold_type,
        breakpoint_threshold_amount=breakpoint_threshold_amount,
    )
    return text_splitter.split_documents(docs)
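
# --- Usage sketch (assumption, not part of the original helpers) --------------
# Illustrates how the loader, the chunking helpers and the plotting helpers fit
# together. The EPUB directory and the chunking parameters are made-up example
# values, not taken from the original experiments.
if __name__ == "__main__":
    docs = load_epub_documents("data/books")  # hypothetical directory of .epub files

    # Fixed-size versus recursive chunking with the same nominal chunk size.
    fixed_chunks = get_fixed_size_chunks(docs, chunk_size=1000, chunk_overlap=200)
    recursive_chunks = get_recursive_chunks(docs, chunk_size=1000, chunk_overlap=200)

    # Compare the token-length distributions of the two strategies.
    plot_chunk_dist([count_tokens(c.page_content) for c in fixed_chunks],
                    "fixed-size chunks", count_of="tokens")
    plot_chunk_dist([count_tokens(c.page_content) for c in recursive_chunks],
                    "recursive chunks", count_of="tokens")

    # Semantic chunking needs an embedding model; OpenAIEmbeddings is used here
    # only because it is already imported above.
    semantic_chunks = get_semantic_chunks(docs, OpenAIEmbeddings(),
                                          breakpoint_threshold_amount=95)
    plot_chunk_dist([count_tokens(c.page_content) for c in semantic_chunks],
                    "semantic chunks", count_of="tokens")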