# Helper functions for document loading, chunking, token counting, plotting, and ragas evaluation.
from langchain_community.vectorstores import Chroma
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, UnstructuredEPubLoader
from typing import List
import matplotlib.pyplot as plt
import tiktoken
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from ragas import metrics
from finalthesis import constants
from datasets import load_from_disk, concatenate_datasets
EVAL_METRICS = [
    metrics.context_recall,         # measures the extent to which the retrieved context aligns with the annotated answer.
    metrics.context_entity_recall,  # measures the fraction of entities in the ground truth that also appear in the retrieved context.
    metrics.context_precision,      # evaluates whether the ground-truth relevant items in the contexts are ranked at the top.
    metrics.answer_similarity,      # measures the semantic similarity between the generated answer and the ground_truth.
    metrics.answer_relevancy,       # assesses how pertinent the generated answer is to the given prompt.
    metrics.answer_correctness,     # measures the correctness of the generated answer based on the ground_truth.
    metrics.faithfulness,           # measures the factual consistency of the generated answer against the given context.
]
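# A minimal sketch of how this metric list would be used (assuming an OpenAI API
# key is configured and `dataset` has the columns ragas expects: "question",
# "answer", "contexts", "ground_truth"):
#
#   from ragas import evaluate
#   result = evaluate(dataset, metrics=EVAL_METRICS)
#   print(result)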
def load_test_set():
    """Load the test set from disk and adapt it so it is fit for ragas."""
    testset_simple = load_from_disk(constants.TEST_SET_PATH_SIMPLE)
    testset_multi_context = load_from_disk(constants.TEST_SET_PATH_MULTI)
    testset_reasoning = load_from_disk(constants.TEST_SET_PATH_REASONING)
    testset = concatenate_datasets([testset_simple, testset_multi_context, testset_reasoning])
    testset = testset.rename_column("contexts", "origin_contexts")
    return testset
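# Example usage (a sketch; the rename keeps the generation-time contexts around
# as "origin_contexts", so retrieved contexts can later be stored in "contexts"):
#
#   testset = load_test_set()
#   print(testset.column_names)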
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )
def count_tokens(text: str, model='openai') -> int:
    if model == 'openai':
        # cl100k_base is the encoding used by OpenAI's GPT-3.5-turbo and GPT-4 models
        encoding = tiktoken.get_encoding("cl100k_base")
        # Encode the text to get the tokens
        tokens = encoding.encode(text)
    else:
        raise NotImplementedError(f"Model {model} not implemented, currently only 'openai' is supported")
    # Return the number of tokens
    return len(tokens)
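# Example usage (a sketch):
#
#   n_tokens = count_tokens("How many tokens does this sentence have?")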
def load_chroma_db_from_disk(persist_directory: str, embedding_function: Embeddings = OpenAIEmbeddings()):
    # Load a persisted Chroma index from disk, using the given embedding function
    return Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
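# Example usage (a sketch; "chroma_db/" is a hypothetical persist directory, and
# the embedding function must match the one used when the index was built):
#
#   db = load_chroma_db_from_disk("chroma_db/", OpenAIEmbeddings())
#   retriever = db.as_retriever()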
def load_epub_documents(directory: str) -> List[Document]:
    book_dir_loader = DirectoryLoader(
        directory,
        glob="**/*.epub",
        show_progress=True,
        loader_cls=UnstructuredEPubLoader,
    )
    docs = book_dir_loader.load()
    return docs
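# Example usage (a sketch; "data/books" is a hypothetical directory, and EPUB
# parsing via unstructured requires pandoc to be installed):
#
#   docs = load_epub_documents("data/books")
#   pretty_print_docs(docs[:2])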
def plot_histogram(data: list[int], title: str, xlabel: str, ylabel: str, bins: int = 30):
    plt.hist(data, bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()
def plot_chunk_dist(chunk_lengths: list[int], chunks_name: str, bins: int = 30, count_of: str = 'characters'):
    plt.hist(chunk_lengths, bins=bins)
    plt.title(f"Distribution of the number of {count_of} in the {chunks_name}")
    plt.xlabel(f"number of {count_of}")
    plt.ylabel(f"number of {chunks_name}")
    plt.show()
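# Example usage (a sketch, assuming `chunks` holds Documents produced by one of
# the splitters below):
#
#   plot_chunk_dist([count_tokens(c.page_content) for c in chunks],
#                   "recursive chunks", count_of="tokens")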
def get_pagecontents(documents: list):
    return [x.page_content for x in documents]
def plot_result(result):
    # getting the metric names and values (avoid shadowing the imported `metrics` module)
    metric_names = list(result.keys())
    values = list(result.values())
    # creating the bar plot
    plt.barh(metric_names, values)
    plt.xlim(0, 1)
    plt.xlabel("metric values")
    plt.ylabel("metrics")
    plt.title("Results of the evaluation")
    plt.show()
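# Example usage (a sketch, assuming the ragas evaluation result behaves like a
# mapping from metric name to score, which is what plot_result expects):
#
#   from ragas import evaluate
#   scores = evaluate(eval_dataset, metrics=EVAL_METRICS)
#   plot_result(scores)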
def get_fixed_size_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    # separator="" splits purely on character count, yielding fixed-size chunks
    text_splitter = CharacterTextSplitter(
        separator="",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)
def get_recursive_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)
def get_semantic_chunks(docs: list[Document], embeddings, breakpoint_threshold_amount: float, breakpoint_threshold_type: str = 'percentile'):
    text_splitter = SemanticChunker(
        embeddings,
        add_start_index=True,
        breakpoint_threshold_type=breakpoint_threshold_type,
        breakpoint_threshold_amount=breakpoint_threshold_amount,
    )
    return text_splitter.split_documents(docs)
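# Example usage (a sketch comparing the three chunking strategies; the chunk
# size and the 95th-percentile breakpoint are illustrative values, not tuned ones):
#
#   docs = load_epub_documents("data/books")
#   fixed_chunks = get_fixed_size_chunks(docs, chunk_size=1000)
#   recursive_chunks = get_recursive_chunks(docs, chunk_size=1000)
#   semantic_chunks = get_semantic_chunks(docs, OpenAIEmbeddings(),
#                                         breakpoint_threshold_amount=95)
#   plot_chunk_dist([len(t) for t in get_pagecontents(recursive_chunks)],
#                   "recursive chunks")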