# Helper functions for loading documents, chunking, token counting, vector-store access, and plotting RAG evaluation results
from langchain_community.vectorstores import Chroma
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, UnstructuredEPubLoader
from typing import List
import matplotlib.pyplot as plt
import tiktoken
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from ragas import metrics
from finalthesis import constants
from datasets import load_from_disk, concatenate_datasets
EVAL_METRICS = [
    metrics.context_recall, # measures the extent to which the retrieved context aligns with the annotated answer.
    metrics.context_entity_recall, # measures recall based on the entities present in both the ground truth and the retrieved context.
metrics.context_precision, # evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not.
metrics.answer_similarity, # measures the semantic similarity between the generated answer and the ground_truth.
metrics.answer_relevancy, # assesses how pertinent the generated answer is to the given prompt.
metrics.answer_correctness, # measures the correctness of the generated answer based on the ground_truth.
metrics.faithfulness, # measures the factual consistency of the generated answer against the given context.
]
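
# Illustrative sketch (not part of the original file): EVAL_METRICS is meant to be
# passed to ragas.evaluate on a dataset that follows the Ragas schema with
# "question", "contexts", "answer" and "ground_truth" columns; the `dataset`
# argument here is an assumption, not something defined in this module.
def run_ragas_evaluation(dataset):
    from ragas import evaluate
    return evaluate(dataset, metrics=EVAL_METRICS)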
def load_test_set():
"""Load the test set from the disk. And adapt it that it's fir for ragas"""
testset_simple = load_from_disk(constants.TEST_SET_PATH_SIMPLE)
testset_multi_context = load_from_disk(constants.TEST_SET_PATH_MULTI)
testset_reasoning = load_from_disk(constants.TEST_SET_PATH_REASONING)
testset = concatenate_datasets([testset_simple, testset_multi_context, testset_reasoning])
testset = testset.rename_column("contexts", "origin_contexts")
return testset
def pretty_print_docs(docs):
print(
f"\n{'-' * 100}\n".join(
[f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
)
)
def count_tokens(text: str, model: str = 'openai') -> int:
    if model == 'openai':
        # cl100k_base is the tokenizer used by OpenAI's GPT-3.5 / GPT-4 models
        encoding = tiktoken.get_encoding("cl100k_base")
# Encode the text to get the tokens
tokens = encoding.encode(text)
else:
raise NotImplementedError(f"Model {model} not implemented, currently only 'openai' is supported")
# Return the number of tokens
return len(tokens)
def load_chroma_db_from_disk(persist_directory: str, embedding_function: Embeddings = OpenAIEmbeddings()):
    # load the persisted vector store from disk with the provided embedding function
    return Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
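
# Illustrative sketch (not part of the original file): a persisted Chroma store is
# typically built once from chunked documents and later reloaded with the helper
# above; the "chroma_db" directory name and the `chunks` argument are assumptions.
def build_chroma_db(chunks: List[Document], persist_directory: str = "chroma_db"):
    return Chroma.from_documents(chunks, OpenAIEmbeddings(), persist_directory=persist_directory)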
def load_epub_documents(directory: str) -> List[Document]:
book_dir_loader = DirectoryLoader(
directory,
glob="**/*.epub",
show_progress=True,
loader_cls=UnstructuredEPubLoader
)
docs = book_dir_loader.load()
return docs
def plot_histogram(data: list[int], title: str, xlabel: str, ylabel: str, bins: int = 30):
plt.hist(data, bins=bins)
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.show()
def plot_chunk_dist(chunk_lengths: list[int], chunks_name: str, bins: int = 30, count_of: str = 'characters'):
plt.hist(chunk_lengths, bins=bins)
plt.title(f"Distribution of the number of {count_of} in the {chunks_name}")
plt.xlabel(f"number of {count_of}")
plt.ylabel(f"{chunks_name}")
plt.show()
def get_pagecontents(documents: list[Document]) -> list[str]:
return [x.page_content for x in documents]
def plot_result(result):
    # extract the metric names and values (renamed to avoid shadowing the imported `metrics` module)
    metric_names = list(result.keys())
    values = list(result.values())
    # creating the horizontal bar plot
    plt.barh(metric_names, values)
    plt.xlim(0, 1)
    plt.xlabel("metric values")
    plt.ylabel("metrics")
    plt.title("Results of the evaluation")
    plt.show()
def get_fixed_size_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    # split on raw characters only, producing chunks of (roughly) fixed size
    text_splitter = CharacterTextSplitter(separator="",
                                          chunk_size=chunk_size,
                                          chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(docs)
def get_recursive_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    # split recursively on paragraphs, lines and words before falling back to characters
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(docs)
def get_semantic_chunks(docs: list[Document], embeddings: Embeddings, breakpoint_threshold_amount: float, breakpoint_threshold_type: str = 'percentile'):
    # split where the embedding distance between adjacent sentences exceeds the breakpoint threshold
    text_splitter = SemanticChunker(
        embeddings,
        add_start_index=True,
        breakpoint_threshold_type=breakpoint_threshold_type,
        breakpoint_threshold_amount=breakpoint_threshold_amount,
    )
    return text_splitter.split_documents(docs)
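
# Illustrative end-to-end sketch (not part of the original file): load EPUBs, chunk
# them with one of the splitters above, and inspect the token-length distribution.
# The "data/books" path and the chunk size are assumptions for this example only.
def example_chunking_pipeline(directory: str = "data/books", chunk_size: int = 1000):
    docs = load_epub_documents(directory)
    chunks = get_recursive_chunks(docs, chunk_size=chunk_size)
    token_counts = [count_tokens(chunk.page_content) for chunk in chunks]
    plot_chunk_dist(token_counts, "recursive chunks", count_of="tokens")
    return chunks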