# Helper functions for document loading, chunking, token counting, and plotting RAG evaluation results.
from typing import List

import matplotlib.pyplot as plt
import tiktoken
from datasets import concatenate_datasets, load_from_disk
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, UnstructuredEPubLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from ragas import metrics

from finalthesis import constants

EVAL_METRICS = [
    metrics.context_recall,         # measures the extent to which the retrieved context aligns with the annotated answer.
    metrics.context_entity_recall,  # measures the fraction of entities in the ground truth that also appear in the retrieved context.
    metrics.context_precision,      # evaluates whether the ground-truth relevant items present in the contexts are ranked at the top.
    metrics.answer_similarity,      # measures the semantic similarity between the generated answer and the ground truth.
    metrics.answer_relevancy,       # assesses how pertinent the generated answer is to the given prompt.
    metrics.answer_correctness,     # measures the correctness of the generated answer based on the ground truth.
    metrics.faithfulness,           # measures the factual consistency of the generated answer against the retrieved context.
]
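
# Illustrative sketch (not executed by this module): the metrics above are meant to be
# passed to ragas.evaluate together with a dataset containing "question", "answer",
# "contexts" and "ground_truth" columns. `eval_dataset` below is a placeholder name,
# not something defined in this module.
#
#   from ragas import evaluate
#   result = evaluate(eval_dataset, metrics=EVAL_METRICS)
#   plot_result(result)  # see plot_result further down
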
def load_test_set():
    """Load the test set from disk and adapt it so that it is fit for Ragas."""
    testset_simple = load_from_disk(constants.TEST_SET_PATH_SIMPLE)
    testset_multi_context = load_from_disk(constants.TEST_SET_PATH_MULTI)
    testset_reasoning = load_from_disk(constants.TEST_SET_PATH_REASONING)
    testset = concatenate_datasets([testset_simple, testset_multi_context, testset_reasoning])
    # keep the original (generator-produced) contexts under a separate column name
    testset = testset.rename_column("contexts", "origin_contexts")
    return testset

def pretty_print_docs(docs):
    """Print the documents, separated by a horizontal rule."""
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )

def count_tokens(text: str, model: str = 'openai') -> int:
    """Count the number of tokens in `text` for the given model family."""
    if model == 'openai':
        # cl100k_base is the encoding used by the GPT-3.5 / GPT-4 models
        encoding = tiktoken.get_encoding("cl100k_base")
        tokens = encoding.encode(text)
    else:
        raise NotImplementedError(f"Model {model} not implemented, currently only 'openai' is supported")
    return len(tokens)

def load_chroma_db_from_disk(persist_directory: str, embedding_function: Embeddings = OpenAIEmbeddings()):
    """Load a persisted Chroma vector store from disk."""
    return Chroma(persist_directory=persist_directory, embedding_function=embedding_function)

def load_epub_documents(directory: str) -> List[Document]:
    """Load all EPUB files found (recursively) in `directory`."""
    book_dir_loader = DirectoryLoader(
        directory,
        glob="**/*.epub",
        show_progress=True,
        loader_cls=UnstructuredEPubLoader,
    )
    docs = book_dir_loader.load()
    return docs

def plot_histogram(data: list[int], title: str, xlabel: str, ylabel: str, bins: int = 30):
    """Plot a histogram of `data` with the given title and axis labels."""
    plt.hist(data, bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

def plot_chunk_dist(chunk_lengths: list[int], chunks_name: str, bins: int = 30, count_of: str = 'characters'):
    """Plot the distribution of chunk lengths, counted in characters or tokens."""
    plt.hist(chunk_lengths, bins=bins)
    plt.title(f"Distribution of the number of {count_of} in the {chunks_name}")
    plt.xlabel(f"number of {count_of}")
    plt.ylabel(f"number of {chunks_name}")
    plt.show()
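
# Illustrative usage (assuming `chunks` is a list of Documents produced by one of the
# chunking helpers below; the variable itself is not defined in this module):
#
#   token_counts = [count_tokens(c.page_content) for c in chunks]
#   plot_chunk_dist(token_counts, "recursive chunks", count_of="tokens")
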
def get_pagecontents(documents: list[Document]) -> list[str]:
    """Extract the raw text from a list of Documents."""
    return [x.page_content for x in documents]

def plot_result(result):
    """Plot the Ragas evaluation scores as a horizontal bar chart."""
    # use a different name than `metrics` so the imported module is not shadowed
    metric_names = list(result.keys())
    values = list(result.values())
    # create the bar plot
    plt.barh(metric_names, values)
    plt.xlim(0, 1)
    plt.xlabel("metric values")
    plt.ylabel("metrics")
    plt.title("Results of the evaluation")
    plt.show()

def get_fixed_size_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200) -> list[Document]:
    """Split documents into fixed-size character chunks (no separator is respected)."""
    text_splitter = CharacterTextSplitter(
        separator="",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)

def get_recursive_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200) -> list[Document]:
    """Split documents with the recursive splitter (paragraphs, then lines, then words)."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)

def get_semantic_chunks(docs: list[Document], embeddings, breakpoint_threshold_amount: float, breakpoint_threshold_type: str = 'percentile') -> list[Document]:
    """Split documents at semantic breakpoints detected via embedding distance."""
    text_splitter = SemanticChunker(
        embeddings,
        add_start_index=True,
        breakpoint_threshold_type=breakpoint_threshold_type,
        breakpoint_threshold_amount=breakpoint_threshold_amount,
    )
    return text_splitter.split_documents(docs)
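
# End-to-end sketch of how these helpers fit together (illustrative only; the directory,
# query and parameter values below are assumptions, not values defined in this module):
#
#   docs = load_epub_documents("data/books")
#   chunks = get_recursive_chunks(docs, chunk_size=1000, chunk_overlap=200)
#   db = Chroma.from_documents(chunks, OpenAIEmbeddings(), persist_directory="chroma_db")
#   retriever = db.as_retriever()
#   pretty_print_docs(retriever.invoke("What is the book about?"))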