"""Helper functions for the thesis experiments: loading the test set and EPUB
corpus, chunking documents, counting tokens, and plotting evaluation results."""
from typing import List

import matplotlib.pyplot as plt
import tiktoken
from datasets import concatenate_datasets, load_from_disk
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, UnstructuredEPubLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from ragas import metrics

from finalthesis import constants



EVAL_METRICS = [
    metrics.context_recall,        # extent to which the retrieved context aligns with the annotated answer
    metrics.context_entity_recall, # recall of entities from the ground truth that appear in the retrieved context
    metrics.context_precision,     # whether all ground-truth relevant items in the contexts are ranked higher
    metrics.answer_similarity,     # semantic similarity between the generated answer and the ground truth
    metrics.answer_relevancy,      # how pertinent the generated answer is to the given prompt
    metrics.answer_correctness,    # correctness of the generated answer based on the ground truth
    metrics.faithfulness,          # factual consistency of the generated answer against the given context
]
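
# Usage sketch (not executed here): these metrics are typically passed to
# ragas.evaluate together with a dataset exposing "question", "answer",
# "contexts" and "ground_truth" columns, e.g.:
#
#     from ragas import evaluate
#     result = evaluate(dataset, metrics=EVAL_METRICS)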

def load_test_set():
    """Load the three test splits from disk and adapt them for ragas evaluation."""
    testset_simple = load_from_disk(constants.TEST_SET_PATH_SIMPLE)
    testset_multi_context = load_from_disk(constants.TEST_SET_PATH_MULTI)
    testset_reasoning = load_from_disk(constants.TEST_SET_PATH_REASONING)
    testset = concatenate_datasets([testset_simple, testset_multi_context, testset_reasoning])
    # keep the original contexts under a different name so they do not clash
    # with the retrieved contexts added during evaluation
    testset = testset.rename_column("contexts", "origin_contexts")
    return testset


def pretty_print_docs(docs):
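    """Print each document's page content, separated by a horizontal rule."""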
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )


def count_tokens(text: str, model: str = 'openai') -> int:
    """Count the number of tokens in ``text`` for the given model family."""
    if model == 'openai':
        # cl100k_base is the encoding used by gpt-3.5-turbo and gpt-4
        encoding = tiktoken.get_encoding("cl100k_base")
        tokens = encoding.encode(text)
    else:
        raise NotImplementedError(f"Model {model} not implemented, currently only 'openai' is supported")
    return len(tokens)

def load_chroma_db_from_disk(persist_directory: str, embedding_function: Embeddings = None):
    """Load a persisted Chroma vector store from disk."""
    if embedding_function is None:
        embedding_function = OpenAIEmbeddings()
    # use the passed embedding function rather than always creating a new one
    return Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
    
def load_epub_documents(directory: str) -> List[Document]:
    """Load all EPUB files under ``directory`` (recursively) as Documents."""
    book_dir_loader = DirectoryLoader(
        directory,
        glob="**/*.epub",
        show_progress=True,
        loader_cls=UnstructuredEPubLoader,
    )
    return book_dir_loader.load()


def plot_histogram(data: list[int], title: str, xlabel: str, ylabel: str, bins: int = 30):
    plt.hist(data, bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

def plot_chunk_dist(chunk_lengths: list[int], chunks_name: str, bins: int = 30, count_of: str = 'characters'):
    plt.hist(chunk_lengths, bins=bins)
    plt.title(f"Distribution of the number of {count_of} in the {chunks_name}")
    plt.xlabel(f"number of {count_of}")
    plt.ylabel(chunks_name)
    plt.show()


def get_pagecontents(documents: list[Document]) -> list[str]:
    return [doc.page_content for doc in documents]


def plot_result(result):
    """Plot a horizontal bar chart of metric scores from an evaluation result."""
    # use a local name that does not shadow the imported ragas ``metrics`` module
    metric_names = list(result.keys())
    values = list(result.values())

    plt.barh(metric_names, values)
    plt.xlim(0, 1)
    plt.xlabel("metric values")
    plt.ylabel("metrics")
    plt.title("Results of the evaluation")
    plt.show()


def get_fixed_size_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    """Split documents into fixed-size character chunks (empty separator splits at exact character counts)."""
    text_splitter = CharacterTextSplitter(
        separator="",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)


def get_recursive_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    """Split documents recursively, preferring paragraph, line, and word boundaries."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)



def get_semantic_chunks(docs: list[Document], embeddings, breakpoint_threshold_amount: float, breakpoint_threshold_type: str = 'percentile'):
    """Split documents at semantic breakpoints derived from embedding similarity."""
    text_splitter = SemanticChunker(
        embeddings,
        add_start_index=True,
        breakpoint_threshold_type=breakpoint_threshold_type,
        breakpoint_threshold_amount=breakpoint_threshold_amount,
    )
    return text_splitter.split_documents(docs)
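

# Minimal end-to-end sketch (assumptions: an EPUB corpus at "data/books", a
# hypothetical path, and an OPENAI_API_KEY set in the environment):
#
#     docs = load_epub_documents("data/books")
#     chunks = get_recursive_chunks(docs, chunk_size=1000)
#     token_counts = [count_tokens(chunk.page_content) for chunk in chunks]
#     plot_chunk_dist(token_counts, "recursive chunks", count_of="tokens")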