"""Helper functions for the thesis experiments: loading the test set and EPUB
corpus, chunking documents, counting tokens, and plotting evaluation results."""
from typing import List

import matplotlib.pyplot as plt
import tiktoken
from datasets import concatenate_datasets, load_from_disk
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, UnstructuredEPubLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from ragas import metrics

from finalthesis import constants



EVAL_METRICS = [
    metrics.context_recall,        # extent to which the retrieved context aligns with the annotated answer
    metrics.context_entity_recall, # recall of entities from the ground truth that appear in the retrieved context
    metrics.context_precision,     # whether all ground-truth relevant items in the contexts are ranked higher
    metrics.answer_similarity,     # semantic similarity between the generated answer and the ground truth
    metrics.answer_relevancy,      # how pertinent the generated answer is to the given prompt
    metrics.answer_correctness,    # correctness of the generated answer based on the ground truth
    metrics.faithfulness,          # factual consistency of the generated answer against the given context
]
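
# Usage sketch (not executed here): these metrics are typically passed to
# ragas.evaluate together with a dataset exposing "question", "answer",
# "contexts" and "ground_truth" columns, e.g.:
#
#     from ragas import evaluate
#     result = evaluate(dataset, metrics=EVAL_METRICS)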

def load_test_set():
    """Load the three test splits from disk and adapt them for ragas evaluation."""
    testset_simple = load_from_disk(constants.TEST_SET_PATH_SIMPLE)
    testset_multi_context = load_from_disk(constants.TEST_SET_PATH_MULTI)
    testset_reasoning = load_from_disk(constants.TEST_SET_PATH_REASONING)
    testset = concatenate_datasets([testset_simple, testset_multi_context, testset_reasoning])
    # keep the original contexts under a different name so they do not clash
    # with the retrieved contexts added during evaluation
    testset = testset.rename_column("contexts", "origin_contexts")
    return testset


def pretty_print_docs(docs):
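    """Print each document's page content, separated by a horizontal rule."""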
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )


def count_tokens(text: str, model: str = 'openai') -> int:
    """Count the number of tokens in ``text`` for the given model family."""
    if model == 'openai':
        # cl100k_base is the encoding used by gpt-3.5-turbo and gpt-4
        encoding = tiktoken.get_encoding("cl100k_base")
        tokens = encoding.encode(text)
    else:
        raise NotImplementedError(f"Model {model} not implemented, currently only 'openai' is supported")
    return len(tokens)

def load_chroma_db_from_disk(persist_directory: str, embedding_function: Embeddings = None):
    """Load a persisted Chroma vector store from disk."""
    if embedding_function is None:
        embedding_function = OpenAIEmbeddings()
    # use the passed embedding function rather than always creating a new one
    return Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
    
def load_epub_documents(directory: str) -> List[Document]:
    """Load all EPUB files under ``directory`` (recursively) as Documents."""
    book_dir_loader = DirectoryLoader(
        directory,
        glob="**/*.epub",
        show_progress=True,
        loader_cls=UnstructuredEPubLoader,
    )
    return book_dir_loader.load()


def plot_histogram(data: list[int], title: str, xlabel: str, ylabel: str, bins: int = 30):
    plt.hist(data, bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

def plot_chunk_dist(chunk_lengths: list[int], chunks_name: str, bins: int = 30, count_of: str = 'characters'):
    plt.hist(chunk_lengths, bins=bins)
    plt.title(f"Distribution of the number of {count_of} in the {chunks_name}")
    plt.xlabel(f"number of {count_of}")
    plt.ylabel(chunks_name)
    plt.show()


def get_pagecontents(documents: list[Document]) -> list[str]:
    return [doc.page_content for doc in documents]


def plot_result(result):
    """Plot a horizontal bar chart of metric scores from an evaluation result."""
    # use a local name that does not shadow the imported ragas ``metrics`` module
    metric_names = list(result.keys())
    values = list(result.values())

    plt.barh(metric_names, values)
    plt.xlim(0, 1)
    plt.xlabel("metric values")
    plt.ylabel("metrics")
    plt.title("Results of the evaluation")
    plt.show()


def get_fixed_size_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    """Split documents into fixed-size character chunks (empty separator splits at exact character counts)."""
    text_splitter = CharacterTextSplitter(
        separator="",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)


def get_recursive_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    """Split documents recursively, preferring paragraph, line, and word boundaries."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)



def get_semantic_chunks(docs: list[Document], embeddings, breakpoint_threshold_amount: float, breakpoint_threshold_type: str = 'percentile'):
    """Split documents at semantic breakpoints derived from embedding similarity."""
    text_splitter = SemanticChunker(
        embeddings,
        add_start_index=True,
        breakpoint_threshold_type=breakpoint_threshold_type,
        breakpoint_threshold_amount=breakpoint_threshold_amount,
    )
    return text_splitter.split_documents(docs)
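

# Minimal end-to-end sketch (assumptions: an EPUB corpus at "data/books", a
# hypothetical path, and an OPENAI_API_KEY set in the environment):
#
#     docs = load_epub_documents("data/books")
#     chunks = get_recursive_chunks(docs, chunk_size=1000)
#     token_counts = [count_tokens(chunk.page_content) for chunk in chunks]
#     plot_chunk_dist(token_counts, "recursive chunks", count_of="tokens")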