# Helper functions for document loading, chunking, token counting, plotting, and ragas evaluation.
from langchain_community.vectorstores import Chroma
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, UnstructuredEPubLoader
from typing import List
import matplotlib.pyplot as plt
import tiktoken
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from ragas import metrics
from finalthesis import constants
from datasets import load_from_disk, concatenate_datasets
EVAL_METRICS = [
    metrics.context_recall,         # measures the extent to which the retrieved context aligns with the annotated answer.
    metrics.context_entity_recall,  # measures the fraction of entities in the ground truth that also appear in the retrieved context.
    metrics.context_precision,      # evaluates whether the ground-truth relevant items in the contexts are ranked at the top.
    metrics.answer_similarity,      # measures the semantic similarity between the generated answer and the ground_truth.
    metrics.answer_relevancy,       # assesses how pertinent the generated answer is to the given prompt.
    metrics.answer_correctness,     # measures the correctness of the generated answer based on the ground_truth.
    metrics.faithfulness,           # measures the factual consistency of the generated answer against the given context.
]
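# A minimal sketch of how this metric list would be used (assuming an OpenAI API
# key is configured and `dataset` has the columns ragas expects: "question",
# "answer", "contexts", "ground_truth"):
#
#   from ragas import evaluate
#   result = evaluate(dataset, metrics=EVAL_METRICS)
#   print(result)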
def load_test_set():
    """Load the test set from disk and adapt it so it is fit for ragas."""
    testset_simple = load_from_disk(constants.TEST_SET_PATH_SIMPLE)
    testset_multi_context = load_from_disk(constants.TEST_SET_PATH_MULTI)
    testset_reasoning = load_from_disk(constants.TEST_SET_PATH_REASONING)
    testset = concatenate_datasets([testset_simple, testset_multi_context, testset_reasoning])
    testset = testset.rename_column("contexts", "origin_contexts")
    return testset
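# Example usage (a sketch; the rename keeps the generation-time contexts around
# as "origin_contexts", so retrieved contexts can later be stored in "contexts"):
#
#   testset = load_test_set()
#   print(testset.column_names)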
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )
def count_tokens(text: str, model='openai') -> int:
    if model == 'openai':
        # cl100k_base is the encoding used by OpenAI's GPT-3.5-turbo and GPT-4 models
        encoding = tiktoken.get_encoding("cl100k_base")
        # Encode the text to get the tokens
        tokens = encoding.encode(text)
    else:
        raise NotImplementedError(f"Model {model} not implemented, currently only 'openai' is supported")
    # Return the number of tokens
    return len(tokens)
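# Example usage (a sketch):
#
#   n_tokens = count_tokens("How many tokens does this sentence have?")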
def load_chroma_db_from_disk(persist_directory: str, embedding_function: Embeddings = OpenAIEmbeddings()):
    # Load a persisted Chroma index from disk, using the given embedding function
    return Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
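# Example usage (a sketch; "chroma_db/" is a hypothetical persist directory, and
# the embedding function must match the one used when the index was built):
#
#   db = load_chroma_db_from_disk("chroma_db/", OpenAIEmbeddings())
#   retriever = db.as_retriever()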
def load_epub_documents(directory: str) -> List[Document]:
    book_dir_loader = DirectoryLoader(
        directory,
        glob="**/*.epub",
        show_progress=True,
        loader_cls=UnstructuredEPubLoader,
    )
    docs = book_dir_loader.load()
    return docs
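# Example usage (a sketch; "data/books" is a hypothetical directory, and EPUB
# parsing via unstructured requires pandoc to be installed):
#
#   docs = load_epub_documents("data/books")
#   pretty_print_docs(docs[:2])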
def plot_histogram(data: list[int], title: str, xlabel: str, ylabel: str, bins: int = 30):
    plt.hist(data, bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()
def plot_chunk_dist(chunk_lengths: list[int], chunks_name: str, bins: int = 30, count_of: str = 'characters'):
    plt.hist(chunk_lengths, bins=bins)
    plt.title(f"Distribution of the number of {count_of} in the {chunks_name}")
    plt.xlabel(f"number of {count_of}")
    plt.ylabel(f"number of {chunks_name}")
    plt.show()
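# Example usage (a sketch, assuming `chunks` holds Documents produced by one of
# the splitters below):
#
#   plot_chunk_dist([count_tokens(c.page_content) for c in chunks],
#                   "recursive chunks", count_of="tokens")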
def get_pagecontents(documents: list):
    return [x.page_content for x in documents]
def plot_result(result):
    # getting the metric names and values (avoid shadowing the imported `metrics` module)
    metric_names = list(result.keys())
    values = list(result.values())
    # creating the bar plot
    plt.barh(metric_names, values)
    plt.xlim(0, 1)
    plt.xlabel("metric values")
    plt.ylabel("metrics")
    plt.title("Results of the evaluation")
    plt.show()
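# Example usage (a sketch, assuming the ragas evaluation result behaves like a
# mapping from metric name to score, which is what plot_result expects):
#
#   from ragas import evaluate
#   scores = evaluate(eval_dataset, metrics=EVAL_METRICS)
#   plot_result(scores)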
def get_fixed_size_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    # separator="" splits purely on character count, yielding fixed-size chunks
    text_splitter = CharacterTextSplitter(
        separator="",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)
def get_recursive_chunks(docs: list[Document], chunk_size: int, chunk_overlap: int = 200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)
def get_semantic_chunks(docs: list[Document], embeddings, breakpoint_threshold_amount: float, breakpoint_threshold_type: str = 'percentile'):
    text_splitter = SemanticChunker(
        embeddings,
        add_start_index=True,
        breakpoint_threshold_type=breakpoint_threshold_type,
        breakpoint_threshold_amount=breakpoint_threshold_amount,
    )
    return text_splitter.split_documents(docs)
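# Example usage (a sketch comparing the three chunking strategies; the chunk
# size and the 95th-percentile breakpoint are illustrative values, not tuned ones):
#
#   docs = load_epub_documents("data/books")
#   fixed_chunks = get_fixed_size_chunks(docs, chunk_size=1000)
#   recursive_chunks = get_recursive_chunks(docs, chunk_size=1000)
#   semantic_chunks = get_semantic_chunks(docs, OpenAIEmbeddings(),
#                                         breakpoint_threshold_amount=95)
#   plot_chunk_dist([len(t) for t in get_pagecontents(recursive_chunks)],
#                   "recursive chunks")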