Spaces:

yixin6178
/

ChatPaper

Build error

App Files Files Community

ChatPaper / chat_pdf.py

yiyixin

upload

28c2a3d over 1 year ago

raw

history blame

4.84 kB


	class ChatPDF:
	    """Chat with a PDF file through a chat bot.

	    For every question, the PDF's paragraphs are ranked against the question
	    using ``embedding_model`` and ``similarity_metric``. The best-ranked
	    paragraphs that fit into the bot's remaining token budget are grouped by
	    section and appended to the system prompt before the bot is queried.
	    """

	    def __init__(
	        self,
	        pdf,
	        bot,
	        embedding_model,
	        similarity_metric,
	        expect_answer_token_length_max=100,
	        expect_q_token_length_max=100,
	        user_stamp=None,
	    ) -> None:
	        """Store the collaborators and pre-compute the context token budget.

	        Args:
	            pdf: Parsed PDF object. This class reads ``metadata['title']``,
	                ``metadata['abstract']``, ``flattn_paragraphs``,
	                ``section_names``, ``section_names_with_page_index`` and
	                ``content2section`` from it.
	            bot: Chat backend. This class uses ``encode_length(text)``,
	                ``max_tokens``, ``overhead_token`` and
	                ``query(context=..., questions=..., convo_id=...)``.
	            embedding_model: Callable that maps one string to its embedding.
	            similarity_metric: Callable ``(query_embedding, embeddings)``.
	                Its result is sorted in ascending order to rank the strings,
	                so it is presumably a distance (smaller = closer) --
	                TODO confirm against the caller.
	            expect_answer_token_length_max: Tokens reserved for the answer.
	            expect_q_token_length_max: Tokens reserved for the question.
	            user_stamp: Forwarded to ``bot.query`` as ``convo_id``.
	        """
	        self.pdf = pdf
	        self.bot = bot
	        self.embedding_model = embedding_model
	        self.similarity_metric = similarity_metric
	        self.user_stamp = user_stamp

	        self.system_task_prompt = f"You are a helpful PDF file. Your task is to provide information and answer any questions related to the topic of {self.pdf.metadata['title']}. You should use the sections of the PDF as your source of information and try to provide concise and accurate answers to any questions asked by the user. If you are unable to find relevant information in the given sections, you will need to let the user know that the source does not contain relevant information but still try to provide an answer based on your general knowledge. You must refer to the corresponding section name and page that you refer to when answering. The following is the related information about the PDF file that will help you answer users' questions:\n\n"
	        self.system_information_prompt = (
	            "Title:\n" + self.pdf.metadata['title']
	            + "\n\nAbstract:\n" + self.pdf.metadata["abstract"]
	            + "\n\nFiltered paragraphs from each sections (the section titles are enclosed in asterisks**):\n\n"
	        )

	        self.system_token_length = (
	            self.bot.encode_length(self.system_task_prompt)
	            + self.bot.encode_length(self.system_information_prompt)
	        )
	        self.expect_answer_token_length_max = expect_answer_token_length_max
	        self.expect_q_token_length_max = expect_q_token_length_max
	        # Whatever is left after the fixed prompts, the bot's overhead and the
	        # reserved question/answer space may be spent on PDF paragraphs.
	        self.context_max_length = (
	            self.bot.max_tokens
	            - self.system_token_length
	            - self.bot.overhead_token
	            - self.expect_answer_token_length_max
	            - self.expect_q_token_length_max
	        )

	    def _get_related_context(self, user_query):
	        """Return the best-ranked paragraphs that fit the budget, grouped by section.

	        Args:
	            user_query: The question the paragraphs are ranked against.

	        Returns:
	            A string with one ``**<section heading>**:`` entry per section that
	            received at least one paragraph, in ``pdf.section_names`` order.
	            It is empty when no paragraph fits.
	        """
	        candidates = [user_query] + self.pdf.flattn_paragraphs
	        # Index 0 is the query itself; it must not be offered as context.
	        ranked = [idx for idx in self.rank_indices(candidates, 0) if idx != 0]

	        headings = self.pdf.section_names_with_page_index
	        # Charge the section headings against the budget before any paragraph.
	        used_tokens = self.bot.encode_length(":\n\n".join(headings))
	        paragraphs_by_section = {name: [] for name in self.pdf.section_names}

	        for idx in ranked:
	            paragraph = candidates[idx]
	            cost = self.bot.encode_length(paragraph)
	            if used_tokens + cost >= self.context_max_length:
	                # Stop at the first paragraph that does not fit; lower-ranked
	                # paragraphs are not tried.
	                break
	            used_tokens += cost
	            section = self.pdf.content2section[paragraph]
	            paragraphs_by_section[section].append(paragraph)

	        parts = []
	        for i, section_name in enumerate(self.pdf.section_names):
	            paragraphs = paragraphs_by_section[section_name]
	            if paragraphs:
	                # The system prompt announces that section titles are enclosed
	                # in asterisks, so the heading has to carry them.
	                parts.append(
	                    "**" + headings[i] + "**" + ":\n"
	                    + "\n".join(paragraphs) + "\n\n"
	                )
	        return "".join(parts)

	    def chat(self, user_query):
	        """Chat with the PDF file.

	        Builds the system context for ``user_query``, prints it to stdout, and
	        returns whatever ``bot.query`` returns.
	        """
	        context_data = self._get_related_context(user_query)
	        dynamic_system_context = (
	            self.system_task_prompt
	            + self.system_information_prompt
	            + context_data
	        )
	        print(
	            "********** Start of Composed Context **********\n",
	            dynamic_system_context,
	            "\n********** End of Composed Context **********\n"
	        )
	        return self.bot.query(
	            context=dynamic_system_context,
	            questions=user_query,
	            convo_id=self.user_stamp,
	        )

	    def rank_indices(
	        self,
	        strings: list[str],
	        index_of_source_string: int,
	    ) -> list[int]:
	        """Rank the indices of the strings in the list based on their similarity to the source string.

	        Args:
	            strings: All strings to rank, including the source string.
	            index_of_source_string: Position of the source string in ``strings``.

	        Returns:
	            The indices of ``strings`` as plain Python ints, ordered by
	            ascending ``similarity_metric`` value.
	        """
	        # Kept local: no file-level import block is visible to extend.
	        import numpy as np

	        # NOTE: every call embeds every string again, the source included.
	        embeddings = [self.embedding_model(string) for string in strings]
	        query_embedding = embeddings[index_of_source_string]
	        # One value per string, measured against the source embedding.
	        distances = self.similarity_metric(query_embedding, embeddings)
	        # tolist() makes the result the list[int] the signature promises
	        # instead of an ndarray of NumPy integers.
	        return np.argsort(distances).tolist()