Spaces:

leandrocarneiro
/

BotNews

Runtime error

App Files Files Community

BotNews / rag.py

leandrocarneiro

Update rag.py

26c225d verified about 1 year ago

raw

history blame contribute delete

9.57 kB

	# Created by Leandro Carneiro at 19/01/2024
	# Description:
	# ------------------------------------------------
	#from langchain.embeddings import OpenAIEmbeddings
	from langchain_openai import OpenAIEmbeddings
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_together.embeddings import TogetherEmbeddings
	from langchain_community.vectorstores import Chroma
	from langchain_community.document_loaders import DirectoryLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.prompts import PromptTemplate
	from langchain_openai import ChatOpenAI
	from langchain_together import Together
	from langchain.memory import ConversationBufferMemory
	from langchain.chains import ConversationalRetrievalChain
	import os
	import csv
	import time

	def read_csv_to_dict(filename):
	    """Read a two-column, semicolon-separated file into a dict.

	    Each non-empty line has the form ``key;value``. The file is parsed with
	    ';' as the delimiter, so commas inside the value (common in URLs) are
	    kept, and any additional ';' found in the value is preserved as well.

	    Args:
	        filename: Path of the UTF-8 encoded file to read.

	    Returns:
	        dict mapping the first field of each line to the rest of the line.
	        When a key appears more than once, the last occurrence wins.

	    Raises:
	        ValueError: If a non-empty line has no ';' separator.
	    """
	    data_dict = {}
	    # newline='' is what the csv module expects of the file object it reads.
	    with open(filename, mode='r', encoding='utf-8', newline='') as file:
	        csv_reader = csv.reader(file, delimiter=';')
	        for line_number, row in enumerate(csv_reader, start=1):
	            if not row:
	                # Blank line (e.g. trailing newline at end of file).
	                continue
	            if len(row) < 2:
	                raise ValueError(
	                    f"{filename}, line {line_number}: expected 'key;value', got {row[0]!r}"
	                )
	            key, *value_parts = row
	            # Re-join so a value that itself contains ';' is not truncated.
	            data_dict[key] = ';'.join(value_parts)
	    return data_dict

	def generate_embeddings_and_vectorstore(path, model):
	    """Load the .txt files under *path*, chunk them and index them in Chroma.

	    Every chunk gets a ``link`` metadata entry looked up by source file name
	    in ``./local_base/filename_url.csv`` (``None`` when the file is not
	    listed there).

	    Args:
	        path: Directory handed to ``DirectoryLoader``.
	        model: ``'openai'`` selects ``OpenAIEmbeddings`` (reads the
	            ``OPENAI_KEY`` environment variable); any other value selects
	            ``TogetherEmbeddings`` (reads ``TOGETHER_KEY``).

	    Returns:
	        The populated Chroma vector store. On any failure the exception
	        message is printed and returned as a ``str`` instead, so callers
	        have to check the type of the result.
	    """
	    try:
	        # Match .txt files at any depth below `path`, including `path` itself.
	        loader = DirectoryLoader(path=path, glob="**/*.txt")
	        corpus = loader.load()
	        print(f' Total de documentos antes do text_split = {len(corpus)}')

	        text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=400)
	        docs = text_splitter.split_documents(corpus)
	        if not docs:
	            # Fail with a meaningful message instead of a ZeroDivisionError
	            # in the average computed below.
	            raise ValueError(f'Nenhum documento .txt encontrado em {path}')
	        num_total_characters = sum(len(x.page_content) for x in docs)
	        print(f" Total de chunks depois do text_split = {len(docs)}")
	        print(f" Média de caracteres por chunk = {num_total_characters / len(docs):,.0f}")

	        dict_filename_url = read_csv_to_dict('./local_base/filename_url.csv')
	        for doc in docs:
	            filename = os.path.basename(doc.metadata["source"])
	            doc.metadata["link"] = dict_filename_url.get(filename)

	        if model == 'openai':
	            fc_embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_KEY'])
	            vectorstore = Chroma.from_documents(docs, fc_embeddings)
	        else:
	            fc_embeddings = TogetherEmbeddings(model='togethercomputer/m2-bert-80M-8k-retrieval',
	                                               together_api_key=os.environ['TOGETHER_KEY'])
	            # Chunks are embedded one at a time with a pause between calls.
	            # NOTE(review): presumably this throttles requests to the
	            # Together API - confirm the rate limit before changing it.
	            vectorstore = None
	            for doc in docs:
	                if vectorstore is None:
	                    vectorstore = Chroma.from_documents(documents=[doc], embedding=fc_embeddings)
	                else:
	                    # Append to the store created above rather than building
	                    # a new store object for every chunk.
	                    vectorstore.add_documents([doc])
	                time.sleep(1.1)
	        print('total de docs no vectorstore=', len(vectorstore.get()['documents']))

	        return vectorstore
	    except Exception as e:
	        print(str(e))
	        return str(e)

	class Rag:
	    """Conversational retrieval chain that writes a news article in Portuguese.

	    The article is generated from the chunks retrieved from ``vectorstore``.
	    The vector store collection is deleted after each call to
	    ``generate_text``, so an instance serves a single generation.
	    """

	    def __init__(self, vectorstore, min_words, max_words, model):
	        """Build the prompt, the LLM and the retrieval chain.

	        Args:
	            vectorstore: Vector store exposing ``as_retriever()`` and
	                ``delete_collection()``.
	            min_words: Lower bound on the article length, injected in the prompt.
	            max_words: Upper bound on the article length; also sizes the
	                completion token limit. Must be convertible with ``int()``.
	            model: ``'openai'`` uses ``ChatOpenAI`` (``OPENAI_KEY`` environment
	                variable); any other value uses ``Together``
	                (``TOGETHER_KEY`` environment variable).
	        """
	        self.text = None
	        self.vectorstore = vectorstore
	        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")

	        # Maximum number of tokens for the answer: 1.5 tokens per requested word.
	        max_tokens = int(int(max_words) + (int(max_words) / 2))

	        if model == 'openai':
	            prompt_template = """Your task is to create news for a newspaper based on pieces of text delimited by <> and a question delimited by <>.
	Do not use only your knowledge to make the news. Make the news based on the question, but using the pieces of text.
	If the pieces of text don't enough information about the question to create the news, just say that you need more sources of information, nothing more.
	The news should have a title.
	The news should be written in a formal language.
	The news should have between {min_words} and {max_words} words and it should be in Portuguese language.
	The news should be about the following context: <{context}>
	Question: <{question}>
	Answer here:"""
	            llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125",
	                             temperature=0,
	                             openai_api_key=os.environ['OPENAI_KEY'],
	                             max_tokens=max_tokens)
	        else:
	            prompt_template = """Your task is to create news for a newspaper based on pieces of text delimited by <> and a question delimited by <>.
	The news should be written in Portuguese language.
	Do not use only your knowledge to make the news. Make the news based on the question, but using the pieces of text.
	If the pieces of text don't enough information about the question to create the news, just say that you need more sources of information, nothing more.
	The news should have a title.
	The news should be written in a formal language.
	The news should have between {min_words} and {max_words} words.
	The source should not be shown in the news.
	The total of words should no be shown in the news.
	The news should be written in Portuguese language.
	Answer the title and the news, nothing more.
	The news should be about the following context: <{context}>
	Question: <{question}>
	Answer here: """
	            llm = Together(model="mistralai/Mixtral-8x7B-Instruct-v0.1",
	                           temperature=0,
	                           together_api_key=os.environ['TOGETHER_KEY'],
	                           max_tokens=max_tokens)

	        self.prompt = PromptTemplate(template=prompt_template,
	                                     input_variables=["context", "question"],
	                                     partial_variables={"min_words": min_words, "max_words": max_words})

	        # Both models share the same chain configuration: default retriever,
	        # all retrieved chunks stuffed into a single prompt, and the source
	        # documents returned so generate_text can list their links.
	        self.qa = ConversationalRetrievalChain.from_llm(
	            llm=llm,
	            memory=self.memory,
	            retriever=vectorstore.as_retriever(),
	            combine_docs_chain_kwargs={"prompt": self.prompt},
	            chain_type="stuff",
	            return_source_documents=True,
	        )

	    def generate_text(self, subject):
	        """Generate a news article about *subject* and list its sources.

	        Args:
	            subject: Topic inserted into the Portuguese query sent to the chain.

	        Returns:
	            On success, a tuple ``(answer, sources)`` where ``sources`` is a
	            numbered list, one ``"N) link"`` line per distinct source link,
	            in the order the documents were retrieved. Documents without a
	            link are omitted. On failure, the exception message as a ``str``.

	        The vector store collection is deleted in both cases.
	        """
	        try:
	            query = f"Elabore uma nova notícia sobre {subject}."
	            result_text = self.qa.invoke({"question": query})
	            print('##### result', result_text)

	            links = (doc.metadata.get('link') for doc in result_text["source_documents"])
	            # dict.fromkeys de-duplicates while keeping retrieval order, so
	            # the numbering is stable between runs (a set is not ordered).
	            result_sources = list(dict.fromkeys(link for link in links if link))
	            str_result_sources = ''.join(
	                f'{position}) {link}\n' for position, link in enumerate(result_sources, start=1)
	            )

	            return (result_text["answer"], str_result_sources)
	        except Exception as e:
	            return str(e)
	        finally:
	            # Single cleanup point for both the success and the error path.
	            self.vectorstore.delete_collection()