Spaces:
Running
Running
File size: 9,569 Bytes
a2ee974 5dd704c 51e1c58 a2ee974 5dd704c a2ee974 51e1c58 a2ee974 5dd704c a2ee974 f966467 a2ee974 5dd704c 51e1c58 5dd704c 51e1c58 26c225d 566bba1 a2ee974 5dd704c a2ee974 5dd704c 9db2841 5dd704c 9db2841 41f1164 9db2841 5e4d2de f88d5c8 7e729ae bd5a0eb f88d5c8 9db2841 bd5a0eb 9db2841 5dd704c 24add6f a2ee974 566bba1 6f82717 566bba1 a2ee974 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
# Created by Leandro Carneiro at 19/01/2024
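# Description: Builds a Chroma vector store from local .txt files and uses a
#              LangChain conversational retrieval chain to draft news articles.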
# ------------------------------------------------
#from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_together.embeddings import TogetherEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_together import Together
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import os
import csv
import time
def read_csv_to_dict(filename):
    """Read a two-column, semicolon-separated file into a dictionary.

    Each non-blank line must have the form ``key;value``. The file is parsed
    with ``;`` as the CSV delimiter, so commas inside a value (common in
    URLs) are kept intact instead of being treated as field separators.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        dict: Mapping of the first column to the second column. When a key
        appears more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
        OSError: If the file cannot be opened.
    """
    data_dict = {}
    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation recommends.
    with open(filename, mode='r', encoding='utf-8', newline='') as file:
        csv_reader = csv.reader(file, delimiter=';')
        for row in csv_reader:
            # Blank (or whitespace-only) lines carry no mapping; skip them
            # rather than failing on a missing first field.
            if not any(field.strip() for field in row):
                continue
            if len(row) != 2:
                raise ValueError(
                    f"{filename}, line {csv_reader.line_num}: expected "
                    f"'key;value', got {len(row)} field(s)"
                )
            key, value = row
            data_dict[key] = value
    return data_dict
def generate_embeddings_and_vectorstore(path, model):
    """Load the .txt files under *path*, chunk them and index them in Chroma.

    Every chunk gets a ``link`` metadata entry looked up by the base name of
    its source file in ``./local_base/filename_url.csv``; files that are not
    listed there get ``None``.

    Args:
        path: Directory searched recursively (``**/*.txt``) for documents.
        model: ``'openai'`` selects OpenAI embeddings (API key read from the
            ``OPENAI_KEY`` environment variable); any other value selects
            Together embeddings (``TOGETHER_KEY``).

    Returns:
        The populated Chroma vector store on success. On any failure the
        error message is printed and returned as a ``str``; exceptions are
        not propagated to the caller.
    """
    try:
        loader = DirectoryLoader(path=path, glob="**/*.txt")
        corpus = loader.load()
        print(f' Total de documentos antes do text_split = {len(corpus)}')

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=400)
        docs = text_splitter.split_documents(corpus)
        print(f" Total de chunks depois do text_split = {len(docs)}")

        # Without chunks there is nothing to average or to index; report that
        # explicitly instead of surfacing a bare "division by zero".
        if not docs:
            message = f'No text chunks were produced from the documents under {path!r}'
            print(message)
            return message

        num_total_characters = sum(len(x.page_content) for x in docs)
        print(f" Média de caracteres por chunk = {num_total_characters / len(docs):,.0f}")

        dict_filename_url = read_csv_to_dict('./local_base/filename_url.csv')
        for doc in docs:
            filename = os.path.basename(doc.metadata["source"])
            doc.metadata["link"] = dict_filename_url.get(filename)

        if model == 'openai':
            fc_embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_KEY'])
            vectorstore = Chroma.from_documents(docs, fc_embeddings)
        else:
            fc_embeddings = TogetherEmbeddings(model='togethercomputer/m2-bert-80M-8k-retrieval',
                                               together_api_key=os.environ['TOGETHER_KEY'])
            # Chunks are embedded one at a time with a pause after each
            # request. The store is created once and then extended, so the
            # result does not depend on separate Chroma instances happening
            # to share the same in-memory collection.
            vectorstore = None
            for doc in docs:
                if vectorstore is None:
                    vectorstore = Chroma.from_documents(documents=[doc], embedding=fc_embeddings)
                else:
                    vectorstore.add_documents([doc])
                time.sleep(1.1)

        print('total de docs no vectorstore=', len(vectorstore.get()['documents']))
        return vectorstore
    except Exception as e:
        # Callers receive the error text instead of an exception.
        print(str(e))
        return str(e)
class Rag:
    """Generate Portuguese news articles from a vector store with a RAG chain.

    The instance wires a retriever over the given vector store, a
    conversation memory and an LLM into a ``ConversationalRetrievalChain``.
    ``model == 'openai'`` uses ``ChatOpenAI``; any other value uses the
    Together-hosted Mixtral model. API keys are read from the ``OPENAI_KEY``
    and ``TOGETHER_KEY`` environment variables respectively.
    """

    # Prompt used with the OpenAI chat model.
    _OPENAI_PROMPT_TEMPLATE = """Your task is to create news for a newspaper based on pieces of text delimited by <> and a question delimited by <>.
Do not use only your knowledge to make the news. Make the news based on the question, but using the pieces of text.
If the pieces of text don't enough information about the question to create the news, just say that you need more sources of information, nothing more.
The news should have a title.
The news should be written in a formal language.
The news should have between {min_words} and {max_words} words and it should be in Portuguese language.
The news should be about the following context: <{context}>
Question: <{question}>
Answer here:"""

    # Prompt used with the Together-hosted model.
    _TOGETHER_PROMPT_TEMPLATE = """Your task is to create news for a newspaper based on pieces of text delimited by <> and a question delimited by <>.
The news should be written in Portuguese language.
Do not use only your knowledge to make the news. Make the news based on the question, but using the pieces of text.
If the pieces of text don't enough information about the question to create the news, just say that you need more sources of information, nothing more.
The news should have a title.
The news should be written in a formal language.
The news should have between {min_words} and {max_words} words.
The source should not be shown in the news.
The total of words should no be shown in the news.
The news should be written in Portuguese language.
Answer the title and the news, nothing more.
The news should be about the following context: <{context}>
Question: <{question}>
Answer here: """

    def __init__(self, vectorstore, min_words, max_words, model):
        """Build the prompt, the LLM and the retrieval chain.

        Args:
            vectorstore: Vector store that supplies the retriever; its
                collection is deleted after each ``generate_text`` call.
            min_words: Lower bound of the article length given to the prompt.
            max_words: Upper bound of the article length; also used to size
                the completion token limit. Must be convertible with ``int``.
            model: ``'openai'`` for ChatOpenAI, anything else for Together.
        """
        self.text = None
        self.vectorstore = vectorstore
        self.memory = ConversationBufferMemory(memory_key="chat_history",
                                               return_messages=True,
                                               output_key="answer")

        # Maximum number of tokens for the answer: 1.5 tokens per requested word.
        max_tokens = int(int(max_words) + (int(max_words) / 2))

        if model == 'openai':
            prompt_template = self._OPENAI_PROMPT_TEMPLATE
            llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125",
                             temperature=0,
                             openai_api_key=os.environ['OPENAI_KEY'],
                             max_tokens=max_tokens)
        else:
            prompt_template = self._TOGETHER_PROMPT_TEMPLATE
            llm = Together(model="mistralai/Mixtral-8x7B-Instruct-v0.1",
                           temperature=0,
                           together_api_key=os.environ['TOGETHER_KEY'],
                           max_tokens=max_tokens)

        self.prompt = PromptTemplate(template=prompt_template,
                                     input_variables=["context", "question"],
                                     partial_variables={"min_words": min_words,
                                                        "max_words": max_words})

        # Both back ends share the same chain configuration. The default
        # retriever is used; other chain types are map_reduce, refine and
        # map_rerank.
        self.qa = ConversationalRetrievalChain.from_llm(
            llm=llm,
            memory=self.memory,
            retriever=vectorstore.as_retriever(),
            combine_docs_chain_kwargs={"prompt": self.prompt},
            chain_type="stuff",
            return_source_documents=True,
        )

    def generate_text(self, subject):
        """Generate a news article about *subject* and list its sources.

        The vector store collection is deleted after every call, whether the
        generation succeeded or failed.

        Args:
            subject: Topic inserted into the Portuguese query sent to the chain.

        Returns:
            On success, a tuple ``(answer, sources)`` where ``sources`` is a
            newline-terminated, numbered list of the distinct ``link``
            metadata values of the retrieved documents, in retrieval order.
            Documents without a link are left out of the list. On failure,
            the error message as a plain ``str``.
        """
        try:
            query = f"Elabore uma nova notícia sobre {subject}."
            result_text = self.qa.invoke({"question": query})
            print('##### result', result_text)

            links = (doc.metadata.get('link') for doc in result_text["source_documents"])
            # dict.fromkeys removes duplicates while keeping first-seen order,
            # so the numbering is stable between runs.
            result_sources = dict.fromkeys(link for link in links if link)
            str_result_sources = ''.join(
                f'{position}) {link}\n'
                for position, link in enumerate(result_sources, start=1)
            )
            return (result_text["answer"], str_result_sources)
        except Exception as e:
            # Callers receive the error text instead of an exception.
            return str(e)
        finally:
            self.vectorstore.delete_collection()
|