Spaces:
Runtime error
Runtime error
leandrocarneiro
committed on
Commit
•
a2ee974
1
Parent(s):
bc76af9
Upload 7 files
Browse files- app.py +66 -0
- constants.py +9 -0
- llm.py +44 -0
- main.py +56 -0
- rag.py +104 -0
- requirements.txt +9 -0
- search_engine.py +65 -0
app.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Created by Leandro Carneiro at 22/01/2024
|
2 |
+
# Description:
|
3 |
+
# ------------------------------------------------
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
|
7 |
+
import main
|
8 |
+
|
9 |
+
# Define two separate functions for each button
|
10 |
+
def call_generate_news(subject, sites, min_words, max_words):
    """Validate the form inputs and generate a news article.

    All parameters arrive from Gradio textboxes, so every value is a string.
    Always returns a single string — either an error message (prefixed with
    'Erro:') or the generated article — because the button is wired to a
    single output component; returning a tuple here would break Gradio's
    output mapping.
    """
    if subject == '':
        return 'Erro: Assunto não informado'
    if min_words == '':
        return 'Erro: Mínimo de palavras não informado'
    if max_words == '':
        return 'Erro: Máximo de palavras não informado'
    if sites == '':
        return 'Erro: Domínios para pesquisa não informados'

    # Compare word limits numerically, not lexicographically: as strings,
    # '1000' < '300' would be True and valid inputs would be rejected.
    try:
        min_count = int(min_words)
        max_count = int(max_words)
    except ValueError:
        return 'Erro: Mínimo e máximo de palavras devem ser números inteiros'
    if max_count < min_count:
        return 'Erro: Máximo de palavras menor que o mínimo de palavras'

    # One search domain per textbox line.
    list_sites = sites.split('\n')
    return main.generate_news(subject, min_count, max_count, list_sites)
|
26 |
+
|
27 |
+
def call_invoke_llm(context, prompt):
    """Forward the generated article and the user's prompt to the LLM backend."""
    return main.call_llm(context, prompt)
|
30 |
+
|
31 |
+
# Create the Gradio interface using Blocks.
# Layout: a news-generation section (subject + domain list + word limits ->
# generated article) followed by a follow-up prompt section that operates on
# the article produced by the first section.
with gr.Blocks(title='BotNews') as page:
    gr.Markdown("# BotNews")
    gr.Markdown("## Gerar Notícia por IA")
    gr.Markdown(" ")
    gr.Markdown("Instrução: Preencha abaixo com um assunto (Por exemplo: 'Guerra em Israel' ou 'Economia do Brasil').")
    gr.Markdown(" ")
    with gr.Row():
        input1 = gr.Textbox(label="Assunto:", lines=1)
    with gr.Row():
        # One search domain per line; defaults cover a few Brazilian outlets.
        input2 = gr.Textbox(label="Domínios para pesquisa (coloque um domínio por linha):",
                            lines=10, value='https://www.cnnbrasil.com.br/\nhttps://g1.globo.com/\nhttps://www.metropoles.com/\nhttps://www.bbc.com/portuguese/\nhttps://www.instagram.com/')
    with gr.Row():
        input3 = gr.Textbox(label="Mínimo de palavras:", lines=1, value="300")
        input4 = gr.Textbox(label="Máximo de palavras:", lines=1, value="700")
    with gr.Row():
        button1 = gr.Button("Gerar notícia")
    with gr.Row():
        output1 = gr.Textbox(label="Notícia gerada por IA", lines=25)
    gr.Markdown("<hr>")
    gr.Markdown("## Prompt para a Notícia")
    gr.Markdown(" ")
    gr.Markdown("Instrução: Preencha abaixo com um comando para ser executado sobre a notícia (Por exemplo: 'Resuma em tópicos' ou 'Adicione um tom sarcástico').")
    gr.Markdown(" ")
    with gr.Row():
        input5 = gr.Textbox(label="Prompt para a notícia")
    with gr.Row():
        button2 = gr.Button("Gerar resposta")
    with gr.Row():
        output2 = gr.Textbox(label="Resposta gerada por IA", lines=25)

    # Wire the buttons. Note that the second handler consumes the first
    # section's output textbox (output1) as its context input.
    button1.click(call_generate_news, inputs=[input1, input2, input3, input4], outputs=[output1])
    button2.click(call_invoke_llm, inputs=[output1, input5], outputs=[output2])

# Launch the interface.
# share=True additionally publishes a temporary public Gradio link besides
# the local server.
page.launch(share=True)
|
constants.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Created by Leandro Carneiro at 19/01/2024
|
2 |
+
# Description:
|
3 |
+
# ------------------------------------------------
|
4 |
+
|
5 |
+
#subject = 'Guerra entre Irã e Paquistão'

#sites = ['https://www.cnnbrasil.com.br/']#, 'https://g1.globo.com/', 'https://www.metropoles.com/']
# Number of Google search results to retrieve per searched domain.
num_sites = 1#5
# Directory where downloaded articles and the filename->URL map are stored.
local_base = 'local_base'
|
llm.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Created by Leandro Carneiro at 19/01/2024
|
2 |
+
# Description:
|
3 |
+
# ------------------------------------------------
|
4 |
+
from langchain_openai import ChatOpenAI
|
5 |
+
|
6 |
+
import api_key
|
7 |
+
|
8 |
+
def invoke_llm(context, task):
    """Execute *task* over *context* with GPT-3.5 and return the answer text.

    The prompt pins the model to the supplied context (delimited by $$$) and
    instructs it to answer formally, in Portuguese.
    """
    instructions = f"""You are an assistant of a newspaper.
Do not make up any information, execute the task just based on the given context.
The task is delimited by ### and the context is delimited by $$$
Write in a formal language and in portuguese language.
Execute the task just based on the given context.
Your task is: ###{task}###
The context is: $$${context}$$$
"""

    chat_model = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=0,
        openai_api_key=api_key.OPENAI_KEY,
        max_tokens=1000,
    )
    response = chat_model.invoke(instructions)
    return response.content
|
24 |
+
|
25 |
+
|
26 |
+
|
27 |
+
|
28 |
+
# def generate_topics(text):
|
29 |
+
# prompt = f"""You are an assistant of a newspaper.
|
30 |
+
# Your task is to extract relevant topics of a news and build a concise road map.
|
31 |
+
# Do not make up any information, create the road map just based on the given information.
|
32 |
+
# The road map should be written in a formal language and in portuguese language.
|
33 |
+
# Answer just the information, do not put any title.
|
34 |
+
# The road map should be about the following context: {text}
|
35 |
+
# """
|
36 |
+
#
|
37 |
+
# llm=ChatOpenAI(model_name="gpt-3.5-turbo",
|
38 |
+
# temperature=0,
|
39 |
+
# openai_api_key=api_key.OPENAI_KEY,
|
40 |
+
# max_tokens=1000)
|
41 |
+
# result = llm.invoke(prompt)
|
42 |
+
# return result.content
|
43 |
+
|
44 |
+
|
main.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Created by Leandro Carneiro at 19/01/2024
|
2 |
+
# Description:
|
3 |
+
# ------------------------------------------------
|
4 |
+
|
5 |
+
import search_engine
|
6 |
+
import rag
|
7 |
+
import constants
|
8 |
+
import llm
|
9 |
+
|
10 |
+
|
11 |
+
def generate_news(subject, min_words, max_words, sites):
    """Run the full pipeline: search, download, index and write the article.

    Steps: Google-search the subject on each domain, download the pages,
    persist them under the local base directory, build the vectorstore, and
    run the RAG chain. The helper functions signal failure by returning a
    string (the error message) instead of their normal result, so each stage
    is checked with isinstance before continuing.

    Returns the generated article followed by its sources, or a string
    starting with 'Erro: ' on failure.
    """
    print('\n\n' + '*' * 50)
    print('\n\nInício do Programa: \n')

    print('\nBuscando sites relevantes...')
    retrieved_sites = search_engine.search_google(subject, sites)
    # isinstance is the idiomatic type check (type(x) == str breaks for
    # subclasses and is flagged by linters).
    if isinstance(retrieved_sites, str):
        return 'Erro: ' + retrieved_sites

    print('\nBaixando as notícias...')
    retrieved_text_from_sites = search_engine.retrieve_text_from_site(retrieved_sites)
    if isinstance(retrieved_text_from_sites, str):
        return 'Erro: ' + retrieved_text_from_sites

    print('\nSalvando as notícias em base local...')
    # delete_base/save_on_base return 0 on success, an error string otherwise.
    ret = search_engine.delete_base(constants.local_base)
    if ret != 0:
        return 'Erro: ' + ret
    ret = search_engine.save_on_base(retrieved_sites, retrieved_text_from_sites, constants.local_base)
    if ret != 0:
        return 'Erro: ' + ret

    print('\nGerando embeddings e vectorstore...')
    vectorstore = rag.generate_embeddings_and_vectorstore(constants.local_base)
    if isinstance(vectorstore, str):
        return 'Erro: ' + vectorstore

    print('\nGerando a notícia (RAG)...')
    print(' Assunto: ' + subject)
    obj_rag = rag.Rag(vectorstore, min_words, max_words)
    result_news = obj_rag.generate_text(subject)

    print('\n\n' + '*' * 50 + '\n\n')
    print(result_news[0])

    print('\n\nFontes: ')
    print(result_news[1])

    return result_news[0] + '\n\n' + 'Fontes: ' + '\n' + result_news[1]
|
50 |
+
|
51 |
+
|
52 |
+
def call_llm(context, prompt):
    """Run the user's follow-up prompt against the generated article text."""
    print('\nChamando o modelo de linguagem...')
    return llm.invoke_llm(context, prompt)
|
rag.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Created by Leandro Carneiro at 19/01/2024
|
2 |
+
# Description:
|
3 |
+
# ------------------------------------------------
|
4 |
+
#from langchain.embeddings import OpenAIEmbeddings
|
5 |
+
from langchain_openai import OpenAIEmbeddings
|
6 |
+
from langchain_community.vectorstores import Chroma
|
7 |
+
from langchain_community.document_loaders import DirectoryLoader
|
8 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
+
from langchain.prompts import PromptTemplate
|
10 |
+
from langchain_openai import ChatOpenAI
|
11 |
+
from langchain.memory import ConversationBufferMemory
|
12 |
+
from langchain.chains import ConversationalRetrievalChain
|
13 |
+
import os
|
14 |
+
import csv
|
15 |
+
|
16 |
+
import api_key
|
17 |
+
|
18 |
+
def read_csv_to_dict(filename):
    """Load the filename->URL map written by search_engine.save_on_base.

    Each line has the form '<filename>;<url>'. Lines are parsed directly on
    the first ';' instead of with csv.reader: the original comma-dialect
    reader split URLs containing commas into multiple fields, silently
    truncating the stored link.

    Returns a dict mapping news file names to their source URLs.
    """
    data_dict = {}
    with open(filename, mode='r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate trailing blank lines
            # Split on the first ';' only, so ';' inside the URL survives.
            key, _, value = line.partition(';')
            data_dict[key] = value
    return data_dict
|
26 |
+
|
27 |
+
def generate_embeddings_and_vectorstore(path):
    """Build a Chroma vectorstore from the .txt files under *path*.

    Loads every text file, splits it into overlapping chunks, attaches the
    source URL (read from the filename->URL map saved alongside the texts)
    to each chunk's metadata, then embeds everything with OpenAI embeddings.

    Returns the vectorstore on success, or the exception message as a string
    on failure (callers detect errors by checking the return type).
    """
    try:
        loader = DirectoryLoader(path=path, glob="**/*.txt")
        corpus = loader.load()
        print(f' Total de documentos antes do text_split = {len(corpus)}')

        # 2000-char chunks with 400-char overlap keep paragraphs coherent
        # while staying well inside the embedding model's input limit.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=400)
        docs = text_splitter.split_documents(corpus)
        num_total_characters = sum(len(x.page_content) for x in docs)
        print(f" Total de chunks depois do text_split = {len(docs)}")
        print(f" Média de caracteres por chunk = {num_total_characters / len(docs):,.0f}")

        # Read the map from the same directory the documents were loaded
        # from — previously this was hard-coded to './local_base', silently
        # ignoring the *path* argument.
        dict_filename_url = read_csv_to_dict(os.path.join(path, 'filename_url.csv'))
        for doc in docs:
            filename = os.path.basename(doc.metadata["source"])
            doc.metadata["link"] = dict_filename_url.get(filename)

        openai_api_key = api_key.OPENAI_KEY
        fc_embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        vectorstore = Chroma.from_documents(docs, fc_embeddings)

        return vectorstore
    except Exception as e:
        print(str(e))
        return str(e)
|
55 |
+
|
56 |
+
class Rag:
    """Conversational RAG chain that writes news articles from a vectorstore."""

    def __init__(self, vectorstore, min_words, max_words):
        """Configure the retrieval chain.

        vectorstore: Chroma store whose chunks carry a 'link' metadata entry.
        min_words / max_words: article length bounds injected into the prompt
        (accepted as str or int; max_tokens converts explicitly).
        """
        self.text = None
        self.vectorstore = vectorstore
        # output_key='answer' is required because the chain also returns
        # source documents alongside the answer.
        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")

        # Note: fixed the 'tittle' typo so the instruction is unambiguous.
        prompt_template = """Your task is to create news to a newspaper based on pieces of texts delimited by <> and a question delimited by <>.
Do not make up any information, create the news just based on the given information on the pieces of texts delimited by <>.
The news should have a title.
The news should be written in a formal language.
The news should have between {min_words} and {max_words} words and it should be in portuguese language.
The news should be about the following context: <{context}>
Question: <{question}>
Answer here:"""
        self.prompt = PromptTemplate(template=prompt_template,
                                     input_variables=["context", "question"],
                                     partial_variables={"min_words": min_words, "max_words": max_words})

        self.qa = ConversationalRetrievalChain.from_llm(
            llm=ChatOpenAI(model_name="gpt-3.5-turbo",
                           temperature=0.3,
                           openai_api_key=api_key.OPENAI_KEY,
                           # Response budget: max_words plus 50% headroom,
                           # since Portuguese averages more than one token
                           # per word.
                           max_tokens=int(int(max_words) + (int(max_words) / 2))),
            memory=self.memory,
            retriever=vectorstore.as_retriever(),  # search_kwargs={'k': 3}
            combine_docs_chain_kwargs={"prompt": self.prompt},
            chain_type="stuff",  # alternatives: map_reduce, refine, map_rerank
            return_source_documents=True,
        )

    def generate_text(self, subject):
        """Generate one article about *subject*.

        Returns a (article_text, sources) tuple where sources is a numbered,
        newline-separated list of the distinct URLs of the retrieved chunks.
        """
        query = f"Elabore uma nova notícia sobre {subject}."
        result_text = self.qa.invoke({"question": query})

        # De-duplicate the source links of the chunks used for the answer.
        result_sources = list({doc.metadata['link'] for doc in result_text["source_documents"]})
        str_result_sources = ''
        for i, link in enumerate(result_sources, start=1):
            str_result_sources += f'{i}) {link}' + '\n'

        return (result_text["answer"], str_result_sources)
|
98 |
+
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
|
103 |
+
|
104 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
googlesearch-python
|
2 |
+
langchain
|
3 |
+
langchain-community
|
4 |
+
langchain_openai
|
5 |
+
openai
|
6 |
+
unstructured
|
7 |
+
chromadb
|
8 |
+
tiktoken
|
9 |
+
gradio
|
search_engine.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Created by Leandro Carneiro at 19/01/2024
|
2 |
+
# Description:
|
3 |
+
# ------------------------------------------------
|
4 |
+
import os.path
|
5 |
+
import time
|
6 |
+
|
7 |
+
from googlesearch import search
|
8 |
+
import requests
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
|
11 |
+
import constants
|
12 |
+
|
13 |
+
|
14 |
+
def search_google(subject, sites):
    """Collect Google result URLs for *subject*, restricted to each domain.

    Returns a flat list of URLs (constants.num_sites per domain), or the
    exception message as a string on failure — callers detect errors by
    checking the return type.
    """
    try:
        found_urls = []
        for domain in sites:
            print(' Buscando notícias no domínio: ' + domain)
            query = f"{subject} site:{domain}"
            found_urls.extend(search(query, num_results=constants.num_sites))
            #time.sleep(3)
        print(' Total de sites encontrados: ' + str(len(found_urls)))

        return found_urls
    except Exception as e:
        print(str(e))
        return str(e)
|
31 |
+
|
32 |
+
def retrieve_text_from_site(sites):
    """Download each URL and return its visible page text.

    Returns a list of page texts in the same order as *sites*, or the
    exception message as a string if any download fails.
    """
    try:
        texts = []
        for url in sites:
            print(' Baixando texto do site: ' + url)
            response = requests.get(url)
            response.raise_for_status()
            texts.append(BeautifulSoup(response.content, 'html.parser').get_text())
        return texts
    except Exception as e:
        return str(e)
|
44 |
+
|
45 |
+
def delete_base(local_base):
    """Remove every file inside the local base directory.

    Only regular files are removed: os.remove raises on directories, so any
    subdirectory (e.g. a cache created by another tool) no longer aborts the
    whole cleanup.

    Returns 0 on success or the exception message as a string on failure,
    matching the error convention of the other helpers in this module.
    """
    try:
        for entry in os.listdir(local_base):
            file_path = os.path.join(local_base, entry)
            if os.path.isfile(file_path):
                os.remove(file_path)
        return 0
    except Exception as e:
        return str(e)
|
53 |
+
|
54 |
+
def save_on_base(sites, texts, local_base):
    """Persist each downloaded article and record its source URL.

    Writes one 'news<i>.txt' file per site and appends a '<filename>;<url>'
    line to filename_url.csv, which rag.read_csv_to_dict later uses to map
    chunks back to their sources. The index file is opened once for the whole
    loop (still append mode, so behaviour across repeated calls is unchanged)
    instead of being reopened per article.

    Returns 0 on success or the exception message as a string on failure.
    """
    try:
        with open(os.path.join(local_base, 'filename_url.csv'), 'a', encoding='utf-8') as index_file:
            for i, (site, text) in enumerate(zip(sites, texts)):
                filename = f'news{i}.txt'
                with open(os.path.join(local_base, filename), 'w', encoding='utf-8') as file:
                    file.write(text)
                index_file.write(filename + ';' + site + '\n')

        return 0
    except Exception as e:
        return str(e)
|