leandrocarneiro committed
Commit a2ee974
1 Parent(s): bc76af9

Upload 7 files

Files changed (7)
  1. app.py +66 -0
  2. constants.py +9 -0
  3. llm.py +44 -0
  4. main.py +56 -0
  5. rag.py +104 -0
  6. requirements.txt +11 -0
  7. search_engine.py +65 -0
app.py ADDED
@@ -0,0 +1,66 @@
+ # Created by Leandro Carneiro at 22/01/2024
+ # Description: Gradio front end for the BotNews generator
+ # ------------------------------------------------
+ 
+ import gradio as gr
+ 
+ import main
+ 
+ # Define two separate functions for each button
+ def call_generate_news(subject, sites, min_words, max_words):
+     # button1 is wired to a single output, so every branch returns one string
+     if subject == '':
+         return 'Erro: Assunto não informado'
+     if min_words == '':
+         return 'Erro: Mínimo de palavras não informado'
+     if max_words == '':
+         return 'Erro: Máximo de palavras não informado'
+     if sites == '':
+         return 'Erro: Domínios para pesquisa não informados'
+     if int(max_words) < int(min_words):  # compare as numbers, not strings
+         return 'Erro: Máximo de palavras menor que o mínimo de palavras'
+ 
+     list_sites = sites.split('\n')
+     result_news = main.generate_news(subject, min_words, max_words, list_sites)
+     return result_news
+ 
+ def call_invoke_llm(context, prompt):
+     result = main.call_llm(context, prompt)
+     return result
+ 
+ # Create the Gradio interface using Blocks
+ with gr.Blocks(title='BotNews') as page:
+     gr.Markdown("# BotNews")
+     gr.Markdown("## Gerar Notícia por IA")
+     gr.Markdown(" ")
+     gr.Markdown("Instrução: Preencha abaixo com um assunto (Por exemplo: 'Guerra em Israel' ou 'Economia do Brasil').")
+     gr.Markdown(" ")
+     with gr.Row():
+         input1 = gr.Textbox(label="Assunto:", lines=1)
+     with gr.Row():
+         input2 = gr.Textbox(label="Domínios para pesquisa (coloque um domínio por linha):",
+                             lines=10, value='https://www.cnnbrasil.com.br/\nhttps://g1.globo.com/\nhttps://www.metropoles.com/\nhttps://www.bbc.com/portuguese/\nhttps://www.instagram.com/')
+     with gr.Row():
+         input3 = gr.Textbox(label="Mínimo de palavras:", lines=1, value="300")
+         input4 = gr.Textbox(label="Máximo de palavras:", lines=1, value="700")
+     with gr.Row():
+         button1 = gr.Button("Gerar notícia")
+     with gr.Row():
+         output1 = gr.Textbox(label="Notícia gerada por IA", lines=25)
+     gr.Markdown("<hr>")
+     gr.Markdown("## Prompt para a Notícia")
+     gr.Markdown(" ")
+     gr.Markdown("Instrução: Preencha abaixo com um comando para ser executado sobre a notícia (Por exemplo: 'Resuma em tópicos' ou 'Adicione um tom sarcástico').")
+     gr.Markdown(" ")
+     with gr.Row():
+         input5 = gr.Textbox(label="Prompt para a notícia")
+     with gr.Row():
+         button2 = gr.Button("Gerar resposta")
+     with gr.Row():
+         output2 = gr.Textbox(label="Resposta gerada por IA", lines=25)
+ 
+     button1.click(call_generate_news, inputs=[input1, input2, input3, input4], outputs=[output1])
+     button2.click(call_invoke_llm, inputs=[output1, input5], outputs=[output2])
+ 
+ # Launch the interface
+ page.launch(share=True)
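A note on the final line: share=True asks Gradio for a temporary public tunnel, which only matters when the script runs on a local machine; on Hugging Face Spaces the app is already served publicly and Gradio warns that share links are unsupported there. A minimal sketch of a launch call that adapts to both settings, assuming the SPACE_ID environment variable (set by Spaces) is a reliable signal:

# Hypothetical replacement for the launch call above, not part of this commit.
import os

if os.environ.get('SPACE_ID'):  # defined when running on Hugging Face Spaces
    page.launch()               # Spaces serves the app itself
else:
    page.launch(share=True)     # local run: request a public share link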
constants.py ADDED
@@ -0,0 +1,9 @@
+ # Created by Leandro Carneiro at 19/01/2024
+ # Description: Shared configuration values
+ # ------------------------------------------------
+ 
+ #subject = 'Guerra entre Irã e Paquistão'
+ 
+ #sites = ['https://www.cnnbrasil.com.br/']#, 'https://g1.globo.com/', 'https://www.metropoles.com/']
+ num_sites = 1  # search results fetched per domain (alternative value: 5)
+ local_base = 'local_base'  # folder where downloaded articles are stored
llm.py ADDED
@@ -0,0 +1,44 @@
+ # Created by Leandro Carneiro at 19/01/2024
+ # Description: Thin wrapper around the OpenAI chat model
+ # ------------------------------------------------
+ from langchain_openai import ChatOpenAI
+ 
+ import api_key
+ 
+ def invoke_llm(context, task):
+     prompt = f"""You are an assistant of a newspaper.
+     Do not make up any information; execute the task based only on the given context.
+     The task is delimited by ### and the context is delimited by $$$.
+     Write in formal language and in Portuguese.
+     Your task is: ###{task}###
+     The context is: $$${context}$$$
+     """
+ 
+     llm = ChatOpenAI(model_name="gpt-3.5-turbo",
+                      temperature=0,
+                      openai_api_key=api_key.OPENAI_KEY,
+                      max_tokens=1000)
+     result = llm.invoke(prompt)
+     return result.content
+ 
+ 
+ # def generate_topics(text):
+ #     prompt = f"""You are an assistant of a newspaper.
+ #     Your task is to extract the relevant topics of a news story and build a concise road map.
+ #     Do not make up any information; create the road map based only on the given information.
+ #     The road map should be written in formal language and in Portuguese.
+ #     Answer just the information; do not add any title.
+ #     The road map should be about the following context: {text}
+ #     """
+ #
+ #     llm = ChatOpenAI(model_name="gpt-3.5-turbo",
+ #                      temperature=0,
+ #                      openai_api_key=api_key.OPENAI_KEY,
+ #                      max_tokens=1000)
+ #     result = llm.invoke(prompt)
+ #     return result.content
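Both llm.py and rag.py import an api_key module that is not among the seven uploaded files, so the app will raise an ImportError until it is supplied. A minimal sketch of what that module presumably contains, reading the key from the environment rather than hard-coding it (the attribute name OPENAI_KEY is taken from the imports above; the environment variable name is an assumption):

# api_key.py -- hypothetical companion module, not part of this commit.
# llm.py and rag.py only require that it expose an OPENAI_KEY attribute.
import os

OPENAI_KEY = os.environ['OPENAI_API_KEY']  # fails fast if the key is absent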
main.py ADDED
@@ -0,0 +1,56 @@
+ # Created by Leandro Carneiro at 19/01/2024
+ # Description: Orchestrates search, download, indexing and generation
+ # ------------------------------------------------
+ 
+ import search_engine
+ import rag
+ import constants
+ import llm
+ 
+ 
+ def generate_news(subject, min_words, max_words, sites):
+     print('\n\n' + '*' * 50)
+     print('\n\nInício do Programa: \n')
+ 
+     print('\nBuscando sites relevantes...')
+     retrieved_sites = search_engine.search_google(subject, sites)
+     if isinstance(retrieved_sites, str):  # the helpers return an error string on failure
+         return 'Erro: ' + retrieved_sites
+ 
+     print('\nBaixando as notícias...')
+     retrieved_text_from_sites = search_engine.retrieve_text_from_site(retrieved_sites)
+     if isinstance(retrieved_text_from_sites, str):
+         return 'Erro: ' + retrieved_text_from_sites
+ 
+     print('\nSalvando as notícias em base local...')
+     ret = search_engine.delete_base(constants.local_base)
+     if ret != 0:
+         return 'Erro: ' + ret
+     ret = search_engine.save_on_base(retrieved_sites, retrieved_text_from_sites, constants.local_base)
+     if ret != 0:
+         return 'Erro: ' + ret
+ 
+     print('\nGerando embeddings e vectorstore...')
+     vectorstore = rag.generate_embeddings_and_vectorstore(constants.local_base)
+     if isinstance(vectorstore, str):
+         return 'Erro: ' + vectorstore
+ 
+     print('\nGerando a notícia (RAG)...')
+     print(' Assunto: ' + subject)
+     obj_rag = rag.Rag(vectorstore, min_words, max_words)
+     result_news = obj_rag.generate_text(subject)
+ 
+     print('\n\n' + '*' * 50 + '\n\n')
+     print(result_news[0])
+ 
+     print('\n\nFontes: ')
+     print(result_news[1])
+ 
+     return result_news[0] + '\n\n' + 'Fontes: ' + '\n' + result_news[1]
+ 
+ 
+ def call_llm(context, prompt):
+     print('\nChamando o modelo de linguagem...')
+     result = llm.invoke_llm(context, prompt)
+ 
+     return result
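For quick testing outside the Gradio UI, the pipeline can be driven directly. A sketch of such a smoke test, where the subject and domain are illustrative values only (the word counts are passed as strings, mirroring what the Textbox widgets deliver):

# Hypothetical smoke test for main.py, not part of this commit.
import main

if __name__ == '__main__':
    news = main.generate_news(
        subject='Guerra entre Irã e Paquistão',   # illustrative subject
        min_words='300',                          # strings, as the UI sends them
        max_words='700',
        sites=['https://www.cnnbrasil.com.br/'],  # illustrative domain
    )
    print(news)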
rag.py ADDED
@@ -0,0 +1,104 @@
+ # Created by Leandro Carneiro at 19/01/2024
+ # Description: Embeddings, vectorstore and the retrieval-augmented chain
+ # ------------------------------------------------
+ #from langchain.embeddings import OpenAIEmbeddings
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.document_loaders import DirectoryLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.prompts import PromptTemplate
+ from langchain_openai import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ import os
+ import csv
+ 
+ import api_key
+ 
+ def read_csv_to_dict(filename):
+     # The file is written by search_engine.save_on_base with ';' as separator
+     data_dict = {}
+     with open(filename, mode='r', encoding='utf-8') as file:
+         csv_reader = csv.reader(file, delimiter=';')
+         for row in csv_reader:
+             key, value = row
+             data_dict[key] = value
+     return data_dict
+ 
+ def generate_embeddings_and_vectorstore(path):
+     try:
+         loader = DirectoryLoader(path=path, glob="**/*.txt")
+         corpus = loader.load()
+         print(f' Total de documentos antes do text_split = {len(corpus)}')
+ 
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=400)
+         docs = text_splitter.split_documents(corpus)
+         num_total_characters = sum([len(x.page_content) for x in docs])
+         print(f" Total de chunks depois do text_split = {len(docs)}")
+         print(f" Média de caracteres por chunk = {num_total_characters / len(docs):,.0f}")
+ 
+         # Attach the source URL to each chunk so the generated news can cite it
+         dict_filename_url = read_csv_to_dict(os.path.join(path, 'filename_url.csv'))
+         for doc in docs:
+             filename = os.path.basename(doc.metadata["source"])
+             doc.metadata["link"] = dict_filename_url.get(filename)
+ 
+         openai_api_key = api_key.OPENAI_KEY
+         fc_embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
+         vectorstore = Chroma.from_documents(docs, fc_embeddings)
+ 
+         return vectorstore
+     except Exception as e:
+         print(str(e))
+         return str(e)
+ 
+ class Rag:
+     def __init__(self, vectorstore, min_words, max_words):
+         self.text = None
+         self.vectorstore = vectorstore
+         self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")
+ 
+         prompt_template = """Your task is to write news for a newspaper based on pieces of text delimited by <> and a question delimited by <>.
+         Do not make up any information; write the news based only on the given pieces of text.
+         The news should have a title.
+         The news should be written in formal language.
+         The news should have between {min_words} and {max_words} words and should be in Portuguese.
+         The news should be about the following context: <{context}>
+         Question: <{question}>
+         Answer here:"""
+         self.prompt = PromptTemplate(template=prompt_template,
+                                      input_variables=["context", "question"],
+                                      partial_variables={"min_words": min_words, "max_words": max_words})
+ 
+         self.qa = ConversationalRetrievalChain.from_llm(
+             llm=ChatOpenAI(model_name="gpt-3.5-turbo",
+                            temperature=0.3,
+                            openai_api_key=api_key.OPENAI_KEY,
+                            max_tokens=int(int(max_words) + (int(max_words) / 2))),  # maximum number of tokens for the answer
+             memory=self.memory,
+             retriever=vectorstore.as_retriever(),  # search_kwargs={'k': 3}
+             combine_docs_chain_kwargs={"prompt": self.prompt},
+             chain_type="stuff",  # alternatives: map_reduce, refine, map_rerank
+             return_source_documents=True,
+         )
+ 
+     def generate_text(self, subject):
+         query = f"Elabore uma nova notícia sobre {subject}."
+         result_text = self.qa.invoke({"question": query})
+ 
+         # Deduplicate the source links and number them for display
+         list_result_sources = []
+         str_result_sources = ''
+         for doc in result_text["source_documents"]:
+             list_result_sources.append(doc.metadata['link'])
+         result_sources = list(set(list_result_sources))
+         for i in range(len(result_sources)):
+             str_result_sources += f'{i + 1}) {result_sources[i]}' + '\n'
+ 
+         return (result_text["answer"], str_result_sources)
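Read in isolation, the Rag class can also be exercised against an existing local base, skipping the search step. A sketch under the assumption that ./local_base already holds .txt articles and a filename_url.csv from a previous run (the subject and word counts are illustrative):

# Hypothetical direct use of rag.py, not part of this commit.
import rag

vectorstore = rag.generate_embeddings_and_vectorstore('local_base')
if not isinstance(vectorstore, str):  # an error string is returned on failure
    bot = rag.Rag(vectorstore, min_words='300', max_words='700')
    answer, sources = bot.generate_text('Economia do Brasil')
    print(answer)
    print('Fontes:\n' + sources)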
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ googlesearch-python
+ langchain
+ langchain-community
+ langchain_openai
+ openai
+ unstructured
+ chromadb
+ tiktoken
+ gradio
+ requests
+ beautifulsoup4
search_engine.py ADDED
@@ -0,0 +1,65 @@
+ # Created by Leandro Carneiro at 19/01/2024
+ # Description: Google search, article download and local persistence
+ # ------------------------------------------------
+ import os
+ import time
+ 
+ from googlesearch import search
+ import requests
+ from bs4 import BeautifulSoup
+ 
+ import constants
+ 
+ 
+ def search_google(subject, sites):
+     try:
+         results = []
+         for site in sites:
+             print(' Buscando notícias no domínio: ' + site)
+             query = f"{subject} site:{site}"
+             sites_searched = search(query, num_results=constants.num_sites)
+             for s in sites_searched:
+                 results.append(s)
+             #time.sleep(3)  # uncomment to avoid being rate-limited by Google
+         print(' Total de sites encontrados: ' + str(len(results)))
+ 
+         return results
+     except Exception as e:
+         print(str(e))
+         return str(e)
+ 
+ def retrieve_text_from_site(sites):
+     try:
+         result = []
+         for site in sites:
+             print(' Baixando texto do site: ' + site)
+             response = requests.get(site, timeout=30)
+             response.raise_for_status()
+             soup = BeautifulSoup(response.content, 'html.parser')
+             result.append(soup.get_text())
+         return result
+     except Exception as e:
+         return str(e)
+ 
+ def delete_base(local_base):
+     try:
+         # Remove every file from the previous run so stale articles
+         # (and the old filename_url.csv) do not leak into the new base
+         for i in os.listdir(local_base):
+             file_path = os.path.join(local_base, i)
+             os.remove(file_path)
+         return 0
+     except Exception as e:
+         return str(e)
+ 
+ def save_on_base(sites, texts, local_base):
+     try:
+         for i in range(len(sites)):
+             filename = f'news{i}.txt'
+             with open(os.path.join(local_base, filename), 'w', encoding='utf-8') as file:
+                 file.write(texts[i])
+             # Keep a filename -> URL map so rag.py can cite sources later
+             with open(os.path.join(local_base, 'filename_url.csv'), 'a', encoding='utf-8') as file:
+                 file.write(filename + ';' + sites[i] + '\n')
+ 
+         return 0
+     except Exception as e:
+         return str(e)
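The search-and-persist helpers can be smoke-tested on their own as well. A sketch, assuming the local_base directory from constants.py already exists and that Google does not rate-limit the query (subject and domain are illustrative values):

# Hypothetical standalone exercise of search_engine.py, not part of this commit.
import constants
import search_engine

urls = search_engine.search_google('Economia do Brasil', ['https://g1.globo.com/'])
if not isinstance(urls, str):  # the helpers return an error string on failure
    texts = search_engine.retrieve_text_from_site(urls)
    if not isinstance(texts, str):
        search_engine.delete_base(constants.local_base)
        search_engine.save_on_base(urls, texts, constants.local_base)
        print(f'{len(urls)} articles saved to {constants.local_base}')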