leandrocarneiro committed
Commit a2ee974
1 Parent(s): bc76af9

Upload 7 files

Files changed (7)
  1. app.py +66 -0
  2. constants.py +9 -0
  3. llm.py +44 -0
  4. main.py +56 -0
  5. rag.py +104 -0
  6. requirements.txt +11 -0
  7. search_engine.py +65 -0
app.py ADDED
@@ -0,0 +1,66 @@
+ # Created by Leandro Carneiro at 22/01/2024
+ # Description: Gradio front end for the BotNews generator
+ # ------------------------------------------------
+ 
+ import gradio as gr
+ 
+ import main
+ 
+ # Define two separate functions for each button
+ def call_generate_news(subject, sites, min_words, max_words):
+     # button1 is wired to a single output, so every branch returns one string
+     if subject == '':
+         return 'Erro: Assunto não informado'
+     if min_words == '':
+         return 'Erro: Mínimo de palavras não informado'
+     if max_words == '':
+         return 'Erro: Máximo de palavras não informado'
+     if sites == '':
+         return 'Erro: Domínios para pesquisa não informados'
+     if int(max_words) < int(min_words):  # compare as numbers, not strings
+         return 'Erro: Máximo de palavras menor que o mínimo de palavras'
+ 
+     list_sites = sites.split('\n')
+     result_news = main.generate_news(subject, min_words, max_words, list_sites)
+     return result_news
+ 
+ def call_invoke_llm(context, prompt):
+     result = main.call_llm(context, prompt)
+     return result
+ 
+ # Create the Gradio interface using Blocks
+ with gr.Blocks(title='BotNews') as page:
+     gr.Markdown("# BotNews")
+     gr.Markdown("## Gerar Notícia por IA")
+     gr.Markdown(" ")
+     gr.Markdown("Instrução: Preencha abaixo com um assunto (Por exemplo: 'Guerra em Israel' ou 'Economia do Brasil').")
+     gr.Markdown(" ")
+     with gr.Row():
+         input1 = gr.Textbox(label="Assunto:", lines=1)
+     with gr.Row():
+         input2 = gr.Textbox(label="Domínios para pesquisa (coloque um domínio por linha):",
+                             lines=10, value='https://www.cnnbrasil.com.br/\nhttps://g1.globo.com/\nhttps://www.metropoles.com/\nhttps://www.bbc.com/portuguese/\nhttps://www.instagram.com/')
+     with gr.Row():
+         input3 = gr.Textbox(label="Mínimo de palavras:", lines=1, value="300")
+         input4 = gr.Textbox(label="Máximo de palavras:", lines=1, value="700")
+     with gr.Row():
+         button1 = gr.Button("Gerar notícia")
+     with gr.Row():
+         output1 = gr.Textbox(label="Notícia gerada por IA", lines=25)
+     gr.Markdown("<hr>")
+     gr.Markdown("## Prompt para a Notícia")
+     gr.Markdown(" ")
+     gr.Markdown("Instrução: Preencha abaixo com um comando para ser executado sobre a notícia (Por exemplo: 'Resuma em tópicos' ou 'Adicione um tom sarcástico').")
+     gr.Markdown(" ")
+     with gr.Row():
+         input5 = gr.Textbox(label="Prompt para a notícia")
+     with gr.Row():
+         button2 = gr.Button("Gerar resposta")
+     with gr.Row():
+         output2 = gr.Textbox(label="Resposta gerada por IA", lines=25)
+ 
+     button1.click(call_generate_news, inputs=[input1, input2, input3, input4], outputs=[output1])
+     button2.click(call_invoke_llm, inputs=[output1, input5], outputs=[output2])
+ 
+ # Launch the interface
+ page.launch(share=True)
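A note on the final line: share=True asks Gradio for a temporary public tunnel, which only matters when the script runs on a local machine; on Hugging Face Spaces the app is already served publicly and Gradio warns that share links are unsupported there. A minimal sketch of a launch call that adapts to both settings, assuming the SPACE_ID environment variable (set by Spaces) is a reliable signal:

# Hypothetical replacement for the launch call above, not part of this commit.
import os

if os.environ.get('SPACE_ID'):  # defined when running on Hugging Face Spaces
    page.launch()               # Spaces serves the app itself
else:
    page.launch(share=True)     # local run: request a public share link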
constants.py ADDED
@@ -0,0 +1,9 @@
+ # Created by Leandro Carneiro at 19/01/2024
+ # Description: Shared configuration values
+ # ------------------------------------------------
+ 
+ #subject = 'Guerra entre Irã e Paquistão'
+ 
+ #sites = ['https://www.cnnbrasil.com.br/']#, 'https://g1.globo.com/', 'https://www.metropoles.com/']
+ num_sites = 1  # search results fetched per domain (alternative value: 5)
+ local_base = 'local_base'  # folder where downloaded articles are stored
llm.py ADDED
@@ -0,0 +1,44 @@
+ # Created by Leandro Carneiro at 19/01/2024
+ # Description: Thin wrapper around the OpenAI chat model
+ # ------------------------------------------------
+ from langchain_openai import ChatOpenAI
+ 
+ import api_key
+ 
+ def invoke_llm(context, task):
+     prompt = f"""You are an assistant of a newspaper.
+     Do not make up any information; execute the task based only on the given context.
+     The task is delimited by ### and the context is delimited by $$$.
+     Write in formal language and in Portuguese.
+     Your task is: ###{task}###
+     The context is: $$${context}$$$
+     """
+ 
+     llm = ChatOpenAI(model_name="gpt-3.5-turbo",
+                      temperature=0,
+                      openai_api_key=api_key.OPENAI_KEY,
+                      max_tokens=1000)
+     result = llm.invoke(prompt)
+     return result.content
+ 
+ 
+ # def generate_topics(text):
+ #     prompt = f"""You are an assistant of a newspaper.
+ #     Your task is to extract the relevant topics of a news story and build a concise road map.
+ #     Do not make up any information; create the road map based only on the given information.
+ #     The road map should be written in formal language and in Portuguese.
+ #     Answer just the information; do not add any title.
+ #     The road map should be about the following context: {text}
+ #     """
+ #
+ #     llm = ChatOpenAI(model_name="gpt-3.5-turbo",
+ #                      temperature=0,
+ #                      openai_api_key=api_key.OPENAI_KEY,
+ #                      max_tokens=1000)
+ #     result = llm.invoke(prompt)
+ #     return result.content
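Both llm.py and rag.py import an api_key module that is not among the seven uploaded files, so the app will raise an ImportError until it is supplied. A minimal sketch of what that module presumably contains, reading the key from the environment rather than hard-coding it (the attribute name OPENAI_KEY is taken from the imports above; the environment variable name is an assumption):

# api_key.py -- hypothetical companion module, not part of this commit.
# llm.py and rag.py only require that it expose an OPENAI_KEY attribute.
import os

OPENAI_KEY = os.environ['OPENAI_API_KEY']  # fails fast if the key is absent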
main.py ADDED
@@ -0,0 +1,56 @@
+ # Created by Leandro Carneiro at 19/01/2024
+ # Description: Orchestrates search, download, indexing and generation
+ # ------------------------------------------------
+ 
+ import search_engine
+ import rag
+ import constants
+ import llm
+ 
+ 
+ def generate_news(subject, min_words, max_words, sites):
+     print('\n\n' + '*' * 50)
+     print('\n\nInício do Programa: \n')
+ 
+     print('\nBuscando sites relevantes...')
+     retrieved_sites = search_engine.search_google(subject, sites)
+     if isinstance(retrieved_sites, str):  # the helpers return an error string on failure
+         return 'Erro: ' + retrieved_sites
+ 
+     print('\nBaixando as notícias...')
+     retrieved_text_from_sites = search_engine.retrieve_text_from_site(retrieved_sites)
+     if isinstance(retrieved_text_from_sites, str):
+         return 'Erro: ' + retrieved_text_from_sites
+ 
+     print('\nSalvando as notícias em base local...')
+     ret = search_engine.delete_base(constants.local_base)
+     if ret != 0:
+         return 'Erro: ' + ret
+     ret = search_engine.save_on_base(retrieved_sites, retrieved_text_from_sites, constants.local_base)
+     if ret != 0:
+         return 'Erro: ' + ret
+ 
+     print('\nGerando embeddings e vectorstore...')
+     vectorstore = rag.generate_embeddings_and_vectorstore(constants.local_base)
+     if isinstance(vectorstore, str):
+         return 'Erro: ' + vectorstore
+ 
+     print('\nGerando a notícia (RAG)...')
+     print(' Assunto: ' + subject)
+     obj_rag = rag.Rag(vectorstore, min_words, max_words)
+     result_news = obj_rag.generate_text(subject)
+ 
+     print('\n\n' + '*' * 50 + '\n\n')
+     print(result_news[0])
+ 
+     print('\n\nFontes: ')
+     print(result_news[1])
+ 
+     return result_news[0] + '\n\n' + 'Fontes: ' + '\n' + result_news[1]
+ 
+ 
+ def call_llm(context, prompt):
+     print('\nChamando o modelo de linguagem...')
+     result = llm.invoke_llm(context, prompt)
+ 
+     return result
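For quick testing outside the Gradio UI, the pipeline can be driven directly. A sketch of such a smoke test, where the subject and domain are illustrative values only (the word counts are passed as strings, mirroring what the Textbox widgets deliver):

# Hypothetical smoke test for main.py, not part of this commit.
import main

if __name__ == '__main__':
    news = main.generate_news(
        subject='Guerra entre Irã e Paquistão',   # illustrative subject
        min_words='300',                          # strings, as the UI sends them
        max_words='700',
        sites=['https://www.cnnbrasil.com.br/'],  # illustrative domain
    )
    print(news)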
rag.py ADDED
@@ -0,0 +1,104 @@
+ # Created by Leandro Carneiro at 19/01/2024
+ # Description: Embeddings, vectorstore and the retrieval-augmented chain
+ # ------------------------------------------------
+ #from langchain.embeddings import OpenAIEmbeddings
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.document_loaders import DirectoryLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.prompts import PromptTemplate
+ from langchain_openai import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ import os
+ import csv
+ 
+ import api_key
+ 
+ def read_csv_to_dict(filename):
+     # The file is written by search_engine.save_on_base with ';' as separator
+     data_dict = {}
+     with open(filename, mode='r', encoding='utf-8') as file:
+         csv_reader = csv.reader(file, delimiter=';')
+         for row in csv_reader:
+             key, value = row
+             data_dict[key] = value
+     return data_dict
+ 
+ def generate_embeddings_and_vectorstore(path):
+     try:
+         loader = DirectoryLoader(path=path, glob="**/*.txt")
+         corpus = loader.load()
+         print(f' Total de documentos antes do text_split = {len(corpus)}')
+ 
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=400)
+         docs = text_splitter.split_documents(corpus)
+         num_total_characters = sum([len(x.page_content) for x in docs])
+         print(f" Total de chunks depois do text_split = {len(docs)}")
+         print(f" Média de caracteres por chunk = {num_total_characters / len(docs):,.0f}")
+ 
+         # Attach the source URL to each chunk so the generated news can cite it
+         dict_filename_url = read_csv_to_dict(os.path.join(path, 'filename_url.csv'))
+         for doc in docs:
+             filename = os.path.basename(doc.metadata["source"])
+             doc.metadata["link"] = dict_filename_url.get(filename)
+ 
+         openai_api_key = api_key.OPENAI_KEY
+         fc_embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
+         vectorstore = Chroma.from_documents(docs, fc_embeddings)
+ 
+         return vectorstore
+     except Exception as e:
+         print(str(e))
+         return str(e)
+ 
+ class Rag:
+     def __init__(self, vectorstore, min_words, max_words):
+         self.text = None
+         self.vectorstore = vectorstore
+         self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")
+ 
+         prompt_template = """Your task is to write news for a newspaper based on pieces of text delimited by <> and a question delimited by <>.
+         Do not make up any information; write the news based only on the given pieces of text.
+         The news should have a title.
+         The news should be written in formal language.
+         The news should have between {min_words} and {max_words} words and should be in Portuguese.
+         The news should be about the following context: <{context}>
+         Question: <{question}>
+         Answer here:"""
+         self.prompt = PromptTemplate(template=prompt_template,
+                                      input_variables=["context", "question"],
+                                      partial_variables={"min_words": min_words, "max_words": max_words})
+ 
+         self.qa = ConversationalRetrievalChain.from_llm(
+             llm=ChatOpenAI(model_name="gpt-3.5-turbo",
+                            temperature=0.3,
+                            openai_api_key=api_key.OPENAI_KEY,
+                            max_tokens=int(int(max_words) + (int(max_words) / 2))),  # maximum number of tokens for the answer
+             memory=self.memory,
+             retriever=vectorstore.as_retriever(),  # search_kwargs={'k': 3}
+             combine_docs_chain_kwargs={"prompt": self.prompt},
+             chain_type="stuff",  # alternatives: map_reduce, refine, map_rerank
+             return_source_documents=True,
+         )
+ 
+     def generate_text(self, subject):
+         query = f"Elabore uma nova notícia sobre {subject}."
+         result_text = self.qa.invoke({"question": query})
+ 
+         # Deduplicate the source links and number them for display
+         list_result_sources = []
+         str_result_sources = ''
+         for doc in result_text["source_documents"]:
+             list_result_sources.append(doc.metadata['link'])
+         result_sources = list(set(list_result_sources))
+         for i in range(len(result_sources)):
+             str_result_sources += f'{i + 1}) {result_sources[i]}' + '\n'
+ 
+         return (result_text["answer"], str_result_sources)
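Read in isolation, the Rag class can also be exercised against an existing local base, skipping the search step. A sketch under the assumption that ./local_base already holds .txt articles and a filename_url.csv from a previous run (the subject and word counts are illustrative):

# Hypothetical direct use of rag.py, not part of this commit.
import rag

vectorstore = rag.generate_embeddings_and_vectorstore('local_base')
if not isinstance(vectorstore, str):  # an error string is returned on failure
    bot = rag.Rag(vectorstore, min_words='300', max_words='700')
    answer, sources = bot.generate_text('Economia do Brasil')
    print(answer)
    print('Fontes:\n' + sources)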
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ googlesearch-python
+ langchain
+ langchain-community
+ langchain_openai
+ openai
+ unstructured
+ chromadb
+ tiktoken
+ gradio
+ requests
+ beautifulsoup4
search_engine.py ADDED
@@ -0,0 +1,65 @@
+ # Created by Leandro Carneiro at 19/01/2024
+ # Description: Google search, article download and local persistence
+ # ------------------------------------------------
+ import os
+ import time
+ 
+ from googlesearch import search
+ import requests
+ from bs4 import BeautifulSoup
+ 
+ import constants
+ 
+ 
+ def search_google(subject, sites):
+     try:
+         results = []
+         for site in sites:
+             print(' Buscando notícias no domínio: ' + site)
+             query = f"{subject} site:{site}"
+             sites_searched = search(query, num_results=constants.num_sites)
+             for s in sites_searched:
+                 results.append(s)
+             #time.sleep(3)  # uncomment to avoid being rate-limited by Google
+         print(' Total de sites encontrados: ' + str(len(results)))
+ 
+         return results
+     except Exception as e:
+         print(str(e))
+         return str(e)
+ 
+ def retrieve_text_from_site(sites):
+     try:
+         result = []
+         for site in sites:
+             print(' Baixando texto do site: ' + site)
+             response = requests.get(site, timeout=30)
+             response.raise_for_status()
+             soup = BeautifulSoup(response.content, 'html.parser')
+             result.append(soup.get_text())
+         return result
+     except Exception as e:
+         return str(e)
+ 
+ def delete_base(local_base):
+     try:
+         # Remove every file from the previous run so stale articles
+         # (and the old filename_url.csv) do not leak into the new base
+         for i in os.listdir(local_base):
+             file_path = os.path.join(local_base, i)
+             os.remove(file_path)
+         return 0
+     except Exception as e:
+         return str(e)
+ 
+ def save_on_base(sites, texts, local_base):
+     try:
+         for i in range(len(sites)):
+             filename = f'news{i}.txt'
+             with open(os.path.join(local_base, filename), 'w', encoding='utf-8') as file:
+                 file.write(texts[i])
+             # Keep a filename -> URL map so rag.py can cite sources later
+             with open(os.path.join(local_base, 'filename_url.csv'), 'a', encoding='utf-8') as file:
+                 file.write(filename + ';' + sites[i] + '\n')
+ 
+         return 0
+     except Exception as e:
+         return str(e)
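The search-and-persist helpers can be smoke-tested on their own as well. A sketch, assuming the local_base directory from constants.py already exists and that Google does not rate-limit the query (subject and domain are illustrative values):

# Hypothetical standalone exercise of search_engine.py, not part of this commit.
import constants
import search_engine

urls = search_engine.search_google('Economia do Brasil', ['https://g1.globo.com/'])
if not isinstance(urls, str):  # the helpers return an error string on failure
    texts = search_engine.retrieve_text_from_site(urls)
    if not isinstance(texts, str):
        search_engine.delete_base(constants.local_base)
        search_engine.save_on_base(urls, texts, constants.local_base)
        print(f'{len(urls)} articles saved to {constants.local_base}')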