{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ad4030a6-e372-4353-936b-18c29f5eeda5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "from dotenv import load_dotenv, find_dotenv\n", "load_dotenv(find_dotenv(), override=True)" ] }, { "cell_type": "code", "execution_count": 18, "id": "7b8a555e-af55-4abb-8811-af91e5816b2a", "metadata": {}, "outputs": [], "source": [ "def load_document(file):\n", " import os\n", " name, extension = os.path.splitext(file)\n", "\n", " if extension == '.pdf':\n", " from langchain.document_loaders import PyPDFLoader\n", " print(f'Carregando {file}')\n", " loader = PyPDFLoader(file)\n", " elif extension == '.docx':\n", " from langchain.document_loaders import Docx2txtLoader\n", " print(f'Carregando {file}')\n", " loader = Docx2txtLoader(file)\n", " else:\n", " print('Formato não suportado!')\n", " return None\n", "\n", " data = loader.load()\n", " return data\n", "\n", "# Wikipedia Loader\n", "def load_from_wikipedia(query, lang='pt', load_max_docs=2):\n", " from langchain.document_loaders import WikipediaLoader\n", " loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)\n", " data = loader.load()\n", " return data" ] }, { "cell_type": "code", "execution_count": 24, "id": "d0a680f5-34ff-4e25-9c82-ffee69eb7e96", "metadata": {}, "outputs": [], "source": [ "# Teste carregando dados de um pdf\n", "# documento = load_document('docs/CLT.pdf')\n", "# documento[100].page_content" ] }, { "cell_type": "code", "execution_count": 25, "id": "86329dea-7d2b-4527-9fdb-af354d8dbb56", "metadata": {}, "outputs": [], "source": [ "# Teste carregando dados da wikipedia\n", "# data = load_from_wikipedia('GPT-4')\n", "# print(data[0].page_content)" ] }, { "cell_type": "code", "execution_count": 26, "id": "f6cd5a76-7467-47d3-853d-7538979a84e1", "metadata": {}, "outputs": [], "source": [ "def chunk_data(data, chunk_size=1000):\n", " from langchain.text_splitter import RecursiveCharacterTextSplitter\n", " text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)\n", " chunks = text_splitter.split_documents(data)\n", " return chunks" ] }, { "cell_type": "code", "execution_count": 32, "id": "a066a7cb-de35-44c7-a47d-e7f0ad674172", "metadata": {}, "outputs": [], "source": [ "def embedding_cost(texts):\n", " import tiktoken\n", " enc = tiktoken.encoding_for_model('text-embedding-ada-002')\n", " total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])\n", " print(f'Total de tokens: {total_tokens}')\n", " print(f'Custo de Embedding em USD: {total_tokens / 1000 * 0.0001:.6f}')" ] }, { "cell_type": "code", "execution_count": 36, "id": "bcfe04be-9886-4b13-af30-9e91e520d04e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Carregando docs/CLT.pdf\n" ] } ], "source": [ "data = load_document('docs/CLT.pdf')\n", "chunks = chunk_data(data)" ] }, { "cell_type": "code", "execution_count": 38, "id": "8914219c-c640-4e7f-b278-8852802e6676", "metadata": {}, "outputs": [], "source": [ "# Teste print chunks\n", "# print(chunks[100].page_content)\n", "# print(len(chunks))" ] }, { "cell_type": "code", "execution_count": 39, "id": "a14b9cd9-9da8-4534-b59e-580f6a1d90a5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total de tokens: 252697\n", "Custo de Embedding em USD: 0.025270\n" ] } ], "source": [ "# Custo total para transformar 
{ "cell_type": "code", "execution_count": 40, "id": "dd8cfb4d-1f88-4e49-9507-b77198e33434", "metadata": {}, "outputs": [], "source": [ "# Creates (or reuses) a Pinecone index and stores the chunk embeddings in it\n", "def insert_embeddings(index_name, chunks):\n", "    import pinecone\n", "    from langchain.vectorstores import Pinecone\n", "    from langchain.embeddings.openai import OpenAIEmbeddings\n", "\n", "    embeddings = OpenAIEmbeddings()\n", "    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))\n", "\n", "    if index_name in pinecone.list_indexes():\n", "        print(f'Index {index_name} already exists. Loading embeddings...')\n", "        vector_store = Pinecone.from_existing_index(index_name, embeddings)\n", "        print('Ok')\n", "    else:\n", "        print(f'Creating index {index_name}')\n", "        pinecone.create_index(index_name, dimension=1536, metric='cosine')\n", "        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)\n", "        print('Ok')\n", "    return vector_store" ] },
{ "cell_type": "code", "execution_count": 41, "id": "17d5a6d7-0f50-4cb5-8f99-68f7da27ef69", "metadata": {}, "outputs": [], "source": [ "# Deletes one Pinecone index, or every index when index_name='all'\n", "def delete_index(index_name='all'):\n", "    import pinecone\n", "    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))\n", "\n", "    if index_name == 'all':\n", "        indexes = pinecone.list_indexes()\n", "        print('Deleting all indexes...')\n", "        for index in indexes:\n", "            pinecone.delete_index(index)\n", "    else:\n", "        print(f'Deleting index {index_name}...')\n", "        pinecone.delete_index(index_name)" ] },
{ "cell_type": "code", "execution_count": 44, "id": "4c39ef6f-1fc8-424b-aaa6-a1c6c7f45f20", "metadata": {}, "outputs": [], "source": [ "# Test: deleting an existing index\n", "# delete_index()" ] },
{ "cell_type": "code", "execution_count": 43, "id": "cabeb040-0fd3-4433-9c04-208b55b060f4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Creating index linuxtips\n", "Ok\n" ] } ], "source": [ "index_name = 'linuxtips'\n", "vector_store = insert_embeddings(index_name, chunks)" ] },
{ "cell_type": "code", "execution_count": 63, "id": "384f79f7-17af-4569-b3fa-8bd9578882ca", "metadata": {}, "outputs": [], "source": [ "# Single-shot question answering over the vector store (no conversation memory)\n", "def get_answer(vector_store, q):\n", "    from langchain.chains import RetrievalQA\n", "    from langchain.chat_models import ChatOpenAI\n", "\n", "    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)\n", "\n", "    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})\n", "\n", "    chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)\n", "\n", "    answer = chain.run(q)\n", "    return answer\n", "\n", "# Conversational question answering: previous turns are passed back via chat_history\n", "def ask_with_memory(vector_store, question, chat_history=None):\n", "    from langchain.chains import ConversationalRetrievalChain\n", "    from langchain.chat_models import ChatOpenAI\n", "\n", "    # avoid a mutable default argument\n", "    if chat_history is None:\n", "        chat_history = []\n", "\n", "    llm = ChatOpenAI(temperature=1)\n", "    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})\n", "\n", "    crc = ConversationalRetrievalChain.from_llm(llm, retriever)\n", "    result = crc({'question': question, 'chat_history': chat_history})\n", "    chat_history.append((question, result['answer']))\n", "\n", "    return result, chat_history" ] },
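{ "cell_type": "code", "execution_count": null, "id": "9c4d8e21-6f3a-4b7d-a150-5e8f2c6d9b42", "metadata": {}, "outputs": [], "source": [ "# A minimal sketch (not executed): an interactive loop built on ask_with_memory instead\n", "# of get_answer, so follow-up questions can rely on the accumulated chat_history.\n", "# It assumes vector_store was created by insert_embeddings above; the name chat_loop\n", "# is just an illustrative choice.\n", "def chat_loop(vector_store):\n", "    chat_history = []\n", "    print('Type quit to exit.')\n", "    while True:\n", "        question = input('Question: ')\n", "        if question.lower() == 'quit':\n", "            print('Exiting...')\n", "            break\n", "        result, chat_history = ask_with_memory(vector_store, question, chat_history)\n", "        print(f\"\\nAnswer: {result['answer']}\")\n", "        print(f'\\n {\"-\" * 50} \\n')\n", "\n", "# chat_loop(vector_store)" ] },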
{ "cell_type": "code", "execution_count": 68, "id": "a4d4929d-7625-4222-a97a-3b4928e4e785", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The CLT, Consolidação das Leis do Trabalho, was created in 1943.\n" ] } ], "source": [ "q = 'what year was the CLT created?'\n", "answer = get_answer(vector_store, q)\n", "print(answer)" ] },
{ "cell_type": "code", "execution_count": 48, "id": "ba4fff3b-4632-4d1d-9266-01e8426eebf4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Type quit to exit.\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ "Question #1: what is the thirteenth salary?\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Answer: The thirteenth salary is a benefit paid to workers in Brazil, also known as the \"Christmas bonus\". It is an extra salary paid once a year, usually in December, equal to 1/12 of the total remuneration the worker received during the year, including base salary, overtime, commissions and additional pay. The purpose of this benefit is to give workers extra income for the additional expenses at the end of the year.\n", "\n", " -------------------------------------------------- \n", "\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ "Question #2: what was my previous question?\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Answer: Sorry, but I do not have access to your previous question. Can I help with any other information?\n", "\n", " -------------------------------------------------- \n", "\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ "Question #3: quit\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Exiting...\n" ] } ], "source": [ "# Memoryless Q&A loop: each question is answered independently via get_answer\n", "import time\n", "i = 1\n", "print('Type quit to exit.')\n", "while True:\n", "    q = input(f'Question #{i}: ')\n", "    i = i + 1\n", "    if q.lower() in ['quit']:\n", "        print('Exiting...')\n", "        time.sleep(2)\n", "        break\n", "\n", "    answer = get_answer(vector_store, q)\n", "    print(f'\\nAnswer: {answer}')\n", "    print(f'\\n {\"-\" * 50} \\n')" ] },
{ "cell_type": "code", "execution_count": 75, "id": "eba0c508-453f-416a-9d8c-b0edb7dfcc51", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The Consolidação das Leis do Trabalho (CLT) was created in 1943.\n" ] } ], "source": [ "chat_history = []\n", "question = 'what year was the CLT created?'\n", "result, chat_history = ask_with_memory(vector_store, question, chat_history)\n", "print(result['answer'])" ] },
{ "cell_type": "code", "execution_count": 76, "id": "b34ff3ab-6de4-44b0-a441-6462b85d2d14", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "From 1943 to 2023, 80 years have passed.\n" ] } ], "source": [ "question = 'it is now 2023, how many years have passed?'\n", "result, chat_history = ask_with_memory(vector_store, question, chat_history)\n", "print(result['answer'])" ] },
{ "cell_type": "code", "execution_count": 84, "id": "504fbca7-f317-4cfb-99cd-c36a3dd4039d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('what year was the CLT created?', 'The Consolidação das Leis do Trabalho (CLT) was created in 1943.'), ('it is now 2023, how many years have passed?', 'From 1943 to 2023, 80 years have passed.')]\n" ] } ], "source": [ "print(chat_history)" ] },
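{ "cell_type": "code", "execution_count": null, "id": "1a5b9c37-2d8e-4f60-b3a7-8c1d4e7f2a95", "metadata": {}, "outputs": [], "source": [ "# A minimal sketch (not executed): chat_history grows with every exchange, and the whole\n", "# list is sent back to ConversationalRetrievalChain on each call. Keeping only the last\n", "# few turns bounds the prompt size; the window of 3 turns is an arbitrary illustrative\n", "# choice, and the follow-up question below is hypothetical.\n", "def trim_history(chat_history, max_turns=3):\n", "    return chat_history[-max_turns:]\n", "\n", "# question = 'how many articles does the CLT have?'\n", "# result, chat_history = ask_with_memory(vector_store, question, trim_history(chat_history))" ] },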
"text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }