""" Data Scientist.: Dr.Eddy Giusepe Chirinos Isidro Link de estudo --> https://cloudatlas.me/query-your-pdfs-with-openai-langchain-and-faiss-7e8221791c62 """ # Substitua sua chave de API OpenAI: import openai import os from dotenv import load_dotenv, find_dotenv _ = load_dotenv(find_dotenv()) # read local .env file openai.api_key = os.environ['OPENAI_API_KEY'] from pypdf import PdfReader from langchain.text_splitter import CharacterTextSplitter doc_reader = PdfReader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/spacy_teste.pdf") raw_text = '' for i, page in enumerate(doc_reader.pages): text = page.extract_text() if text: raw_text += text #print(raw_text) print("") print(len(raw_text)) # Splitting into smaller chunks: text_splitter = CharacterTextSplitter(separator = "\n", chunk_size = 1000, chunk_overlap = 200, length_function = len, ) texts = text_splitter.split_text(raw_text) #print(texts) # Normalize e limpe o texto para incorporações: import re def normalize_text(eddy_text, sep_token = "\n"): eddy_text = re.sub(r'\s+', ' ', eddy_text).strip() eddy_text = re.sub(r". ,", "", eddy_text) # Remover todas as instancias de múltiplos espaços eddy_text = eddy_text.replace("..", ".") eddy_text = eddy_text.replace(". .", ".") eddy_text = eddy_text.replace("\n", "") eddy_text = eddy_text.strip() return eddy_text texts = list(map(normalize_text, texts)) #print(texts) from langchain.vectorstores import FAISS from langchain.embeddings import OpenAIEmbeddings embeddings = OpenAIEmbeddings() docsearch = FAISS.from_texts(texts, embeddings) docsearch.embedding_function # Cadeia (chain) LangChain: from langchain.chains.question_answering import load_qa_chain from langchain.llms import OpenAI chain = load_qa_chain(OpenAI(), chain_type="stuff") # Testando, queries: query = "Qual é o objetivo do problema de classificação" #"O que é entropia?" docs = docsearch.similarity_search(query, k=3) response = chain.run(input_documents=docs, question=query) print(response)