Spaces:

EddyGiusepe
/

12_LangChain_Router_Chains_and_other_stuff_too

No application file

App Files Files Community

EddyGiusepe committed on Jul 14

Commit

d50b019

•

1 Parent(s): 064aa92

Scripts sobre LangChain

Browse files

Files changed (3) hide show

QA_PDF_teste.py +21 -3
multi_file.py +9 -3
query_PDF_with_OpenAI_LangChain_Faiss.py +72 -0

QA_PDF_teste.py CHANGED Viewed

@@ -40,18 +40,36 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # "0" para usar GPU "-1" para CPU
 # documents = loader.load_and_split()
-loader = TextLoader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/carta01.txt")
 documents = loader.load()
 # Dividir os documentos em chunks:
 #text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000,
-                                               chunk_overlap=50,
-                                               separators="\n\n"
                                               )
 texts = text_splitter.split_documents(documents=documents) # Para .pdf e .txt
 persist_directory = './chromadb'

 # documents = loader.load_and_split()
+#loader = TextLoader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/carta01.txt")
+#loader = TextLoader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/9.2 - Secretaria de Saúde - Empresas.txt")
+loader = TextLoader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/1-Administracao_digital.txt")
 documents = loader.load()
 # Dividir os documentos em chunks:
 #text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000,
+                                               chunk_overlap=200,
+                                               #separators=["\n\n", "\n", " ", ""] #"\n\n"
+                                               #length_function = len,
+                                               #is_separator_regex = False
                                               )
 texts = text_splitter.split_documents(documents=documents) # Para .pdf e .txt
+print("🤗")
+print(texts)
+print("🤗🤗")
+print("🦆", texts[0])
+print("")
+print("🦆🦆", texts[1])
+print("")
+print("🦆🦆🦆", texts[2])
+print("")
+#print("🦆🦆🦆🦆", texts[3])
+print("")
+#print("🦆🦆🦆🦆🦆", texts[4])
+print("")
+#print("🦆🦆🦆🦆🦆🦆", texts[5])
 persist_directory = './chromadb'

multi_file.py CHANGED Viewed

@@ -67,7 +67,7 @@ for filename in os.listdir(docs_dir):
         elif filename.endswith('.pdf'):
             loader = PyPDFLoader(os.path.join(docs_dir, filename))
             doc = loader.load_and_split()
-            print(doc)
         if doc is not None:
             # Crie um novo Chroma VectorStore e salve-o no disco:
@@ -89,5 +89,11 @@ chain = MultiRetrievalQAChain.from_retrievers(OpenAI(), retriever_names, retriev
 # print(chain.run("Quais são as diferenças entre Newton e Feynman?"))
 while True:
-    print(chain.run(input("\033[033mO que você gostaria de saber? 🤓\033[m ")))

         elif filename.endswith('.pdf'):
             loader = PyPDFLoader(os.path.join(docs_dir, filename))
             doc = loader.load_and_split()
+            #print(doc)
         if doc is not None:
             # Crie um novo Chroma VectorStore e salve-o no disco:
 # print(chain.run("Quais são as diferenças entre Newton e Feynman?"))
 while True:
+    # Interactive loop: read a question, run it through the chain and print
+    # the answer. Submitting an empty line ends the session.
+    query = input("\033[033mUsuário:\033[m ")
+    print("")
+    # Check for the exit condition *before* calling the chain, so an empty
+    # query is never sent to the LLM and no meaningless answer is printed.
+    if not query:
+        break
+    response = chain.run(query)
+    print("\033[032mA resposta mais SIMILAR é: \033[m", response)
+    print("")

query_PDF_with_OpenAI_LangChain_Faiss.py ADDED Viewed

	@@ -0,0 +1,72 @@

+"""
+Data Scientist.: Dr.Eddy Giusepe Chirinos Isidro
+Link de estudo --> https://cloudatlas.me/query-your-pdfs-with-openai-langchain-and-faiss-7e8221791c62
+"""
+# Substitua sua chave de API OpenAI:
+import openai
+import os
+from dotenv import load_dotenv, find_dotenv
+_ = load_dotenv(find_dotenv()) # read local .env file
+openai.api_key  = os.environ['OPENAI_API_KEY']
+from pypdf import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+# Open the PDF and concatenate the extracted text of all its pages.
+doc_reader = PdfReader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/spacy_teste.pdf")
+# Pages whose extract_text() result is empty/falsy are skipped. The pieces
+# are joined once instead of growing the string with += inside a loop.
+page_texts = (page.extract_text() for page in doc_reader.pages)
+raw_text = "".join(text for text in page_texts if text)
+print("")
+# Show how many characters were extracted in total.
+print(len(raw_text))
+# Split the raw text into smaller chunks: the splitter is configured to split
+# on newlines, with chunk_size=1000 and chunk_overlap=200, both measured with
+# the built-in len (i.e. in characters).
+text_splitter = CharacterTextSplitter(separator = "\n",
+                                      chunk_size = 1000,
+                                      chunk_overlap = 200,
+                                      length_function = len,
+                                     )
+# `texts` is the result of splitting the whole extracted text.
+texts = text_splitter.split_text(raw_text)
+#print(texts)
+# Normalize e limpe o texto para incorporações:
+import re
+def normalize_text(eddy_text, sep_token = "\n"):
+    """Clean up a text chunk before it is embedded.
+
+    Collapses every run of whitespace (spaces, tabs, newlines) into a single
+    space, removes literal ". ," artifacts and collapses doubled periods
+    (".." and ". .") into a single period.
+
+    Args:
+        eddy_text: The raw text chunk to normalize.
+        sep_token: Unused; kept only so the signature stays compatible.
+
+    Returns:
+        The normalized text without leading or trailing whitespace.
+    """
+    # Collapse all whitespace runs into one space (this also removes every
+    # newline, so no separate "\n" replacement is needed afterwards).
+    eddy_text = re.sub(r'\s+', ' ', eddy_text).strip()
+    # Remove literal ". ," sequences. The dot has to be escaped: unescaped it
+    # matches ANY character and would eat the last letter of "word ,".
+    eddy_text = re.sub(r"\. ,", "", eddy_text)
+    # Collapse doubled periods into a single one.
+    eddy_text = eddy_text.replace("..", ".")
+    eddy_text = eddy_text.replace(". .", ".")
+    # Removing ". ," can leave whitespace at the edges, so strip once more.
+    return eddy_text.strip()
+texts = list(map(normalize_text, texts))
+#print(texts)
+from langchain.vectorstores import FAISS
+from langchain.embeddings import OpenAIEmbeddings
+# Create the OpenAI embeddings object and build a FAISS vector store from
+# the normalized text chunks. (The former bare `docsearch.embedding_function`
+# expression was dropped: outside a notebook it has no visible effect.)
+embeddings = OpenAIEmbeddings()
+docsearch = FAISS.from_texts(texts, embeddings)
+# Cadeia (chain) LangChain:
+from langchain.chains.question_answering import load_qa_chain
+from langchain.llms import OpenAI
+# Build a question-answering chain with chain_type="stuff" on the OpenAI LLM.
+chain = load_qa_chain(OpenAI(), chain_type="stuff")
+# Test query (an alternative question is kept in the trailing comment):
+query = "Qual é o objetivo do problema de classificação" #"O que é entropia?"
+# Run a similarity search with k=3 against the vector store, then hand the
+# returned documents plus the question to the chain and print its answer.
+docs = docsearch.similarity_search(query, k=3)
+response = chain.run(input_documents=docs, question=query)
+print(response)