EddyGiusepe committed on
Commit
d50b019
1 Parent(s): 064aa92

Scripts sobre LangChain

Browse files
QA_PDF_teste.py CHANGED
@@ -40,18 +40,36 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # "0" para usar GPU "-1" para CPU
40
  # documents = loader.load_and_split()
41
 
42
 
43
- loader = TextLoader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/carta01.txt")
 
 
44
  documents = loader.load()
45
 
46
 
47
  # Dividir os documentos em chunks:
48
  #text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
49
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000,
50
- chunk_overlap=50,
51
- separators="\n\n"
 
 
52
  )
53
  texts = text_splitter.split_documents(documents=documents) # Para .pdf e .txt
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  persist_directory = './chromadb'
57
 
 
40
  # documents = loader.load_and_split()
41
 
42
 
43
+ #loader = TextLoader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/carta01.txt")
44
+ #loader = TextLoader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/9.2 - Secretaria de Saúde - Empresas.txt")
45
+ loader = TextLoader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/1-Administracao_digital.txt")
46
  documents = loader.load()
47
 
48
 
49
  # Dividir os documentos em chunks:
50
  #text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
51
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000,
52
+ chunk_overlap=200,
53
+ #separators=["\n\n", "\n", " ", ""] #"\n\n"
54
+ #length_function = len,
55
+ #is_separator_regex = False
56
  )
57
  texts = text_splitter.split_documents(documents=documents) # Para .pdf e .txt
58
 
59
+ print("🤗")
60
+ print(texts)
61
+ print("🤗🤗")
62
+ print("🦆", texts[0])
63
+ print("")
64
+ print("🦆🦆", texts[1])
65
+ print("")
66
+ print("🦆🦆🦆", texts[2])
67
+ print("")
68
+ #print("🦆🦆🦆🦆", texts[3])
69
+ print("")
70
+ #print("🦆🦆🦆🦆🦆", texts[4])
71
+ print("")
72
+ #print("🦆🦆🦆🦆🦆🦆", texts[5])
73
 
74
  persist_directory = './chromadb'
75
 
multi_file.py CHANGED
@@ -67,7 +67,7 @@ for filename in os.listdir(docs_dir):
67
  elif filename.endswith('.pdf'):
68
  loader = PyPDFLoader(os.path.join(docs_dir, filename))
69
  doc = loader.load_and_split()
70
- print(doc)
71
 
72
  if doc is not None:
73
  # Crie um novo Chroma VectorStore e salve-o no disco:
@@ -89,5 +89,11 @@ chain = MultiRetrievalQAChain.from_retrievers(OpenAI(), retriever_names, retriev
89
  # print(chain.run("Quais são as diferenças entre Newton e Feynman?"))
90
 
91
  while True:
92
- print(chain.run(input("\033[033mO que você gostaria de saber? 🤓\033[m ")))
93
-
 
 
 
 
 
 
 
67
  elif filename.endswith('.pdf'):
68
  loader = PyPDFLoader(os.path.join(docs_dir, filename))
69
  doc = loader.load_and_split()
70
+ #print(doc)
71
 
72
  if doc is not None:
73
  # Crie um novo Chroma VectorStore e salve-o no disco:
 
89
  # print(chain.run("Quais são as diferenças entre Newton e Feynman?"))
90
 
91
  while True:
92
+ #print(chain.run(input("\033[033mO que você gostaria de saber? 🤓\033[m ")))
93
+ query = input("\033[033mUsuário:\033[m ")
94
+ print("")
95
+ response = chain.run(query)
96
+ print("\033[032mA resposta mais SIMILAR é: \033[m", response)
97
+ print("")
98
+ if not query:
99
+ break
query_PDF_with_OpenAI_LangChain_Faiss.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Scientist.: Dr.Eddy Giusepe Chirinos Isidro
3
+
4
+ Link de estudo --> https://cloudatlas.me/query-your-pdfs-with-openai-langchain-and-faiss-7e8221791c62
5
+ """
6
+ # Substitua sua chave de API OpenAI:
7
+ import openai
8
+ import os
9
+ from dotenv import load_dotenv, find_dotenv
10
+ _ = load_dotenv(find_dotenv()) # read local .env file
11
+ openai.api_key = os.environ['OPENAI_API_KEY']
12
+
13
+ from pypdf import PdfReader
14
+ from langchain.text_splitter import CharacterTextSplitter
15
+
16
+
17
+ doc_reader = PdfReader("/home/eddygiusepe/1_Eddy_Giusepe/6_REPO_HuggingFace/12_LangChain_Router_Chains_and_other_stuff_too/docs/spacy_teste.pdf")
18
+
19
+ raw_text = ''
20
+ for i, page in enumerate(doc_reader.pages):
21
+ text = page.extract_text()
22
+ if text:
23
+ raw_text += text
24
+ #print(raw_text)
25
+ print("")
26
+ print(len(raw_text))
27
+
28
+ # Splitting into smaller chunks:
29
+ text_splitter = CharacterTextSplitter(separator = "\n",
30
+ chunk_size = 1000,
31
+ chunk_overlap = 200,
32
+ length_function = len,
33
+ )
34
+
35
+ texts = text_splitter.split_text(raw_text)
36
+ #print(texts)
37
+
38
+ # Normalize e limpe o texto para incorporações:
39
+ import re
40
+ def normalize_text(eddy_text, sep_token = "\n"):
41
+ eddy_text = re.sub(r'\s+', ' ', eddy_text).strip()
42
+ eddy_text = re.sub(r". ,", "", eddy_text)
43
+ # Remover todas as instancias de múltiplos espaços
44
+ eddy_text = eddy_text.replace("..", ".")
45
+ eddy_text = eddy_text.replace(". .", ".")
46
+ eddy_text = eddy_text.replace("\n", "")
47
+ eddy_text = eddy_text.strip()
48
+ return eddy_text
49
+
50
+ texts = list(map(normalize_text, texts))
51
+ #print(texts)
52
+
53
+
54
+ from langchain.vectorstores import FAISS
55
+ from langchain.embeddings import OpenAIEmbeddings
56
+
57
+ embeddings = OpenAIEmbeddings()
58
+ docsearch = FAISS.from_texts(texts, embeddings)
59
+ docsearch.embedding_function
60
+
61
+ # Cadeia (chain) LangChain:
62
+ from langchain.chains.question_answering import load_qa_chain
63
+ from langchain.llms import OpenAI
64
+
65
+ chain = load_qa_chain(OpenAI(), chain_type="stuff")
66
+
67
+ # Testando, queries:
68
+ query = "Qual é o objetivo do problema de classificação" #"O que é entropia?"
69
+ docs = docsearch.similarity_search(query, k=3)
70
+
71
+ response = chain.run(input_documents=docs, question=query)
72
+ print(response)