Commit: optimize code
README.md CHANGED

```diff
@@ -43,4 +43,7 @@ After that, you can start chatting with your custom bot about the topic in your UI
 
 The vector index is stored with this structure:
 
-![Screenshot 2023-11-13 at 20.03.04.png](..%2F..%2F..%2F..%2Fvar%2Ffolders%2Fzc%2Fcsmhsgrd0bz3bbkycljwdk2c0000gn%2FT%2FTemporaryItems%2FNSIRD_screencaptureui_ZTP7r9%2FScreenshot%202023-11-13%20at%2020.03.04.png)
+![Screenshot 2023-11-13 at 20.03.04.png](..%2F..%2F..%2F..%2Fvar%2Ffolders%2Fzc%2Fcsmhsgrd0bz3bbkycljwdk2c0000gn%2FT%2FTemporaryItems%2FNSIRD_screencaptureui_ZTP7r9%2FScreenshot%202023-11-13%20at%2020.03.04.png)
+
+I've already added the data from this page: https://www.presight.io/privacy-policy.html
+You can check it and chat with the bot about that information, or you can train it with more knowledge.
```
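Once the index exists, chatting amounts to running a retrieval query against it. A minimal sketch, adapted from the example comments this commit removes from `crawler_and_indexer.py` and `searchable_index.py` (the model name and the `SearchableIndex.query(prompt, llm=llm, index=index)` call are taken from those comments; `index` is assumed to come from a prior training run):

```python
from langchain.chat_models import ChatOpenAI
from bot.web_scrapping.searchable_index import SearchableIndex

# Assumes `index` was returned by a prior bot_learning / embed_index call.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
prompt = 'show more detail about types of data collected'
result = SearchableIndex.query(prompt, llm=llm, index=index)
print(result)
```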
app.py CHANGED

```diff
@@ -1,3 +1,4 @@
+import gradio as gr
 from typing import Optional, Tuple
 from queue import Empty, Queue
 from threading import Thread
@@ -9,15 +10,15 @@ from bot.web_scrapping.default import *
 from langchain.chat_models import ChatOpenAI
 from langchain.prompts import HumanMessagePromptTemplate
 from langchain.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage
-import gradio as gr
 
-set_api_key()
+set_api_key(api_key='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')
 human_message_prompt_template = HumanMessagePromptTemplate.from_template("{text}")
 
 
 def bot_learning(urls, file_formats, llm, prompt, chat_mode=False):
+    index = content_crawler_and_index(url=str(urls), llm=llm, prompt=prompt, file_format=file_formats)
     if chat_mode:
-        return
+        return index
     else:
         return 'Training Completed'
 
@@ -63,7 +64,6 @@ def chat_start(
         except Empty:
             continue
     messages.append(AIMessage(content=content))
-    logger.info(f"Done!")
     return chat, "", chatbot_messages, messages
 
 
```
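With this change, `bot_learning` actually crawls and indexes before returning, and chat mode hands the resulting index back to the caller instead of returning nothing. A usage sketch under those assumptions (argument order as in the new signature; the URL is the one named in the README, and the prompt string is illustrative):

```python
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

# Training mode: crawl the page, build or merge the FAISS index, report completion.
status = bot_learning('https://www.presight.io/privacy-policy.html', 'txt',
                      llm, prompt='summarize this page', chat_mode=False)
print(status)  # -> 'Training Completed'

# Chat mode: the same call now returns the index itself for querying.
index = bot_learning('https://www.presight.io/privacy-policy.html', 'txt',
                     llm, prompt='summarize this page', chat_mode=True)
```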
bot/utils/constanst.py CHANGED

```diff
@@ -5,4 +5,4 @@ API_KEY = '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'
 
 def set_api_key(api_key=API_KEY):
     os.environ['OPENAI_API_KEY'] = api_key
-    return
+    return True
```
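Returning `True` lets callers assert that the key was set, but the key itself remains hard-coded in the repository (and is repeated verbatim in `app.py` and `crawler_and_indexer.py`). A common alternative, sketched here under the same function name as a suggestion rather than the file's current behavior, falls back to an environment variable:

```python
import os

def set_api_key(api_key=None):
    # Prefer an explicit argument, otherwise fall back to the environment,
    # so no secret has to live in the source tree.
    key = api_key or os.environ.get('OPENAI_API_KEY')
    if not key:
        raise RuntimeError('OPENAI_API_KEY is not configured')
    os.environ['OPENAI_API_KEY'] = key
    return True
```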
bot/web_scrapping/crawler_and_indexer.py CHANGED

```diff
@@ -2,22 +2,16 @@ from bs4 import BeautifulSoup
 from urllib import request
 from bot.web_scrapping.searchable_index import SearchableIndex
 from bot.utils.show_log import logger
-from bot.utils.constanst import set_api_key
-import pandas as pd
 import requests
 import os
 
-set_api_key(api_key='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')
-
 
 def save_content_to_file(url=None, text=None, output_folder=None, file_format=None):
     file_path = os.path.join(output_folder, f"combined_content.{file_format}")
 
     write_functions = {
         'txt': lambda: write_text(file_path, text),
-        'pdf': lambda: write_pdf(url, file_path)
-        'csv': lambda: write_csv(file_path, text),
-        'xml': lambda: write_xml(file_path, text)
+        'pdf': lambda: write_pdf(url, file_path)
     }
 
     write_function = write_functions.get(file_format)
@@ -40,17 +34,6 @@ def write_pdf(url, file_path):
     request.urlretrieve(url, file_path)
 
 
-def write_csv(file_path, text):
-    df = pd.DataFrame({'Content': [t.text for t in text]})
-    df.to_csv(file_path, mode='a', index=False, header=False)
-
-
-def write_xml(file_path, text):
-    xml_content = ''.join([f'<item>{t.text}</item>' for t in text])
-    with open(file_path, "a", encoding="utf-8") as file:
-        file.write(xml_content)
-
-
 def content_crawler_and_index(url, llm, prompt, file_format='txt', output_folder='learning_documents'):
     if url == 'NO_URL':
         file_path = output_folder
@@ -74,15 +57,3 @@ def content_crawler_and_index(url, llm, prompt, file_format='txt', output_folder
 
 if __name__ == '__main__':
     pass
-    # Example usage:
-    # First URL
-    # idx = content_crawler_and_index("https://www.presight.io/terms-of-use.html", file_format='txt')
-    #
-    # Second URL (appends content to existing files)
-    # idx = content_crawler_and_index(url='https://arxiv.org/pdf/2309.11235v1.pdf', file_format='pdf')
-    # # example get response chatbot
-    # prompt = 'explain the paper'
-    # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
-    # response = SearchableIndex.query(prompt, llm, idx)
-    # print(response)
-    # logger.info(response)
```
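Note that the removed dict literal was missing the comma after the `'pdf'` entry, which is a Python syntax error and a plausible source of the build failures flagged above; the commit resolves it by dropping the `'csv'` and `'xml'` handlers entirely. For reference, the dispatch-table pattern with the separator in place (a sketch; the `write_*` helpers are the ones defined in this file):

```python
def save_content_sketch(url, text, file_path, file_format):
    # Map each format to a zero-argument callable; every entry needs a
    # trailing comma before the next one.
    write_functions = {
        'txt': lambda: write_text(file_path, text),
        'pdf': lambda: write_pdf(url, file_path),
    }
    write_function = write_functions.get(file_format)
    if write_function is None:
        raise ValueError(f'Unsupported file format: {file_format}')
    write_function()
```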
bot/web_scrapping/searchable_index.py CHANGED

```diff
@@ -1,19 +1,13 @@
 from langchain.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.document_loaders import (
-    PyPDFLoader,
-    DataFrameLoader,
-)
-from langchain.document_loaders.csv_loader import CSVLoader
+from langchain.document_loaders import PyPDFLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.chains.retrieval_qa.base import RetrievalQA
 from langchain.chat_models import ChatOpenAI
 from bot.utils.show_log import logger
-import pandas as pd
 import threading
 import glob
 import os
-import asyncio
 import queue
 
 
@@ -24,7 +18,6 @@ class Query:
         self.index = index
 
     def query(self):
-        """Query the vectorstore."""
         llm = self.llm or ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
         chain = RetrievalQA.from_chain_type(
             llm, retriever=self.index.as_retriever()
@@ -37,7 +30,7 @@ class SearchableIndex:
         self.path = path
 
     @classmethod
-    def get_splits(cls, path, target_col=None, sheet_name=None):
+    def get_splits(cls, path):
         extension = os.path.splitext(path)[1].lower()
         doc_list = None
         if extension == ".txt":
@@ -57,19 +50,12 @@ class SearchableIndex:
             for pg in pages:
                 pg_splits = text_split.split_text(pg.page_content)
                 doc_list.extend(pg_splits)
-        elif extension == ".xml":
-            df = pd.read_excel(io=path, engine='openpyxl', sheet_name=sheet_name)
-            df_loader = DataFrameLoader(df, page_content_column=target_col)
-            doc_list = df_loader.load()
-        elif extension == ".csv":
-            csv_loader = CSVLoader(path)
-            doc_list = csv_loader.load()
         if doc_list is None:
             raise ValueError("Unsupported file format")
         return doc_list
 
     @classmethod
-    def merge_or_create_index(cls, index_store, faiss_db, embeddings, logger):
+    def merge_or_create_index(cls, index_store, faiss_db, embeddings, loggers):
         if os.path.exists(index_store):
             local_db = FAISS.load_local(index_store, embeddings)
             local_db.merge_from(faiss_db)
@@ -79,39 +65,35 @@ class SearchableIndex:
             operation_info = "New store creation"
 
         local_db.save_local(index_store)
-
+        loggers.info(f"{operation_info} index completed")
         return local_db
 
     @classmethod
-    def load_index(cls, index_files, embeddings, logger):
+    def load_or_check_index(cls, index_files, embeddings, loggers, result_queue):
         if index_files:
-            local_db = FAISS.load_local(index_files[0], embeddings)
-            return local_db
+            local_db = FAISS.load_local(index_files[0], embeddings)
+            result_queue.put(local_db)
+            return local_db
+        loggers.warning("Index store does not exist")
         return None
 
     @classmethod
-    def …(cls, index_files, embeddings, logger, result_queue):
-        local_db = cls.load_index(index_files, embeddings, logger)
-        result_queue.put(local_db)
-
-    @classmethod
-    def load_index_asynchronously(cls, index_files, embeddings, logger):
+    def load_index_asynchronously(cls, index_files, embeddings, loggers):
         result_queue = queue.Queue()
         thread = threading.Thread(
-            target=cls.…,
-            args=(index_files, embeddings, logger, result_queue)
+            target=cls.load_or_check_index,
+            args=(index_files, embeddings, loggers, result_queue)
         )
         thread.start()
-        thread.join()  # Wait for the thread to finish
         return result_queue.get()
 
     @classmethod
-    def embed_index(cls, url, path, llm, prompt, target_col=None, sheet_name=None):
+    def embed_index(cls, url, path, llm, prompt):
         embeddings = OpenAIEmbeddings()
 
         if path:
             if url != 'NO_URL':
-                doc_list = cls.get_splits(path, target_col, sheet_name)
+                doc_list = cls.get_splits(path)
                 faiss_db = FAISS.from_texts(doc_list, embeddings)
                 index_store = os.path.splitext(path)[0] + "_index"
                 local_db = cls.merge_or_create_index(index_store, faiss_db, embeddings, logger)
@@ -124,10 +106,3 @@ class SearchableIndex:
 
 if __name__ == '__main__':
     pass
-    # Examples for search query
-    # index = SearchableIndex.embed_index(
-    #     path="/Users/macbook/Downloads/AI_test_exam/ChatBot/learning_documents/combined_content.txt")
-    # prompt = 'show more detail about types of data collected'
-    # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
-    # result = SearchableIndex.query(prompt, llm=llm, index=index)
-    # print(result)
```
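After this change, `load_index_asynchronously` blocks on `result_queue.get()` instead of `thread.join()`, so the call is still effectively synchronous: the worker thread just hands its result back through a queue. The pattern in isolation, as a generic sketch (the names here are illustrative, not from the repo):

```python
import queue
import threading

def run_in_worker(fn, *args):
    # Run fn(*args) in a worker thread and block until it posts a result.
    result_queue = queue.Queue()

    def worker():
        result_queue.put(fn(*args))

    threading.Thread(target=worker, daemon=True).start()
    # get() blocks until the worker calls put(), so an explicit join()
    # is not needed just to retrieve the value.
    return result_queue.get()

print(run_in_worker(sum, [1, 2, 3]))  # -> 6
```

A genuinely asynchronous variant would return the queue (or a future) and let the caller collect the result later instead of blocking immediately.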
learning_documents/combined_content_index/index.faiss CHANGED
Binary files a/learning_documents/combined_content_index/index.faiss and b/learning_documents/combined_content_index/index.faiss differ
learning_documents/combined_content_index/index.pkl CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5895a023300d06204d031fa44543d35fa977de8f8808b2e6691775a95ae1ae84
+size 6059
```