myn0908 committed
Commit bad3833
1 Parent(s): e44f2dc

optimize code
README.md CHANGED
@@ -43,4 +43,7 @@ After that, you can starting chat with your custom bot about the topic in your U
 
 The vector index storage by this structure:
 
-![Screenshot 2023-11-13 at 20.03.04.png](..%2F..%2F..%2F..%2Fvar%2Ffolders%2Fzc%2Fcsmhsgrd0bz3bbkycljwdk2c0000gn%2FT%2FTemporaryItems%2FNSIRD_screencaptureui_ZTP7r9%2FScreenshot%202023-11-13%20at%2020.03.04.png)
+![Screenshot 2023-11-13 at 20.03.04.png](..%2F..%2F..%2F..%2Fvar%2Ffolders%2Fzc%2Fcsmhsgrd0bz3bbkycljwdk2c0000gn%2FT%2FTemporaryItems%2FNSIRD_screencaptureui_ZTP7r9%2FScreenshot%202023-11-13%20at%2020.03.04.png)
+
+I've already added the data from this page: https://www.presight.io/privacy-policy.html
+You can chat with the bot about that information, or train it on more knowledge.
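For readers who want to see what that looks like in code: a minimal sketch of indexing a page and querying it, adapted from the example comments this commit removes from `bot/web_scrapping/crawler_and_indexer.py` (it assumes an OpenAI key has already been configured via `set_api_key`):

```python
from langchain.chat_models import ChatOpenAI
from bot.web_scrapping.crawler_and_indexer import content_crawler_and_index
from bot.web_scrapping.searchable_index import SearchableIndex

# Crawl the page, save it under learning_documents/, and build or merge the FAISS index
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
prompt = 'show more detail about types of data collected'
idx = content_crawler_and_index(url='https://www.presight.io/privacy-policy.html',
                                llm=llm, prompt=prompt, file_format='txt')

# Query the indexed content, as in the removed example comments
response = SearchableIndex.query(prompt, llm, idx)
print(response)
```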
app.py CHANGED
@@ -1,3 +1,4 @@
+import gradio as gr
 from typing import Optional, Tuple
 from queue import Empty, Queue
 from threading import Thread
@@ -9,15 +10,15 @@ from bot.web_scrapping.default import *
 from langchain.chat_models import ChatOpenAI
 from langchain.prompts import HumanMessagePromptTemplate
 from langchain.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage
-import gradio as gr
 
-set_api_key()
+set_api_key(api_key='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')
 human_message_prompt_template = HumanMessagePromptTemplate.from_template("{text}")
 
 
 def bot_learning(urls, file_formats, llm, prompt, chat_mode=False):
+    index = content_crawler_and_index(url=str(urls), llm=llm, prompt=prompt, file_format=file_formats)
     if chat_mode:
-        return content_crawler_and_index(url=str(urls), llm=llm, prompt=prompt, file_format=file_formats)
+        return index
     else:
         return 'Training Completed'
 
@@ -63,7 +64,6 @@ def chat_start(
         except Empty:
             continue
     messages.append(AIMessage(content=content))
-    logger.info(f"Done!")
    return chat, "", chatbot_messages, messages
 
 
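The `Empty`/`Queue`/`Thread` imports and the `except Empty: continue` loop above are a producer-consumer stream: a worker thread pushes tokens onto a queue while the UI loop drains them into the chat history. A self-contained sketch of that pattern, with a stand-in producer since the real ChatOpenAI streaming callback is not part of this hunk:

```python
from queue import Empty, Queue
from threading import Thread

def stream_tokens(q: Queue):
    # Stand-in producer: the real app pushes tokens from the LLM callback.
    for token in ['Hello', ', ', 'world', '!']:
        q.put(token)
    q.put(None)  # sentinel: generation finished

q = Queue()
Thread(target=stream_tokens, args=(q,)).start()

content = ''
while True:
    try:
        token = q.get(timeout=0.1)
    except Empty:
        continue  # nothing yet; keep polling, as chat_start does
    if token is None:
        break
    content += token
print(content)  # -> Hello, world!
```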
bot/utils/constanst.py CHANGED
@@ -5,4 +5,4 @@ API_KEY = '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'
 
 def set_api_key(api_key=API_KEY):
     os.environ['OPENAI_API_KEY'] = api_key
-    return 'API KEY SUCCESSFULLY'
+    return True
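`set_api_key` now signals success with a boolean instead of a status string, so callers can branch on it directly. Note that the key itself is still hardcoded in `API_KEY`; a variation that reads it from the launch environment instead is sketched below (the variable name `CHATBOT_OPENAI_KEY` is illustrative, not part of the repo):

```python
import os

from bot.utils.constanst import set_api_key

# Hypothetical: supply the key via the environment rather than source code,
# e.g. `export CHATBOT_OPENAI_KEY=sk-...` before launching the app.
key = os.environ.get('CHATBOT_OPENAI_KEY')
if not key:
    raise RuntimeError('CHATBOT_OPENAI_KEY is not set')
if set_api_key(api_key=key):  # returns True once OPENAI_API_KEY is exported
    print('OpenAI API key configured')
```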
bot/web_scrapping/crawler_and_indexer.py CHANGED
@@ -2,22 +2,16 @@ from bs4 import BeautifulSoup
 from urllib import request
 from bot.web_scrapping.searchable_index import SearchableIndex
 from bot.utils.show_log import logger
-from bot.utils.constanst import set_api_key
-import pandas as pd
 import requests
 import os
 
-set_api_key(api_key='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')
-
 
 def save_content_to_file(url=None, text=None, output_folder=None, file_format=None):
     file_path = os.path.join(output_folder, f"combined_content.{file_format}")
 
     write_functions = {
         'txt': lambda: write_text(file_path, text),
-        'pdf': lambda: write_pdf(url, file_path),
-        'csv': lambda: write_csv(file_path, text),
-        'xml': lambda: write_xml(file_path, text)
+        'pdf': lambda: write_pdf(url, file_path)
     }
 
     write_function = write_functions.get(file_format)
@@ -40,17 +34,6 @@ def write_pdf(url, file_path):
     request.urlretrieve(url, file_path)
 
 
-def write_csv(file_path, text):
-    df = pd.DataFrame({'Content': [t.text for t in text]})
-    df.to_csv(file_path, mode='a', index=False, header=False)
-
-
-def write_xml(file_path, text):
-    xml_content = ''.join([f'<item>{t.text}</item>' for t in text])
-    with open(file_path, "a", encoding="utf-8") as file:
-        file.write(xml_content)
-
-
 def content_crawler_and_index(url, llm, prompt, file_format='txt', output_folder='learning_documents'):
     if url == 'NO_URL':
         file_path = output_folder
@@ -74,15 +57,3 @@ def content_crawler_and_index(url, llm, prompt, file_format='txt', output_folder
 
 if __name__ == '__main__':
     pass
-    # Example usage:
-    # First URL
-    # idx = content_crawler_and_index("https://www.presight.io/terms-of-use.html", file_format='txt')
-    #
-    # Second URL (appends content to existing files)
-    # idx = content_crawler_and_index(url='https://arxiv.org/pdf/2309.11235v1.pdf', file_format='pdf')
-    # # example get response chatbot
-    # prompt = 'explain the paper'
-    # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
-    # response = SearchableIndex.query(prompt, llm, idx)
-    # print(response)
-    # logger.info(response)
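After this commit, `save_content_to_file` dispatches on file format through a dict of lambdas, with only `'txt'` and `'pdf'` kept. A condensed, self-contained illustration of that dispatch pattern; the `write_text` stand-in here is simplified, not the module's actual implementation:

```python
import os

def write_text(file_path, text):
    # Simplified stand-in: append plain text to the combined file
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(text)

def save_content_to_file(text, output_folder='learning_documents', file_format='txt'):
    file_path = os.path.join(output_folder, f"combined_content.{file_format}")
    # Map each supported format to a writer; unsupported formats fall through to None
    write_functions = {
        'txt': lambda: write_text(file_path, text),
        # 'pdf' would call write_pdf(url, file_path); csv/xml were dropped in this commit
    }
    write_function = write_functions.get(file_format)
    if write_function is None:
        raise ValueError(f'Unsupported file format: {file_format}')
    write_function()
    return file_path

os.makedirs('learning_documents', exist_ok=True)
print(save_content_to_file('some crawled text'))
```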
bot/web_scrapping/searchable_index.py CHANGED
@@ -1,19 +1,13 @@
 from langchain.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.document_loaders import (
-    PyPDFLoader,
-    DataFrameLoader,
-)
-from langchain.document_loaders.csv_loader import CSVLoader
+from langchain.document_loaders import PyPDFLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.chains.retrieval_qa.base import RetrievalQA
 from langchain.chat_models import ChatOpenAI
 from bot.utils.show_log import logger
-import pandas as pd
 import threading
 import glob
 import os
-import asyncio
 import queue
 
 
@@ -24,7 +18,6 @@ class Query:
         self.index = index
 
     def query(self):
-        """Query the vectorstore."""
         llm = self.llm or ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
         chain = RetrievalQA.from_chain_type(
             llm, retriever=self.index.as_retriever()
@@ -37,7 +30,7 @@ class SearchableIndex:
         self.path = path
 
     @classmethod
-    def get_splits(cls, path, target_col=None, sheet_name=None):
+    def get_splits(cls, path):
         extension = os.path.splitext(path)[1].lower()
         doc_list = None
         if extension == ".txt":
@@ -57,19 +50,12 @@ class SearchableIndex:
             for pg in pages:
                 pg_splits = text_split.split_text(pg.page_content)
                 doc_list.extend(pg_splits)
-        elif extension == ".xml":
-            df = pd.read_excel(io=path, engine='openpyxl', sheet_name=sheet_name)
-            df_loader = DataFrameLoader(df, page_content_column=target_col)
-            doc_list = df_loader.load()
-        elif extension == ".csv":
-            csv_loader = CSVLoader(path)
-            doc_list = csv_loader.load()
         if doc_list is None:
             raise ValueError("Unsupported file format")
         return doc_list
 
     @classmethod
-    def merge_or_create_index(cls, index_store, faiss_db, embeddings, logger):
+    def merge_or_create_index(cls, index_store, faiss_db, embeddings, loggers):
         if os.path.exists(index_store):
             local_db = FAISS.load_local(index_store, embeddings)
             local_db.merge_from(faiss_db)
@@ -79,39 +65,35 @@ class SearchableIndex:
             operation_info = "New store creation"
 
         local_db.save_local(index_store)
-        logger.info(f"{operation_info} index completed")
+        loggers.info(f"{operation_info} index completed")
         return local_db
 
     @classmethod
-    def load_index(cls, index_files, embeddings, logger):
+    def load_or_check_index(cls, index_files, embeddings, loggers, result_queue):
         if index_files:
-            return FAISS.load_local(index_files[0], embeddings)
-        logger.warning("Index store does not exist")
+            local_db = FAISS.load_local(index_files[0], embeddings)
+            result_queue.put(local_db)
+            return local_db
+        loggers.warning("Index store does not exist")
         return None
 
     @classmethod
-    def check_and_load_index(cls, index_files, embeddings, logger, result_queue):
-        local_db = cls.load_index(index_files, embeddings, logger)
-        result_queue.put(local_db)
-
-    @classmethod
-    def load_index_asynchronously(cls, index_files, embeddings, logger):
+    def load_index_asynchronously(cls, index_files, embeddings, loggers):
         result_queue = queue.Queue()
         thread = threading.Thread(
-            target=cls.check_and_load_index,
-            args=(index_files, embeddings, logger, result_queue)
+            target=cls.load_or_check_index,
+            args=(index_files, embeddings, loggers, result_queue)
         )
         thread.start()
-        thread.join()  # Wait for the thread to finish
        return result_queue.get()
 
     @classmethod
-    def embed_index(cls, url, path, llm, prompt, target_col=None, sheet_name=None):
+    def embed_index(cls, url, path, llm, prompt):
         embeddings = OpenAIEmbeddings()
 
         if path:
             if url != 'NO_URL':
-                doc_list = cls.get_splits(path, target_col, sheet_name)
+                doc_list = cls.get_splits(path)
                 faiss_db = FAISS.from_texts(doc_list, embeddings)
                 index_store = os.path.splitext(path)[0] + "_index"
                 local_db = cls.merge_or_create_index(index_store, faiss_db, embeddings, logger)
@@ -124,10 +106,3 @@
 
 if __name__ == '__main__':
     pass
-    # Examples for search query
-    # index = SearchableIndex.embed_index(
-    #     path="/Users/macbook/Downloads/AI_test_exam/ChatBot/learning_documents/combined_content.txt")
-    # prompt = 'show more detail about types of data collected'
-    # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
-    # result = SearchableIndex.query(prompt, llm=llm, index=index)
-    # print(result)
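One behavioral note on this refactor: `load_index_asynchronously` still blocks, because `result_queue.get()` waits until the worker thread puts a value, so dropping `thread.join()` does not make the call non-blocking. A stripped-down sketch of the pattern, with a stand-in loader in place of `FAISS.load_local`:

```python
import queue
import threading

def load_or_check_index(index_files, result_queue):
    # Stand-in for FAISS.load_local: "load" the first index found, else None
    local_db = f'index loaded from {index_files[0]}' if index_files else None
    result_queue.put(local_db)

def load_index_asynchronously(index_files):
    result_queue = queue.Queue()
    thread = threading.Thread(target=load_or_check_index,
                              args=(index_files, result_queue))
    thread.start()
    # Queue.get() blocks until the worker puts a value, so this call is
    # still effectively synchronous even without thread.join().
    return result_queue.get()

print(load_index_asynchronously(['combined_content_index']))
```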
learning_documents/combined_content_index/index.faiss CHANGED
Binary files a/learning_documents/combined_content_index/index.faiss and b/learning_documents/combined_content_index/index.faiss differ
 
learning_documents/combined_content_index/index.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b0d19bc7afb8227f67225b52afd8c746bc67aceca43fb5e5c84a19e94cda0e9d
-size 3959
+oid sha256:5895a023300d06204d031fa44543d35fa977de8f8808b2e6691775a95ae1ae84
+size 6059
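The `.pkl` and `.faiss` files are tracked with Git LFS, so the diff above is over the pointer file rather than the pickle itself: `oid` is the SHA-256 of the real blob and `size` is its byte length. A small sketch for checking a local file against such a pointer (the local path is hypothetical):

```python
import hashlib
import os

def lfs_pointer_fields(path):
    # Compute the two fields a Git LFS pointer records for a blob:
    # the SHA-256 digest of its contents and its size in bytes.
    with open(path, 'rb') as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    return digest, os.path.getsize(path)

# Hypothetical local copy of the real index.pkl blob
oid, size = lfs_pointer_fields('learning_documents/combined_content_index/index.pkl')
print(f'oid sha256:{oid}')
print(f'size {size}')
```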