myn0908 committed on
Commit d97a6fa • 1 Parent(s): def188e

own knowledge gpt
.gitignore ADDED
@@ -0,0 +1,3 @@
+ .idea
+ .DS_Store
+ __pycache__
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
- title: Own Knowledge GPT
- emoji: 💻
- colorFrom: red
- colorTo: yellow
+ title: Presight GPT
+ emoji: 🚀
+ colorFrom: indigo
+ colorTo: red
  sdk: gradio
  sdk_version: 4.2.0
  app_file: app.py
  pinned: false
+ license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,194 @@
+ from typing import List, Optional, Tuple
+ from queue import Empty, Queue
+ from threading import Thread
+ from bot.web_scrapping.crawler_and_indexer import content_crawler_and_index
+ from bot.web_scrapping.searchable_index import SearchableIndex
+ from bot.utils.callbacks import QueueCallback
+ from bot.utils.constanst import set_api_key
+ from bot.utils.show_log import logger
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import HumanMessagePromptTemplate
+ from langchain.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage
+ import gradio as gr
+
+ set_api_key()
+ MODELS_NAMES = ["gpt-3.5-turbo"]
+ DEFAULT_TEMPERATURE = 0.7
+
+ ChatHistory = List[str]
+
+ default_system_prompt = 'Put your prompt here'
+ default_system_format = 'txt'
+ human_message_prompt_template = HumanMessagePromptTemplate.from_template("{text}")
+
+
+ def learning_feedbacks():
+     return 'Training Completed'
+
+
+ def bot_learning(urls, file_formats, chat_mode=False):
+     index = content_crawler_and_index(url=str(urls), file_format=file_formats)
+     if chat_mode:
+         return index
+     else:
+         fb = learning_feedbacks()
+         return fb
+
+
+ def chat_start(
+         chat: Optional[ChatOpenAI],
+         message: str,
+         chatbot_messages: ChatHistory,
+         messages: List[BaseMessage], ) -> Tuple[str, str, ChatOpenAI, ChatHistory, List[BaseMessage]]:
+     if not chat:
+         queue = Queue()
+         chat = ChatOpenAI(
+             model_name=MODELS_NAMES[0],
+             temperature=DEFAULT_TEMPERATURE,
+             streaming=True,
+             callbacks=([QueueCallback(queue)])
+         )
+     else:
+         queue = chat.callbacks[0].queue
+
+     job_done = object()
+     messages.append(HumanMessage(content=f':{message}'))
+     chatbot_messages.append((message, ""))
+     index = bot_learning(urls='NO_URL', file_formats='txt', chat_mode=True)
+
+     def query_retrieval():
+         response = SearchableIndex.query(message, chat, index)
+         chatbot_message = AIMessage(content=response)
+         messages.append(chatbot_message)
+         queue.put(job_done)
+
+     t = Thread(target=query_retrieval)
+     t.start()
+     content = ""
+     while True:
+         try:
+             next_token = queue.get(True, timeout=1)
+             if next_token is job_done:
+                 break
+             content += next_token
+             chatbot_messages[-1] = (message, content)
+             yield chat, "", chatbot_messages, messages
+         except Empty:
+             continue
+     messages.append(AIMessage(content=content))
+     logger.info(f"Done!")
+     return chat, "", chatbot_messages, messages
+
+
+ def system_prompt_handler(value: str) -> str:
+     return value
+
+
+ def on_clear_button_click(system_prompt: str) -> Tuple[str, List, List]:
+     return "", [], [SystemMessage(content=system_prompt)]
+
+
+ def on_apply_settings_button_click(
+         system_prompt: str, model_name: str, temperature: float
+ ):
+     logger.info(
+         f"Applying settings: model_name={model_name}, temperature={temperature}"
+     )
+     chat = ChatOpenAI(
+         model_name=model_name,
+         temperature=temperature,
+         streaming=True,
+         callbacks=[QueueCallback(Queue())],
+         max_tokens=1000,
+     )
+     chat.callbacks[0].queue.empty()
+     return chat, *on_clear_button_click(system_prompt)
+
+
+ with gr.Blocks() as demo:
+     system_prompt = gr.State(default_system_prompt)
+     messages = gr.State([SystemMessage(content=default_system_prompt)])
+     chat = gr.State(None)
+
+     with gr.Column(elem_id="col_container"):
+         gr.Markdown("# Welcome to OWN-GPT! 🤖")
+         gr.Markdown(
+             "Demo Chat Bot Platform"
+         )
+
+         chatbot = gr.Chatbot()
+         with gr.Column():
+             message = gr.Textbox(label="Type some message")
+             message.submit(
+                 chat_start,
+                 [chat, message, chatbot, messages],
+                 [chat, message, chatbot, messages],
+                 queue=True,
+             )
+             message_button = gr.Button("Submit", variant="primary")
+             message_button.click(
+                 chat_start,
+                 [chat, message, chatbot, messages],
+                 [chat, message, chatbot, messages],
+             )
+         with gr.Column():
+             learning_status = gr.Textbox(label='Training Status')
+             url = gr.Textbox(label="URL to Documents")
+             file_format = gr.Textbox(label="Set your file format:", placeholder='Example: pdf, txt')
+             url.submit(
+                 bot_learning,
+                 [url, file_format],
+                 [learning_status]
+             )
+             training_button = gr.Button("Training", variant="primary")
+             training_button.click(
+                 bot_learning,
+                 [url, file_format],
+                 [learning_status]
+             )
+         with gr.Row():
+             with gr.Column():
+                 clear_button = gr.Button("Clear")
+                 clear_button.click(
+                     on_clear_button_click,
+                     [system_prompt],
+                     [message, chatbot, messages],
+                     queue=False,
+                 )
+             with gr.Accordion("Settings", open=False):
+                 model_name = gr.Dropdown(
+                     choices=MODELS_NAMES, value=MODELS_NAMES[0], label="model"
+                 )
+                 temperature = gr.Slider(
+                     minimum=0.0,
+                     maximum=1.0,
+                     value=0.7,
+                     step=0.1,
+                     label="temperature",
+                     interactive=True,
+                 )
+                 apply_settings_button = gr.Button("Apply")
+                 apply_settings_button.click(
+                     on_apply_settings_button_click,
+                     [system_prompt, model_name, temperature],
+                     [chat, message, chatbot, messages],
+                 )
+
+         with gr.Column():
+             system_prompt_area = gr.TextArea(
+                 default_system_prompt, lines=4, label="prompt", interactive=True
+             )
+             system_prompt_area.input(
+                 system_prompt_handler,
+                 inputs=[system_prompt_area],
+                 outputs=[system_prompt],
+             )
+             system_prompt_button = gr.Button("Set")
+             system_prompt_button.click(
+                 on_apply_settings_button_click,
+                 [system_prompt, model_name, temperature],
+                 [chat, message, chatbot, messages],
+             )
+
+ demo.queue()
+ demo.launch()
bot/utils/callbacks.py ADDED
@@ -0,0 +1,17 @@
+ from queue import Queue
+ from typing import Any
+
+ from langchain.callbacks.base import BaseCallbackHandler
+
+
+ class QueueCallback(BaseCallbackHandler):
+     """Callback handler for streaming LLM responses to a queue."""
+
+     def __init__(self, queue: Queue):
+         self.queue = queue
+
+     def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+         self.queue.put(token)
+
+     def on_llm_end(self, *args, **kwargs: Any) -> None:
+         return self.queue.empty()
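
The handler above only pushes tokens onto a Queue; the consuming side lives in chat_start in app.py. A minimal sketch of the same producer/consumer pattern in isolation (the prompt text, timeout, and sentinel object are illustrative, and an OPENAI_API_KEY is assumed to be set in the environment):

from queue import Empty, Queue
from threading import Thread

from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

from bot.utils.callbacks import QueueCallback

token_queue = Queue()
job_done = object()  # sentinel marking the end of the stream
chat = ChatOpenAI(streaming=True, callbacks=[QueueCallback(token_queue)])


def produce():
    # QueueCallback.on_llm_new_token pushes each streamed token into token_queue
    chat([HumanMessage(content="Say hello")])
    token_queue.put(job_done)


Thread(target=produce).start()

answer = ""
while True:
    try:
        token = token_queue.get(True, timeout=1)
        if token is job_done:
            break
        answer += token
        print(token, end="", flush=True)
    except Empty:
        continue

Running the LLM call on a worker thread is what lets the Gradio generator in chat_start keep yielding partial output while tokens arrive.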
bot/utils/constanst.py ADDED
@@ -0,0 +1,8 @@
+ import os
+
+ API_KEY = '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'
+
+
+ def set_api_key(api_key=API_KEY):
+     os.environ['OPENAI_API_KEY'] = api_key
+     return 'API KEY SUCCESSFULLY'
bot/utils/show_log.py ADDED
@@ -0,0 +1,12 @@
+ import logging
+
+ # Configure logging to display in terminal only
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ # Create a stream handler to output to the terminal
+ stream_handler = logging.StreamHandler()
+ stream_handler.setLevel(logging.INFO)
+
+ # Get the root logger and add the stream handler
+ logger = logging.getLogger()
+ logger.addHandler(stream_handler)
bot/web_scrapping/crawler_and_indexer.py ADDED
@@ -0,0 +1,85 @@
+ from bs4 import BeautifulSoup
+ from urllib import request
+ from bot.web_scrapping.searchable_index import SearchableIndex
+ from bot.utils.show_log import logger
+ from bot.utils.constanst import set_api_key
+ import pandas as pd
+ import requests
+ import os
+
+ set_api_key(api_key='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')
+
+
+ def save_content_to_file(url=None, text=None, output_folder=None, file_format=None):
+     file_path = os.path.join(output_folder, f"combined_content.{file_format}")
+     if file_format == 'txt':
+         with open(f"{file_path}", "a", encoding="utf-8") as file:
+             for t in text:
+                 file.write(f'{t.text}\n')
+         logger.info(f"Content appended to {file_path}")
+     elif file_format == 'pdf':
+         request.urlretrieve(url, file_path)
+         logger.info(f"Content appended to {file_path}")
+     elif file_format == 'csv':
+         df = pd.DataFrame({'Content': [t.text for t in text]})
+         df.to_csv(f"{file_path}", mode='a', index=False, header=False)
+         logger.info(f"Content appended to {file_path}")
+     elif file_format == 'xml':
+         xml_content = ''.join([f'<item>{t.text}</item>' for t in text])
+         with open(f"{file_path}", "a", encoding="utf-8") as file:
+             file.write(xml_content)
+         logger.info(f"Content appended to {file_path}")
+     else:
+         logger.warning("Invalid file format. Supported formats: txt, pdf, csv, xml")
+     return file_path
+
+
+ def content_crawler_and_index(url, file_format='txt', output_folder='learning_documents'):
+     if url != 'NO_URL':
+         # Send an HTTP GET request to the URL
+         responses = requests.get(url)
+         # Check if the request was successful
+         if responses.status_code == 200:
+             # Create output folder if it doesn't exist
+             if not os.path.exists(output_folder):
+                 os.makedirs(output_folder)
+             # Parse the HTML content using BeautifulSoup
+             soup = BeautifulSoup(responses.text, "html.parser")
+             text = soup.find_all(['h2', 'p', 'i', 'ul'])
+             if text:
+                 # Save content based on the specified file format
+                 file_path = save_content_to_file(text=text, output_folder=output_folder, file_format=file_format)
+
+                 # Create or update the index
+                 index = SearchableIndex.embed_index(url, file_path)
+                 if os.path.isfile(file_path):
+                     os.remove(file_path)
+                 return index
+             else:
+                 file_path = save_content_to_file(url=url, output_folder=output_folder, file_format=file_format)
+                 index = SearchableIndex.embed_index(url, file_path)
+                 if os.path.isfile(file_path):
+                     os.remove(file_path)
+                 return index
+
+         else:
+             logger.warning("Failed to retrieve content from the URL.")
+     else:
+         index = SearchableIndex.embed_index(url=url, path=output_folder)
+         return index
+
+
+ if __name__ == '__main__':
+     pass
+     # Example usage:
+     # First URL
+     # idx = content_crawler_and_index("https://www.presight.io/terms-of-use.html", file_format='txt')
+     #
+     # Second URL (appends content to existing files)
+     # idx = content_crawler_and_index(url='https://arxiv.org/pdf/2309.11235v1.pdf', file_format='pdf')
+     # # example get response chatbot
+     # prompt = 'explain the paper'
+     # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
+     # response = SearchableIndex.query(prompt, llm, idx)
+     # print(response)
+     # logger.info(response)
bot/web_scrapping/searchable_index.py ADDED
@@ -0,0 +1,148 @@
+ from langchain.vectorstores import FAISS
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.document_loaders import (
+     PyPDFLoader,
+     DataFrameLoader,
+ )
+ from langchain.document_loaders.csv_loader import CSVLoader
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.chains.retrieval_qa.base import RetrievalQA
+ from langchain.chat_models import ChatOpenAI
+ from bot.utils.show_log import logger
+ import pandas as pd
+ import threading
+ import glob
+ import os
+ import queue
+
+
+ class SearchableIndex:
+     def __init__(self, path):
+         self.path = path
+
+     def get_text_splits(self):
+         with open(self.path, 'r') as txt:
+             data = txt.read()
+
+         text_split = RecursiveCharacterTextSplitter(chunk_size=1000,
+                                                     chunk_overlap=0,
+                                                     length_function=len)
+         doc_list = text_split.split_text(data)
+         return doc_list
+
+     def get_pdf_splits(self):
+         loader = PyPDFLoader(self.path)
+         pages = loader.load_and_split()
+         text_split = RecursiveCharacterTextSplitter(chunk_size=1000,
+                                                     chunk_overlap=0,
+                                                     length_function=len)
+         doc_list = []
+         for pg in pages:
+             pg_splits = text_split.split_text(pg.page_content)
+             doc_list.extend(pg_splits)
+         return doc_list
+
+     def get_xml_splits(self, target_col, sheet_name):
+         df = pd.read_excel(io=self.path,
+                            engine='openpyxl',
+                            sheet_name=sheet_name)
+
+         df_loader = DataFrameLoader(df,
+                                     page_content_column=target_col)
+
+         excel_docs = df_loader.load()
+
+         return excel_docs
+
+     def get_csv_splits(self):
+         csv_loader = CSVLoader(self.path)
+         csv_docs = csv_loader.load()
+         return csv_docs
+
+     @classmethod
+     def merge_or_create_index(cls, index_store, faiss_db, embeddings, logger):
+         if os.path.exists(index_store):
+             local_db = FAISS.load_local(index_store, embeddings)
+             local_db.merge_from(faiss_db)
+             logger.info("Merge index completed")
+             local_db.save_local(index_store)
+             return local_db
+         else:
+             faiss_db.save_local(folder_path=index_store)
+             logger.info("New store created and loaded...")
+             local_db = FAISS.load_local(index_store, embeddings)
+             return local_db
+
+     @classmethod
+     def check_and_load_index(cls, index_files, embeddings, logger, path, result_queue):
+         if index_files:
+             local_db = FAISS.load_local(index_files[0], embeddings)
+             file_to_remove = os.path.join(path, 'combined_content.txt')
+             if os.path.exists(file_to_remove):
+                 os.remove(file_to_remove)
+         else:
+             raise logger.warning("Index store does not exist")
+         result_queue.put(local_db)  # Put the result in the queue
+
+     @classmethod
+     def embed_index(cls, url, path, target_col=None, sheet_name=None):
+         embeddings = OpenAIEmbeddings()
+
+         def process_docs(queues, extension):
+             nonlocal doc_list
+             instance = cls(path)
+             if extension == ".txt":
+                 doc_list = instance.get_text_splits()
+             elif extension == ".pdf":
+                 doc_list = instance.get_pdf_splits()
+             elif extension == ".xml":
+                 doc_list = instance.get_xml_splits(target_col, sheet_name)
+             elif extension == ".csv":
+                 doc_list = instance.get_csv_splits()
+             else:
+                 doc_list = None
+             queues.put(doc_list)
+
+         if url != 'NO_URL' and path:
+             file_extension = os.path.splitext(path)[1].lower()
+             data_queue = queue.Queue()
+             thread = threading.Thread(target=process_docs, args=(data_queue, file_extension))
+             thread.start()
+             doc_list = data_queue.get()
+             if not doc_list:
+                 raise ValueError("Unsupported file format")
+
+             faiss_db = FAISS.from_texts(doc_list, embeddings)
+             index_store = os.path.splitext(path)[0] + "_index"
+             local_db = cls.merge_or_create_index(index_store, faiss_db, embeddings, logger)
+             return local_db, index_store
+         elif url == 'NO_URL' and path:
+             index_files = glob.glob(os.path.join(path, '*_index'))
+
+             result_queue = queue.Queue()  # Create a queue to store the result
+
+             thread = threading.Thread(target=cls.check_and_load_index,
+                                       args=(index_files, embeddings, logger, path, result_queue))
+             thread.start()
+             local_db = result_queue.get()  # Retrieve the result from the queue
+             return local_db
+
+     @classmethod
+     def query(cls, question: str, llm, index):
+         """Query the vectorstore."""
+         llm = llm or ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
+         chain = RetrievalQA.from_chain_type(
+             llm, retriever=index.as_retriever()
+         )
+         return chain.run(question)
+
+
+ if __name__ == '__main__':
+     pass
+     # Examples for search query
+     # index = SearchableIndex.embed_index(
+     #     path="/Users/macbook/Downloads/AI_test_exam/ChatBot/learning_documents/combined_content.txt")
+     # prompt = 'show more detail about types of data collected'
+     # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
+     # result = SearchableIndex.query(prompt, llm=llm, index=index)
+     # print(result)
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import pandas as pd
5
+ from fpdf import FPDF
6
+
7
+
8
+ def content_crawler(url, file_format='txt', output_file='privacy_policy'):
9
+ # Send an HTTP GET request to the URL
10
+ response = requests.get(url)
11
+
12
+ # Check if the request was successful
13
+ if response.status_code == 200:
14
+ # Parse the HTML content using BeautifulSoup
15
+ soup = BeautifulSoup(response.text, "html.parser")
16
+ text = soup.find_all(['h2', 'p', 'i', 'ul'])
17
+
18
+ # Create output folder if it doesn't exist
19
+ if not os.path.exists('../learning_documents'):
20
+ os.makedirs('../learning_documents')
21
+
22
+ # Save content based on the specified file format
23
+ output_path = os.path.join('../learning_documents', output_file)
24
+
25
+ if file_format == 'txt':
26
+ with open(f"{output_path}.txt", "w", encoding="utf-8") as file:
27
+ for t in text:
28
+ file.write(f'{t.text}\n')
29
+ print(f"Content saved to {output_path}.txt")
30
+ elif file_format == 'pdf':
31
+ pdf = FPDF()
32
+ pdf.set_auto_page_break(auto=True, margin=15)
33
+ pdf.add_page()
34
+ pdf.set_font("Arial", "B", 8)
35
+ for t in text:
36
+ pdf.cell(0, 10, t.text, ln=True)
37
+ pdf.output(f"{output_path}.pdf")
38
+ print(f"Content saved to {output_path}.pdf")
39
+ elif file_format == 'csv':
40
+ df = pd.DataFrame({'Content': [t.text for t in text]})
41
+ df.to_csv(f"{output_path}.csv", index=False)
42
+ print(f"Content saved to {output_path}.csv")
43
+ elif file_format == 'xml':
44
+ xml_content = ''.join([f'<item>{t.text}</item>' for t in text])
45
+ with open(f"{output_path}.xml", "w", encoding="utf-8") as file:
46
+ file.write(f'<root>{xml_content}</root>')
47
+ print(f"Content saved to {output_path}.xml")
48
+ else:
49
+ print("Invalid file format. Supported formats: txt, pdf, csv, xml")
50
+ else:
51
+ print("Failed to retrieve content from the URL.")
52
+
53
+
54
+ if __name__ == '__main__':
55
+ pass
56
+ # Example usage:
57
+ # content_crawler("https://www.presight.io/privacy-policy.html", file_format='pdf', output_file='privacy_policy')
requirements.txt ADDED
@@ -0,0 +1,166 @@
+ aiofiles==23.2.1
+ aiohttp==3.8.6
+ aiosignal==1.3.1
+ altair==5.1.2
+ annotated-types==0.6.0
+ anyio==3.7.1
+ async-timeout==4.0.3
+ attrs==23.1.0
+ backoff==2.2.1
+ bcrypt==4.0.1
+ beautifulsoup4==4.12.2
+ cachetools==5.3.2
+ certifi==2023.7.22
+ chardet==5.2.0
+ charset-normalizer==3.3.2
+ chroma-hnswlib==0.7.3
+ chromadb==0.4.16
+ ci-info==0.3.0
+ click==8.1.7
+ colorama==0.4.6
+ coloredlogs==15.0.1
+ configobj==5.0.8
+ configparser==6.0.0
+ contourpy==1.2.0
+ cycler==0.12.1
+ dataclasses-json==0.6.2
+ Deprecated==1.2.14
+ emoji==2.8.0
+ etelemetry==0.3.1
+ exceptiongroup==1.1.3
+ faiss-cpu==1.7.4
+ fastapi==0.104.1
+ ffmpy==0.3.1
+ filelock==3.13.1
+ filetype==1.2.0
+ flatbuffers==23.5.26
+ fonttools==4.44.0
+ fpdf==1.7.2
+ frozenlist==1.4.0
+ fsspec==2023.10.0
+ future==0.18.3
+ google-auth==2.23.4
+ googleapis-common-protos==1.61.0
+ gradio==3.45.2
+ gradio_client==0.5.3
+ grpcio==1.59.2
+ h11==0.14.0
+ httpcore==1.0.2
+ httplib2==0.22.0
+ httptools==0.6.1
+ httpx==0.25.1
+ huggingface-hub==0.17.3
+ humanfriendly==10.0
+ idna==3.4
+ importlib-metadata==6.8.0
+ importlib-resources==6.1.1
+ install==1.3.5
+ isodate==0.6.1
+ Jinja2==3.1.2
+ joblib==1.3.2
+ jsonpatch==1.33
+ jsonpointer==2.4
+ jsonschema==4.19.2
+ jsonschema-specifications==2023.7.1
+ kiwisolver==1.4.5
+ kubernetes==28.1.0
+ langchain==0.0.334
+ langdetect==1.0.9
+ langsmith==0.0.63
+ looseversion==1.3.0
+ lxml==4.9.3
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ marshmallow==3.20.1
+ matplotlib==3.8.1
+ mdurl==0.1.2
+ monotonic==1.6
+ mpmath==1.3.0
+ multidict==6.0.4
+ mypy-extensions==1.0.0
+ networkx==3.2.1
+ nibabel==5.1.0
+ nipype==1.8.6
+ nltk==3.8.1
+ numpy==1.26.1
+ oauthlib==3.2.2
+ onnxruntime==1.16.2
+ openai==0.27.3
+ opentelemetry-api==1.21.0
+ opentelemetry-exporter-otlp-proto-common==1.21.0
+ opentelemetry-exporter-otlp-proto-grpc==1.21.0
+ opentelemetry-proto==1.21.0
+ opentelemetry-sdk==1.21.0
+ opentelemetry-semantic-conventions==0.42b0
+ orjson==3.9.10
+ overrides==7.4.0
+ packaging==23.2
+ pandas==2.1.2
+ pathlib==1.0.1
+ pdfminer==20191125
+ Pillow==10.1.0
+ posthog==3.0.2
+ protobuf==4.25.0
+ prov==2.0.0
+ pulsar-client==3.3.0
+ pyasn1==0.5.0
+ pyasn1-modules==0.3.0
+ pycryptodome==3.19.0
+ pydantic==2.4.2
+ pydantic_core==2.10.1
+ pydot==1.4.2
+ pydub==0.25.1
+ Pygments==2.16.1
+ pyparsing==3.1.1
+ pypdf==3.17.0
+ PyPDF2==3.0.1
+ PyPika==0.48.9
+ python-dateutil==2.8.2
+ python-dotenv==1.0.0
+ python-iso639==2023.6.15
+ python-magic==0.4.27
+ python-multipart==0.0.6
+ pytz==2023.3.post1
+ pyxnat==1.6
+ PyYAML==6.0.1
+ rapidfuzz==3.5.2
+ rdflib==7.0.0
+ referencing==0.30.2
+ regex==2023.10.3
+ requests==2.31.0
+ requests-oauthlib==1.3.1
+ rich==13.6.0
+ rpds-py==0.12.0
+ rsa==4.9
+ scipy==1.11.3
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ simplejson==3.19.2
+ six==1.16.0
+ sniffio==1.3.0
+ soupsieve==2.5
+ SQLAlchemy==2.0.23
+ starlette==0.27.0
+ sympy==1.12
+ tabulate==0.9.0
+ tenacity==8.2.3
+ tiktoken==0.5.1
+ tokenizers==0.14.1
+ tomlkit==0.12.0
+ toolz==0.12.0
+ tqdm==4.66.1
+ traits==6.3.2
+ typer==0.9.0
+ typing-inspect==0.9.0
+ typing_extensions==4.8.0
+ tzdata==2023.3
+ unstructured==0.10.29
+ urllib3==1.26.18
+ uvicorn==0.24.0.post1
+ uvloop==0.19.0
+ watchfiles==0.21.0
+ websocket-client==1.6.4
+ websockets==11.0.3
+ wrapt==1.16.0
+ yarl==1.9.2
+ zipp==3.17.0