acecalisto3 committed
Commit e9a41cf
1 Parent(s): bf70dc8

Update agent.py

Files changed (1)
  1. agent.py +377 -120
agent.py CHANGED
@@ -4,6 +4,7 @@ import hashlib
 import logging
 import datetime
 import csv
+import threading
 from urllib.parse import urlparse
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
@@ -11,67 +12,130 @@ from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-from transformers import pipeline
+from selenium.common.exceptions import (
+    TimeoutException,
+    NoSuchElementException,
+    StaleElementReferenceException,
+)
+from webdriver_manager.chrome import ChromeDriverManager  # Added import
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import feedparser
+import gradio as gr
+import xml.etree.ElementTree as ET

 # Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)

 # Define constants
 DEFAULT_FILE_PATH = "scraped_data"
-PURPOSE = f"You go to Culvers sites, you continuously seek changes on them since your last observation. Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
+PURPOSE = (
+    "You go to Culvers sites, you continuously seek changes on them since your last observation. "
+    "Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
+)
 HISTORY = []
 CURRENT_TASK = None
+STOP_THREADS = False  # Flag to stop scraping threads

 # Function to monitor URLs for changes
 def monitor_urls(storage_location, urls, scrape_interval, content_type, selector=None):
-    global HISTORY
-    previous_hashes = {url: "" for url in urls}
+    global HISTORY, STOP_THREADS
+    previous_hashes = {url: "" for url in urls}
+
+    options = Options()
+    options.add_argument("--headless")  # Run Chrome in headless mode
+    options.add_argument("--no-sandbox")
+    options.add_argument("--disable-dev-shm-usage")

     try:
-        with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
-            while True:
-                for url in urls:
-                    try:
-                        driver.get(url)
-                        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))  # Wait for basic page load
-                        time.sleep(2)  # Additional wait for dynamic content
-
-                        if content_type == "text":
-                            current_content = driver.page_source
-                        elif content_type == "media":
-                            if selector:
-                                try:
-                                    elements = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
-                                    current_content = [element.get_attribute('src') for element in elements]
-                                except TimeoutException:
-                                    logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
-                                    current_content = []
-                            else:
-                                current_content = driver.find_elements(By.TAG_NAME, "img")
-                        else:
-                            current_content = driver.page_source
-
-                        current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-                        if current_hash != previous_hashes[url]:
-                            previous_hashes[url] = current_hash
-                            date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                            HISTORY.append(f"Change detected at {url} on {date_time_str}")
-                            with open(os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv"), "a", newline="") as csvfile:
-                                csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
-                                csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
-                            logging.info(f"Change detected at {url} on {date_time_str}")
-                    except (NoSuchElementException, StaleElementReferenceException, Exception) as e:
-                        logging.error(f"Error accessing {url}: {e}")
-                time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
+        driver = webdriver.Chrome(
+            service=Service(ChromeDriverManager().install()), options=options
+        )
     except Exception as e:
-        logging.error(f"Error starting ChromeDriver: {e}")
+        logging.error(f"Error initializing ChromeDriver: {e}")
+        return
+
+    try:
+        while not STOP_THREADS:
+            for url in urls:
+                try:
+                    driver.get(url)
+                    WebDriverWait(driver, 10).until(
+                        EC.presence_of_element_located((By.TAG_NAME, "body"))
+                    )  # Wait for basic page load
+                    time.sleep(2)  # Additional wait for dynamic content
+
+                    if content_type == "text":
+                        current_content = driver.page_source
+                    elif content_type == "media":
+                        if selector:
+                            try:
+                                elements = WebDriverWait(driver, 5).until(
+                                    EC.presence_of_all_elements_located(
+                                        (By.CSS_SELECTOR, selector)
+                                    )
+                                )
+                                current_content = [
+                                    element.get_attribute("src") for element in elements
+                                ]
+                            except TimeoutException:
+                                logging.warning(
+                                    f"Timeout waiting for media elements with selector '{selector}' on {url}"
+                                )
+                                current_content = []
+                        else:
+                            elements = driver.find_elements(By.TAG_NAME, "img")
+                            current_content = [element.get_attribute("src") for element in elements]
+                    else:
+                        current_content = driver.page_source
+
+                    current_hash = hashlib.md5(
+                        str(current_content).encode("utf-8")
+                    ).hexdigest()
+                    if current_hash != previous_hashes[url]:
+                        previous_hashes[url] = current_hash
+                        date_time_str = datetime.datetime.now().strftime(
+                            "%Y-%m-%d %H:%M:%S"
+                        )
+                        HISTORY.append(f"Change detected at {url} on {date_time_str}")
+                        csv_file_path = os.path.join(
+                            storage_location, f"{urlparse(url).hostname}_changes.csv"
+                        )
+                        os.makedirs(storage_location, exist_ok=True)
+                        file_exists = os.path.isfile(csv_file_path)
+                        with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
+                            fieldnames = ["date", "time", "url", "change"]
+                            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                            if not file_exists:
+                                writer.writeheader()
+                            writer.writerow(
+                                {
+                                    "date": date_time_str.split()[0],
+                                    "time": date_time_str.split()[1],
+                                    "url": url,
+                                    "change": "Content changed",
+                                }
+                            )
+                        logging.info(f"Change detected at {url} on {date_time_str}")
+                except (
+                    NoSuchElementException,
+                    StaleElementReferenceException,
+                    TimeoutException,
+                    Exception,
+                ) as e:
+                    logging.error(f"Error accessing {url}: {e}")
+            time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
+    finally:
+        driver.quit()
+        logging.info("ChromeDriver session ended.")

 # Function to start scraping
 def start_scraping(storage_location, urls, scrape_interval, content_type, selector=None):
-    global CURRENT_TASK, HISTORY
+    global CURRENT_TASK, HISTORY, STOP_THREADS
+
+    if STOP_THREADS:
+        STOP_THREADS = False  # Reset the flag if previously stopped

     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
     HISTORY.append(f"Task started: {CURRENT_TASK}")
@@ -84,123 +148,316 @@ def start_scraping(storage_location, urls, scrape_interval, content_type, select

         # Log the initial observation
         try:
-            with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
-                driver.get(url)
-                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))  # Wait for basic page load
-                time.sleep(2)  # Additional wait for dynamic content
-
-                if content_type == "text":
-                    initial_content = driver.page_source
-                elif content_type == "media":
-                    if selector:
-                        try:
-                            elements = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
-                            initial_content = [element.get_attribute('src') for element in elements]
-                        except TimeoutException:
-                            logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
-                            initial_content = []
-                    else:
-                        initial_content = driver.find_elements(By.TAG_NAME, "img")
+            options = Options()
+            options.add_argument("--headless")  # Run Chrome in headless mode
+            options.add_argument("--no-sandbox")
+            options.add_argument("--disable-dev-shm-usage")
+
+            driver = webdriver.Chrome(
+                service=Service(ChromeDriverManager().install()), options=options
+            )
+            driver.get(url)
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )  # Wait for basic page load
+            time.sleep(2)  # Additional wait for dynamic content
+
+            if content_type == "text":
+                initial_content = driver.page_source
+            elif content_type == "media":
+                if selector:
+                    try:
+                        elements = WebDriverWait(driver, 5).until(
+                            EC.presence_of_all_elements_located(
+                                (By.CSS_SELECTOR, selector)
+                            )
+                        )
+                        initial_content = [
+                            element.get_attribute("src") for element in elements
+                        ]
+                    except TimeoutException:
+                        logging.warning(
+                            f"Timeout waiting for media elements with selector '{selector}' on {url}"
+                        )
+                        initial_content = []
                 else:
-                    initial_content = driver.page_source
+                    elements = driver.find_elements(By.TAG_NAME, "img")
+                    initial_content = [element.get_attribute("src") for element in elements]
+            else:
+                initial_content = driver.page_source

-                initial_hash = hashlib.md5(str(initial_content).encode('utf-8')).hexdigest()
-                HISTORY.append(f"Initial observation at {url}: {initial_hash}")
-                with open(os.path.join(folder_path, f"{hostname}_initial_observation.txt"), "w") as file:
-                    file.write(f"Initial observation at {url}: {initial_hash}")
-        except (NoSuchElementException, StaleElementReferenceException, Exception) as e:
+            initial_hash = hashlib.md5(
+                str(initial_content).encode("utf-8")
+            ).hexdigest()
+            HISTORY.append(f"Initial observation at {url}: {initial_hash}")
+            initial_observation_path = os.path.join(
+                folder_path, f"{hostname}_initial_observation.txt"
+            )
+            with open(initial_observation_path, "w", encoding="utf-8") as file:
+                file.write(f"Initial observation at {url}: {initial_hash}")
+            logging.info(f"Initial observation logged for {url}")
+        except (
+            NoSuchElementException,
+            StaleElementReferenceException,
+            TimeoutException,
+            Exception,
+        ) as e:
             HISTORY.append(f"Error accessing {url}: {e}")
+            logging.error(f"Error accessing {url}: {e}")
+        finally:
+            driver.quit()

-        # Monitor the URLs
-        monitor_urls(storage_location, urls, scrape_interval, content_type, selector)
+    # Start a new thread for monitoring URLs
+    monitor_thread = threading.Thread(
+        target=monitor_urls,
+        args=(storage_location, urls, scrape_interval, content_type, selector),
+        daemon=True,
+    )
+    monitor_thread.start()
+    logging.info("Started scraping thread.")

     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."

+# Function to stop scraping
+def stop_scraping():
+    global STOP_THREADS
+    STOP_THREADS = True
+    HISTORY.append("Scraping stopped by user.")
+    logging.info("Scraping stop signal sent.")
+    return "Scraping has been stopped."
+
 # Function to display CSV content
 def display_csv(storage_location, url):
     hostname = urlparse(url).hostname
-    folder_path = os.path.join(storage_location, hostname)
-    csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
+    csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
     if os.path.exists(csv_path):
-        with open(csv_path, "r") as file:
-            return file.read()
+        try:
+            with open(csv_path, "r", encoding="utf-8") as file:
+                content = file.read()
+            return content
+        except Exception as e:
+            logging.error(f"Error reading CSV file for {url}: {e}")
+            return f"Error reading CSV file for {url}: {e}"
     else:
         return "No data available."

 # Function to generate RSS feed for a given URL
 def generate_rss_feed(storage_location, url):
     hostname = urlparse(url).hostname
-    folder_path = os.path.join(storage_location, hostname)
-    csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
+    csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
     if os.path.exists(csv_path):
-        with open(csv_path, "r") as file:
-            reader = csv.DictReader(file)
-            feed = feedparser.parse(f"rss.xml")  # Create a new feed object
-            feed.feed.title = f"Changes for {hostname}"
-            feed.feed.link = url
-            feed.feed.description = "Recent changes detected on the website."
-            feed.entries = []
-            for row in reader:
-                feed.entries.append({
-                    "title": f"Change detected at {row['url']}",
-                    "link": row['url'],
-                    "description": f"Content changed on {row['date']} at {row['time']}",
-                    "published": datetime.datetime.strptime(f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S").isoformat(),
-                })
-            return feed.entries
+        try:
+            # Parse the CSV file
+            with open(csv_path, "r", encoding="utf-8") as file:
+                reader = csv.DictReader(file)
+                changes = list(reader)
+
+            # Create the root RSS element
+            rss = ET.Element("rss", version="2.0")
+            channel = ET.SubElement(rss, "channel")
+
+            # Add channel elements
+            title = ET.SubElement(channel, "title")
+            title.text = f"RSS Feed for {hostname}"
+
+            link = ET.SubElement(channel, "link")
+            link.text = url
+
+            description = ET.SubElement(channel, "description")
+            description.text = "Recent changes detected on the website."
+
+            # Add items to the feed
+            for change in changes[-10:]:  # Last 10 changes
+                item = ET.SubElement(channel, "item")
+
+                item_title = ET.SubElement(item, "title")
+                item_title.text = f"Change detected at {change['url']}"
+
+                item_link = ET.SubElement(item, "link")
+                item_link.text = change["url"]
+
+                item_description = ET.SubElement(item, "description")
+                item_description.text = f"Content changed on {change['date']} at {change['time']}"
+
+                pub_date = ET.SubElement(item, "pubDate")
+                pub_date.text = datetime.datetime.strptime(
+                    f"{change['date']} {change['time']}", "%Y-%m-%d %H:%M:%S"
+                ).strftime("%a, %d %b %Y %H:%M:%S +0000")
+
+            # Generate the XML string
+            rss_feed = ET.tostring(rss, encoding="utf-8")
+            return rss_feed.decode("utf-8")
+        except Exception as e:
+            logging.error(f"Error generating RSS feed for {url}: {e}")
+            return f"Error generating RSS feed for {url}: {e}"
     else:
         return "No data available."

 # Function to define the chat response function using the Mistral model
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    model = AutoModelForSeq2SeqLM.from_pretrained_model("mistralai/Mixtral-8x7B-Instruct-v0.1")
-    tokenizer = AutoTokenizer.from_pretrained_model("mistralai/Mixtral-8x7B-Instruct-v0.1")
-    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-    response = pipe(f"User: {message}\nHistory: {history}\nSystem: {system_message}", max_length=max_tokens, temperature=temperature, top_p=top_p)[0]
-    return response
+    # Load the model and tokenizer once
+    if not hasattr(respond, "pipe"):
+        try:
+            model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+            respond.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            respond.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+            respond.pipe = pipeline(
+                "text-generation",
+                model=respond.model,
+                tokenizer=respond.tokenizer,
+                device=0 if torch.cuda.is_available() else -1,
+            )
+            logging.info("Model loaded successfully.")
+        except Exception as e:
+            logging.error(f"Error loading model: {e}")
+            return "Error loading the response model."
+
+    try:
+        prompt = (
+            f"System: {system_message}\n"
+            f"History: {history}\n"
+            f"User: {message}\n"
+            f"Assistant:"
+        )
+        response = respond.pipe(
+            prompt, max_length=max_tokens, temperature=temperature, top_p=top_p
+        )[0]["generated_text"]
+        return response
+    except Exception as e:
+        logging.error(f"Error generating response: {e}")
+        return "Error generating response."

 # Define the Gradio interface
 def create_interface():
     with gr.Blocks() as demo:
+        gr.Markdown("# All-in-One Scraper, Database, and RSS Feeder")
+
         with gr.Row():
             with gr.Column():
-                message = gr.Textbox(label="Message")
-                system_message = gr.Textbox(value="You are a helpful assistant.", label="System message")
-                max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
-                temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
-                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
-                storage_location = gr.Textbox(value="scraped_data", label="Storage Location")
-                urls = gr.Textbox(label="URLs (comma separated)")
-                scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
-                content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
+                storage_location = gr.Textbox(
+                    value=DEFAULT_FILE_PATH, label="Storage Location"
+                )
+                urls = gr.Textbox(
+                    label="URLs (comma separated)",
+                    placeholder="https://example.com, https://anotherexample.com",
+                )
+                scrape_interval = gr.Slider(
+                    minimum=1,
+                    maximum=60,
+                    value=5,
+                    step=1,
+                    label="Scrape Interval (minutes)",
+                )
+                content_type = gr.Radio(
+                    choices=["text", "media", "both"],
+                    value="text",
+                    label="Content Type",
+                )
+                selector = gr.Textbox(
+                    label="CSS Selector for Media (Optional)",
+                    placeholder="e.g., img.main-image",
+                )
                 start_button = gr.Button("Start Scraping")
                 stop_button = gr.Button("Stop Scraping")
-                csv_output = gr.Textbox(label="CSV Output", interactive=False)
+                csv_output = gr.Textbox(
+                    label="CSV Output", interactive=False, lines=2
+                )

             with gr.Column():
                 chat_history = gr.Chatbot(label="Chat History")
-                response_box = gr.Textbox(label="Response")
+                with gr.Row():
+                    message = gr.Textbox(label="Message", placeholder="Type your message here...")
+                    system_message = gr.Textbox(
+                        value="You are a helpful assistant.", label="System message"
+                    )
+                    max_tokens = gr.Slider(
+                        minimum=1,
+                        maximum=2048,
+                        value=512,
+                        step=1,
+                        label="Max new tokens",
+                    )
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=4.0,
+                        value=0.7,
+                        step=0.1,
+                        label="Temperature",
+                    )
+                    top_p = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=0.95,
+                        step=0.05,
+                        label="Top-p (nucleus sampling)",
+                    )
+                response_box = gr.Textbox(label="Response", interactive=False, lines=2)

-        # Function to stop scraping
-        def stop_scraping(stop_scraping_flag):
-            stop_scraping_flag[0] = True
-            return "Scraping stopped."
-
-        # Add a button to display the CSV content for a selected URL
         with gr.Row():
-            selected_url = gr.Textbox(label="Select URL for CSV Content")
+            selected_url_csv = gr.Textbox(
+                label="Select URL for CSV Content",
+                placeholder="https://example.com",
+            )
             csv_button = gr.Button("Display CSV Content")
-            csv_output = gr.Textbox(label="CSV Content Output", interactive=False)
-
-            csv_button.click(display_csv, inputs=[storage_location, selected_url], outputs=csv_output)
+            csv_content_output = gr.Textbox(
+                label="CSV Content Output", interactive=False, lines=10
+            )

-        # Add a button to display the RSS feed for a selected URL
         with gr.Row():
-            selected_url = gr.Textbox(label="Select URL for RSS Feed")
+            selected_url_rss = gr.Textbox(
+                label="Select URL for RSS Feed",
+                placeholder="https://example.com",
+            )
             rss_button = gr.Button("Generate RSS Feed")
-            rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
+            rss_output = gr.Textbox(
+                label="RSS Feed Output", interactive=False, lines=20
+            )
+
+        # Connect buttons to their respective functions
+        start_button.click(
+            fn=start_scraping,
+            inputs=[
+                storage_location,
+                gr.Textbox.value,
+                scrape_interval,
+                content_type,
+                selector,
+            ],
+            outputs=csv_output,
+        )
+
+        stop_button.click(fn=stop_scraping, outputs=csv_output)
+
+        csv_button.click(
+            fn=display_csv,
+            inputs=[storage_location, selected_url_csv],
+            outputs=csv_content_output,
+        )
+
+        rss_button.click(
+            fn=generate_rss_feed,
+            inputs=[storage_location, selected_url_rss],
+            outputs=rss_output,
+        )
+
+        # Connect message submission to the chat interface
+        def update_chat(message, history, system_message, max_tokens, temperature, top_p):
+            response = respond(message, history, system_message, max_tokens, temperature, top_p)
+            history.append((message, response))
+            return history, response

-        rss_button.click(generate_rss_feed, inputs=[storage_location, selected_url], outputs=rss_output)
+        message.submit(
+            update_chat,
+            inputs=[
+                message,
+                chat_history,
+                system_message,
+                max_tokens,
+                temperature,
+                top_p,
+            ],
+            outputs=[chat_history, response_box],
+        )

     return demo

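The diff does not show a `__main__` entry point for agent.py. A minimal usage sketch under that assumption follows; only `create_interface`, `start_scraping`, and `stop_scraping` come from the file above, while the script name, the example URL, and the launch sequence are purely illustrative.

    # run_agent.py: illustrative driver script, not part of commit e9a41cf
    from agent import create_interface, start_scraping, stop_scraping

    if __name__ == "__main__":
        # Begin monitoring in a daemon thread; monitor_urls() appends change
        # rows to scraped_data/<hostname>_changes.csv.
        print(start_scraping("scraped_data", ["https://www.culvers.com"], 5, "text"))

        # Serve the Gradio UI built by create_interface(); blocks until closed.
        demo = create_interface()
        demo.launch()

        # Set STOP_THREADS so the monitoring loop exits after its current cycle.
        print(stop_scraping())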