siddhartharya committed: Update app.py
app.py CHANGED
@@ -8,13 +8,13 @@ import numpy as np
 import requests
 import time
 import re
-import base64
 import logging
 import os
 import sys
-import concurrent.futures
-from concurrent.futures import ThreadPoolExecutor
 import threading
+from queue import Queue, Empty
+import json
+from concurrent.futures import ThreadPoolExecutor
 
 # Import OpenAI library
 import openai
@@ -83,9 +83,82 @@ if not GROQ_API_KEY:
 openai.api_key = GROQ_API_KEY
 openai.api_base = "https://api.groq.com/openai/v1"
 
-#
-
-
+# Rate Limiter Configuration
+RPM_LIMIT = 60  # Requests per minute (adjust based on your API's limit)
+TPM_LIMIT = 60000  # Tokens per minute (adjust based on your API's limit)
+BATCH_SIZE = 5  # Number of bookmarks per batch
+
+# Implementing a Token Bucket Rate Limiter
+class TokenBucket:
+    def __init__(self, rate, capacity):
+        self.rate = rate  # tokens per second
+        self.capacity = capacity
+        self.tokens = capacity
+        self.timestamp = time.time()
+        self.lock = threading.Lock()
+
+    def consume(self, tokens=1):
+        with self.lock:
+            now = time.time()
+            elapsed = now - self.timestamp
+            refill = elapsed * self.rate
+            self.tokens = min(self.capacity, self.tokens + refill)
+            self.timestamp = now
+            if self.tokens >= tokens:
+                self.tokens -= tokens
+                return True
+            else:
+                return False
+
+    def wait_for_token(self, tokens=1):
+        while not self.consume(tokens):
+            time.sleep(0.05)
+
+# Initialize rate limiters
+rpm_rate = RPM_LIMIT / 60  # tokens per second
+tpm_rate = TPM_LIMIT / 60  # tokens per second
+
+rpm_bucket = TokenBucket(rate=rpm_rate, capacity=RPM_LIMIT)
+tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
+
+# Queue for LLM tasks
+llm_queue = Queue()
+
+def categorize_based_on_summary(summary, url):
+    """
+    Assign category based on keywords in the summary or URL.
+    """
+    summary_lower = summary.lower()
+    url_lower = url.lower()
+    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
+        return 'Technology'
+    elif 'news' in summary_lower or 'media' in summary_lower:
+        return 'News and Media'
+    elif 'education' in summary_lower or 'learning' in summary_lower:
+        return 'Education and Learning'
+    # Add more conditions as needed
+    else:
+        return 'Uncategorized'
+
+def validate_category(bookmark):
+    """
+    Further validate and adjust the category if needed.
+    """
+    # Example: Specific cases based on URL
+    url_lower = bookmark['url'].lower()
+    if 'facebook' in url_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'aws.amazon.com' in url_lower:
+        return 'Technology'
+    # Add more specific cases as needed
+    else:
+        return bookmark['category']
 
 def extract_main_content(soup):
     """
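
Note on the hunk above: the token bucket refills continuously (rate times elapsed seconds) and wait_for_token blocks by polling every 50 ms, so callers simply block until budget is available. A minimal sketch of how the two buckets gate a single request, using the names this commit defines (the helper name and token estimate are illustrative, not part of the commit):

    # Hypothetical helper: block on both buckets, then issue one request.
    def rate_limited_call(prompt, estimated_tokens=150):
        rpm_bucket.wait_for_token()                         # one request slot
        tpm_bucket.wait_for_token(tokens=estimated_tokens)  # its token budget
        return openai.ChatCompletion.create(
            model='llama-3.1-70b-versatile',
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150,
            temperature=0.5,
        )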
@@ -156,171 +229,169 @@ def get_page_metadata(soup):
 
     return metadata
 
-def generate_summary_and_assign_category(bookmark):
-    """
-    …
-    """
-    logger.info(…
-    …
-    retry_count = 0
-
-    while retry_count < max_retries:
-        try:
-            # …
-            …
-            prompt …
-            …
-You are an assistant that creates concise webpage summaries and assigns categories.
-Content:
-{content_text}
-Provide:
-1. A concise summary (max two sentences) focusing on the main topic.
-2. Assign the most appropriate category from the list below.
-Categories:
-{', '.join([f'"{cat}"' for cat in CATEGORIES])}
-Format:
-Summary: [Your summary]
-Category: [One category]
-"""
-
-            def estimate_tokens(text):
-                return len(text) / 4
-
-            prompt_tokens = estimate_tokens(prompt)
-            max_tokens = 150
-            total_tokens = prompt_tokens + max_tokens
-
-            tokens_per_minute = 40000
-            tokens_per_second = tokens_per_minute / 60
-            required_delay = total_tokens / tokens_per_second
-            sleep_time = max(required_delay, 2)
-
-            response = openai.ChatCompletion.create(
-                model='llama-3.1-70b-versatile',
-                messages=[
-                    {"role": "user", "content": prompt}
-                ],
-                max_tokens=int(max_tokens),
-                temperature=0.5,
-            )
-
-            content = response['choices'][0]['message']['content'].strip()
-            if not content:
-                raise ValueError("Empty response received from the model.")
-
-            …
-            break
-
-        except openai.error.RateLimitError as e:
-            retry_count += 1
-            wait_time = int(e.headers.get("Retry-After", 5))
-            logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying... (Attempt {retry_count}/{max_retries})")
-            time.sleep(wait_time)
-        except Exception as e:
-            logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
-            bookmark['summary'] = 'No summary available.'
-            bookmark['category'] = 'Uncategorized'
-            break
-
-def parse_bookmarks(file_content):
-    """
-    …
-    """
-    …
-                    logger.info(f"Skipping non-http/https URL: {url}")
-        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
-        return extracted_bookmarks
-    except Exception as e:
-        logger.error("Error parsing bookmarks: %s", e, exc_info=True)
-        raise
+def llm_worker():
+    """
+    Worker thread to process LLM tasks from the queue while respecting rate limits.
+    """
+    logger.info("LLM worker started.")
+    while True:
+        batch = []
+        try:
+            # Collect bookmarks up to BATCH_SIZE
+            while len(batch) < BATCH_SIZE:
+                bookmark = llm_queue.get(timeout=1)
+                if bookmark is None:
+                    # Shutdown signal
+                    logger.info("LLM worker shutting down.")
+                    return
+                if not bookmark.get('dead_link') and not bookmark.get('slow_link'):
+                    batch.append(bookmark)
+                else:
+                    # Skip processing for dead or slow links
+                    bookmark['summary'] = 'No summary available.'
+                    bookmark['category'] = 'Uncategorized'
+                    llm_queue.task_done()
+
+        except Empty:
+            pass  # No more bookmarks at the moment
+
+        if batch:
+            try:
+                # Rate Limiting
+                rpm_bucket.wait_for_token()
+                # Estimate tokens: prompt + max_tokens
+                # Here, we assume max_tokens=150 per bookmark
+                total_tokens = 150 * len(batch)
+                tpm_bucket.wait_for_token(tokens=total_tokens)
+
+                # Prepare prompt
+                prompt = "You are an assistant that creates concise webpage summaries and assigns categories.\n\n"
+                prompt += "Provide summaries and categories for the following bookmarks:\n\n"
+
+                for idx, bookmark in enumerate(batch, 1):
+                    prompt += f"Bookmark {idx}:\nURL: {bookmark['url']}\nTitle: {bookmark['title']}\n\n"
+
+                # Corrected f-string without backslashes
+                prompt += f"Categories:\n{', '.join([f'\"{cat}\"' for cat in CATEGORIES])}\n\n"
+
+                prompt += "Format your response as a JSON object where each key is the bookmark URL and the value is another JSON object containing 'summary' and 'category'.\n\n"
+                prompt += "Example:\n"
+                prompt += "{\n"
+                prompt += "  \"https://example.com\": {\n"
+                prompt += "    \"summary\": \"This is an example summary.\",\n"
+                prompt += "    \"category\": \"Technology\"\n"
+                prompt += "  }\n"
+                prompt += "}\n\n"
+                prompt += "Now, provide the summaries and categories for the bookmarks listed above."
+
+                response = openai.ChatCompletion.create(
+                    model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
+                    messages=[
+                        {"role": "user", "content": prompt}
+                    ],
+                    max_tokens=150 * len(batch),
+                    temperature=0.5,
+                )
+
+                content = response['choices'][0]['message']['content'].strip()
+                if not content:
+                    raise ValueError("Empty response received from the model.")
+
+                # Parse JSON response
+                try:
+                    json_response = json.loads(content)
+                    for bookmark in batch:
+                        url = bookmark['url']
+                        if url in json_response:
+                            summary = json_response[url].get('summary', '').strip()
+                            category = json_response[url].get('category', '').strip()
+
+                            if not summary:
+                                summary = 'No summary available.'
+                            bookmark['summary'] = summary
+
+                            if category in CATEGORIES:
+                                bookmark['category'] = category
+                            else:
+                                # Fallback to keyword-based categorization
+                                bookmark['category'] = categorize_based_on_summary(summary, url)
+                        else:
+                            logger.warning(f"No data returned for {url}. Using fallback methods.")
+                            bookmark['summary'] = 'No summary available.'
+                            bookmark['category'] = 'Uncategorized'
+
+                        # Additional keyword-based validation
+                        bookmark['category'] = validate_category(bookmark)
+
+                        logger.info(f"Processed bookmark: {url}")
+
+                except json.JSONDecodeError:
+                    logger.error("Failed to parse JSON response from LLM. Using fallback methods.")
+                    for bookmark in batch:
+                        bookmark['summary'] = 'No summary available.'
+                        bookmark['category'] = categorize_based_on_summary(bookmark.get('summary', ''), bookmark['url'])
+                        bookmark['category'] = validate_category(bookmark)
+
+                except Exception as e:
+                    logger.error(f"Error processing LLM response: {e}", exc_info=True)
+                    for bookmark in batch:
+                        bookmark['summary'] = 'No summary available.'
+                        bookmark['category'] = 'Uncategorized'
+
+            except openai.error.RateLimitError as e:
+                logger.warning(f"LLM Rate limit reached. Retrying after 60 seconds.")
+                # Re-enqueue the entire batch for retry
+                for bookmark in batch:
+                    llm_queue.put(bookmark)
+                time.sleep(60)  # Wait before retrying
+                continue  # Skip the rest and retry
+
+            except Exception as e:
+                logger.error(f"Error during LLM processing: {e}", exc_info=True)
+                for bookmark in batch:
+                    bookmark['summary'] = 'No summary available.'
+                    bookmark['category'] = 'Uncategorized'
+
+            finally:
+                # Mark all bookmarks in the batch as done
+                for _ in batch:
+                    llm_queue.task_done()
+
+def categorize_based_on_summary(summary, url):
+    """
+    Assign category based on keywords in the summary or URL.
+    """
+    summary_lower = summary.lower()
+    url_lower = url.lower()
+    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
+        return 'Technology'
+    elif 'news' in summary_lower or 'media' in summary_lower:
+        return 'News and Media'
+    elif 'education' in summary_lower or 'learning' in summary_lower:
+        return 'Education and Learning'
+    # Add more conditions as needed
+    else:
+        return 'Uncategorized'
+
+def validate_category(bookmark):
+    """
+    Further validate and adjust the category if needed.
+    """
+    # Example: Specific cases based on URL
+    url_lower = bookmark['url'].lower()
+    if 'facebook' in url_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'aws.amazon.com' in url_lower:
+        return 'Technology'
+    # Add more specific cases as needed
+    else:
+        return bookmark['category']
 
 def fetch_url_info(bookmark):
     """
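
Note on llm_worker above: it is a standard queue consumer with a None sentinel for shutdown and task_done() accounting in a finally block. A sketch of how a producer drives it (names from this commit; the sample bookmark is illustrative):

    worker = threading.Thread(target=llm_worker, daemon=True)
    worker.start()

    llm_queue.put({'url': 'https://example.com', 'title': 'Example'})  # sample item
    llm_queue.join()     # returns once task_done() has been called for every item

    llm_queue.put(None)  # sentinel: llm_worker logs shutdown and returns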
@@ -382,6 +453,28 @@ def fetch_url_info(bookmark):
         'slow_link': bookmark.get('slow_link', False),
     }
 
+def parse_bookmarks(file_content):
+    """
+    Parse bookmarks from HTML file.
+    """
+    logger.info("Parsing bookmarks")
+    try:
+        soup = BeautifulSoup(file_content, 'html.parser')
+        extracted_bookmarks = []
+        for link in soup.find_all('a'):
+            url = link.get('href')
+            title = link.text.strip()
+            if url and title:
+                if url.startswith('http://') or url.startswith('https://'):
+                    extracted_bookmarks.append({'url': url, 'title': title})
+                else:
+                    logger.info(f"Skipping non-http/https URL: {url}")
+        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
+        return extracted_bookmarks
+    except Exception as e:
+        logger.error("Error parsing bookmarks: %s", e, exc_info=True)
+        raise
+
 def vectorize_and_index(bookmarks_list):
     """
     Create vector embeddings for bookmarks and build FAISS index with ID mapping.
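
The re-added parse_bookmarks keeps only http/https anchors. A quick sketch of its behavior on a minimal Netscape-style export (the input is illustrative):

    sample = '''<DL><p>
        <DT><A HREF="https://example.com">Example</A>
        <DT><A HREF="ftp://old.example.com">Old FTP link</A>
    </DL>'''
    print(parse_bookmarks(sample))
    # Expected: [{'url': 'https://example.com', 'title': 'Example'}]
    # The ftp:// link is logged and skipped.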
@@ -453,6 +546,14 @@ def display_bookmarks():
     logger.info("HTML display generated")
     return cards
 
+def generate_summary_and_assign_category(bookmark):
+    """
+    Generate a concise summary and assign a category using a single LLM call.
+    This function is now handled by the LLM worker thread.
+    """
+    # This function is now deprecated and handled by the worker thread.
+    pass
+
 def process_uploaded_file(file, state_bookmarks):
     """
     Process the uploaded bookmarks file.
@@ -489,10 +590,14 @@ def process_uploaded_file(file, state_bookmarks):
     with ThreadPoolExecutor(max_workers=10) as executor:
         executor.map(fetch_url_info, bookmarks)
 
-    # …
-    logger.info("…
-    …
+    # Enqueue bookmarks for LLM processing
+    logger.info("Enqueuing bookmarks for LLM processing")
+    for bookmark in bookmarks:
+        llm_queue.put(bookmark)
+
+    # Wait until all LLM tasks are completed
+    llm_queue.join()
+    logger.info("All LLM tasks have been processed")
 
     try:
         faiss_index = vectorize_and_index(bookmarks)
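
The llm_queue.join() added here only returns once task_done() has been called for every enqueued bookmark, which is why llm_worker calls it in a finally block. A standard-library toy showing that invariant:

    from queue import Queue
    import threading

    q = Queue()

    def consumer():
        while True:
            item = q.get()
            try:
                pass  # process item
            finally:
                q.task_done()  # exactly one per get(), or join() blocks forever

    threading.Thread(target=consumer, daemon=True).start()
    for i in range(3):
        q.put(i)
    q.join()  # returns only after all three task_done() calls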
@@ -619,15 +724,12 @@ def chatbot_response(user_query, chat_history):
     try:
         chat_history.append({"role": "user", "content": user_query})
 
-        …
-        logger.info(f"Sleeping for {sleep_duration:.2f} seconds to respect rate limits.")
-        time.sleep(sleep_duration)
-        last_api_call_time = time.time()
+        # Rate Limiting
+        rpm_bucket.wait_for_token()
+        # Estimate tokens: prompt + max_tokens
+        # Here, we assume max_tokens=300 per chatbot response
+        total_tokens = 300  # Adjust based on actual usage
+        tpm_bucket.wait_for_token(tokens=total_tokens)
 
         query_vector = embedding_model.encode([user_query]).astype('float32')
         k = 5
@@ -635,7 +737,7 @@ def chatbot_response(user_query, chat_history):
         ids = ids.flatten()
 
         id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
-        matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark]
+        matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark and id_to_bookmark.get(id).get('summary')]
 
         if not matching_bookmarks:
             answer = "No relevant bookmarks found for your query."
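
For context, ids comes from searching the FAISS index that vectorize_and_index builds with ID mapping; the summary check added above filters out hits whose LLM processing failed. A minimal sketch of the ID-mapped search pattern (the dimension and data are illustrative):

    import numpy as np
    import faiss

    dim = 384  # e.g., a MiniLM sentence-embedding size (assumption)
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dim))
    index.add_with_ids(np.random.rand(10, dim).astype('float32'),
                       np.arange(10, dtype=np.int64))

    query = np.random.rand(1, dim).astype('float32')
    distances, ids = index.search(query, 5)  # ids map back to bookmark['id']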
@@ -655,30 +757,17 @@ Bookmarks:
 Provide a concise and helpful response.
 """
 
-        def estimate_tokens(text):
-            return len(text) / 4
-
-        prompt_tokens = estimate_tokens(prompt)
-        max_tokens = 300
-        total_tokens = prompt_tokens + max_tokens
-
-        tokens_per_minute = 40000
-        tokens_per_second = tokens_per_minute / 60
-        required_delay = total_tokens / tokens_per_second
-        sleep_time = max(required_delay, 2)
-
         response = openai.ChatCompletion.create(
-            model='llama-3.1-70b-versatile',
+            model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
             messages=[
                 {"role": "user", "content": prompt}
             ],
-            max_tokens=…
+            max_tokens=300,
             temperature=0.7,
         )
 
         answer = response['choices'][0]['message']['content'].strip()
         logger.info("Chatbot response generated")
-        time.sleep(sleep_time)
 
         chat_history.append({"role": "assistant", "content": answer})
         return chat_history
@@ -809,7 +898,7 @@ Navigate through the tabs to explore each feature in detail.
 """)
 
             manage_output = gr.Textbox(label="🔄 Status", interactive=False)
-
+
             # CheckboxGroup for selecting bookmarks
             bookmark_selector = gr.CheckboxGroup(
                 label="✅ Select Bookmarks",
@@ -870,8 +959,12 @@ Navigate through the tabs to explore each feature in detail.
         logger.info("Launching Gradio app")
         demo.launch(debug=True)
     except Exception as e:
-        logger.error(f"Error building …
-        print(f"Error building …
+        logger.error(f"Error building Gradio app: {e}", exc_info=True)
+        print(f"Error building Gradio app: {e}")
 
 if __name__ == "__main__":
+    # Start the LLM worker thread before launching the app
+    llm_thread = threading.Thread(target=llm_worker, daemon=True)
+    llm_thread.start()
+
     build_app()
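
One caveat on the startup change: llm_thread is a daemon, so it is killed abruptly when the process exits and any still-queued bookmarks are dropped. A hedged sketch of a cleaner shutdown using the worker's None sentinel (the atexit hook is an assumption, not part of this commit):

    import atexit

    def _stop_llm_worker():
        llm_queue.put(None)        # sentinel understood by llm_worker
        llm_thread.join(timeout=5)

    atexit.register(_stop_llm_worker)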