Shreyas094
committed on
Commit
•
07efc76
1
Parent(s):
1a81bf1
Update app.py
Browse files
app.py
CHANGED
@@ -371,71 +371,71 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
371 |
break
|
372 |
|
373 |
for result in results:
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
|
|
|
|
409 |
|
410 |
-
content
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
|
|
|
|
|
|
417 |
continue
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
|
422 |
continue
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
"url
|
431 |
-
|
432 |
-
"
|
433 |
-
|
434 |
-
|
435 |
-
except requests.exceptions.RequestException as e:
|
436 |
-
logger.error(f"Error scraping {url}: {e}")
|
437 |
-
except Exception as e:
|
438 |
-
logger.error(f"Unexpected error while scraping {url}: {e}")
|
439 |
|
440 |
page += 1
|
441 |
|
|
|
371 |
break
|
372 |
|
373 |
for result in results:
|
374 |
+
if len(scraped_content) >= num_results:
|
375 |
+
break
|
376 |
+
|
377 |
+
url = result.get('url', '')
|
378 |
+
title = result.get('title', 'No title')
|
379 |
+
|
380 |
+
if not is_valid_url(url):
|
381 |
+
logger.warning(f"Invalid URL: {url}")
|
382 |
+
continue
|
383 |
+
|
384 |
+
try:
|
385 |
+
logger.info(f"Scraping content from: {url}")
|
386 |
+
|
387 |
+
# Implement a retry mechanism with different user agents
|
388 |
+
user_agents = [
|
389 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
390 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
|
391 |
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
392 |
+
]
|
393 |
+
|
394 |
+
content = ""
|
395 |
+
for ua in user_agents:
|
396 |
+
try:
|
397 |
+
if scraper == "bs4":
|
398 |
+
session.headers.update({'User-Agent': ua})
|
399 |
+
content = scrape_with_bs4(url, session, max_chars)
|
400 |
+
else: # trafilatura
|
401 |
+
# Use urllib to handle custom headers for trafilatura
|
402 |
+
req = Request(url, headers={'User-Agent': ua})
|
403 |
+
with urlopen(req) as response:
|
404 |
+
downloaded = response.read()
|
405 |
+
|
406 |
+
# Configure trafilatura to use a specific user agent
|
407 |
+
config = use_config()
|
408 |
+
config.set("DEFAULT", "USER_AGENT", ua)
|
409 |
+
|
410 |
+
content = scrape_with_trafilatura(url, max_chars)
|
411 |
|
412 |
+
if content:
|
413 |
+
break
|
414 |
+
except requests.exceptions.HTTPError as e:
|
415 |
+
if e.response.status_code == 403:
|
416 |
+
logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
|
417 |
+
continue
|
418 |
+
else:
|
419 |
+
raise
|
420 |
+
except Exception as e:
|
421 |
+
logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
|
422 |
continue
|
423 |
+
|
424 |
+
if not content:
|
425 |
+
logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
|
|
|
426 |
continue
|
427 |
+
|
428 |
+
scraped_content.append({
|
429 |
+
"title": title,
|
430 |
+
"url": url,
|
431 |
+
"content": content, # No need to slice here as it's already limited
|
432 |
+
"scraper": scraper
|
433 |
+
})
|
434 |
+
logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
|
435 |
+
except requests.exceptions.RequestException as e:
|
436 |
+
logger.error(f"Error scraping {url}: {e}")
|
437 |
+
except Exception as e:
|
438 |
+
logger.error(f"Unexpected error while scraping {url}: {e}")
|
|
|
|
|
|
|
|
|
439 |
|
440 |
page += 1
|
441 |
|