Shreyas094 committed
Commit 07efc76
1 Parent(s): 1a81bf1

Update app.py
Files changed (1):
  1. app.py +62 -62
app.py CHANGED
@@ -371,71 +371,71 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
                  break
 
         for result in results:
-            if len(scraped_content) >= num_results:
-                break
-
-            url = result.get('url', '')
-            title = result.get('title', 'No title')
-
-            if not is_valid_url(url):
-                logger.warning(f"Invalid URL: {url}")
-                continue
-
-            try:
-                logger.info(f"Scraping content from: {url}")
-
-                # Implement a retry mechanism with different user agents
-                user_agents = [
-                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
-                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-                ]
-
-                content = ""
-                for ua in user_agents:
-                    try:
-                        if scraper == "bs4":
-                            session.headers.update({'User-Agent': ua})
-                            content = scrape_with_bs4(url, session, max_chars)
-                        else:  # trafilatura
-                            # Use urllib to handle custom headers for trafilatura
-                            req = Request(url, headers={'User-Agent': ua})
-                            with urlopen(req) as response:
-                                downloaded = response.read()
-
-                            # Configure trafilatura to use a specific user agent
-                            config = use_config()
-                            config.set("DEFAULT", "USER_AGENT", ua)
-
-                            content = scrape_with_trafilatura(url, max_chars)
-
-                        if content:
-                            break
-                    except requests.exceptions.HTTPError as e:
-                        if e.response.status_code == 403:
-                            logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
                             continue
-                        else:
-                            raise
-                    except Exception as e:
-                        logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
                         continue
-
-                if not content:
-                    logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
                     continue
-
-                scraped_content.append({
-                    "title": title,
-                    "url": url,
-                    "content": content,  # No need to slice here as it's already limited
-                    "scraper": scraper
-                })
-                logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
-            except requests.exceptions.RequestException as e:
-                logger.error(f"Error scraping {url}: {e}")
-            except Exception as e:
-                logger.error(f"Unexpected error while scraping {url}: {e}")
 
         page += 1
                  break
 
         for result in results:
+            if len(scraped_content) >= num_results:
+                break
+
+            url = result.get('url', '')
+            title = result.get('title', 'No title')
+
+            if not is_valid_url(url):
+                logger.warning(f"Invalid URL: {url}")
+                continue
+
+            try:
+                logger.info(f"Scraping content from: {url}")
+
+                # Implement a retry mechanism with different user agents
+                user_agents = [
+                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
+                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+                ]
+
+                content = ""
+                for ua in user_agents:
+                    try:
+                        if scraper == "bs4":
+                            session.headers.update({'User-Agent': ua})
+                            content = scrape_with_bs4(url, session, max_chars)
+                        else:  # trafilatura
+                            # Use urllib to handle custom headers for trafilatura
+                            req = Request(url, headers={'User-Agent': ua})
+                            with urlopen(req) as response:
+                                downloaded = response.read()
+
+                            # Configure trafilatura to use a specific user agent
+                            config = use_config()
+                            config.set("DEFAULT", "USER_AGENT", ua)
+
+                            content = scrape_with_trafilatura(url, max_chars)
+
+                        if content:
+                            break
+                    except requests.exceptions.HTTPError as e:
+                        if e.response.status_code == 403:
+                            logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
                             continue
+                        else:
+                            raise
+                    except Exception as e:
+                        logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
                         continue
+
+                if not content:
+                    logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
                     continue
+
+                scraped_content.append({
+                    "title": title,
+                    "url": url,
+                    "content": content,  # No need to slice here as it's already limited
+                    "scraper": scraper
+                })
+                logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Error scraping {url}: {e}")
+            except Exception as e:
+                logger.error(f"Unexpected error while scraping {url}: {e}")
 
         page += 1
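The heart of the change is the retry loop that rotates browser User-Agent strings whenever a server answers 403 Forbidden. Below is a minimal standalone sketch of that pattern, assuming only the requests library; the helper name fetch_with_ua_rotation is hypothetical and not part of app.py.

import requests

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
]

def fetch_with_ua_rotation(url, timeout=10):
    """Try each User-Agent in turn; move to the next one on a 403."""
    for ua in USER_AGENTS:
        try:
            resp = requests.get(url, headers={'User-Agent': ua}, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except requests.exceptions.HTTPError as e:
            # Rotate only on 403 Forbidden; re-raise any other HTTP error.
            if e.response is not None and e.response.status_code == 403:
                continue
            raise
    return ""  # every User-Agent was rejected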
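One thing a reviewer might flag in the trafilatura branch: as far as this hunk shows, downloaded and config are prepared but never passed on, since scrape_with_trafilatura(url, max_chars) re-fetches the page with trafilatura's default User-Agent. A sketch of one way to thread the per-request config through, assuming trafilatura's fetch_url and extract accept a config argument; the user_agent parameter is an addition for illustration, not in the committed code.

import trafilatura
from trafilatura.settings import use_config

def scrape_with_trafilatura(url, max_chars, user_agent=None):
    config = use_config()
    if user_agent:
        # fetch_url reads DEFAULT/USER_AGENT from the config it is given
        config.set("DEFAULT", "USER_AGENT", user_agent)
    downloaded = trafilatura.fetch_url(url, config=config)
    if not downloaded:
        return ""
    text = trafilatura.extract(downloaded, config=config) or ""
    return text[:max_chars]  # enforce the caller's character budget

With this variant, the urlopen round trip and the dangling config in the loop body could be dropped, and the call site would become scrape_with_trafilatura(url, max_chars, user_agent=ua).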