lara1510 committed on
Commit
0bf16bf
1 Parent(s): dfe8078

Create scraperr.py

Files changed (1)
  1. scraperr.py +109 -0
scraperr.py ADDED
@@ -0,0 +1,109 @@
import asyncio
import aiohttp
import random
from bs4 import BeautifulSoup
from googlesearch import search
from urllib.parse import urlparse


async def fetch(url, session, retries=3):
    for attempt in range(retries):
        try:
            # raise_for_status makes 4xx/5xx responses raise ClientResponseError,
            # which the retry logic below depends on
            async with session.get(url, raise_for_status=True) as response:
                return await response.text()
        except aiohttp.ClientResponseError as e:
            if e.status == 429:  # HTTP 429: Too Many Requests
                delay = 2 ** attempt  # Exponential backoff
                print(f"Too many requests. Retrying in {delay} seconds...")
                await asyncio.sleep(delay)
            else:
                print(f"Error fetching URL: {e}")
                return None
    print("Maximum retries exceeded. Failed to fetch URL.")
    return None


async def scrape_google(topic, num_results=20) -> str:
    # Set of domains that have already been scraped
    unique_websites = set()
    # Accumulated outline text for all scraped pages
    outlines_str = ""

    # Asynchronous HTTP session shared by all requests
    async with aiohttp.ClientSession() as session:
        # Perform the Google search
        search_results = list(search(topic, num=num_results, stop=num_results))
        # Randomly sample at most 15 of the returned URLs (never more than are available)
        search_results = random.sample(search_results, min(num_results, 15, len(search_results)))

        # Iterate through the sampled search results
        for url in search_results:
            # Exit the loop once the desired number of websites has been scraped
            if len(unique_websites) >= num_results:
                break
            # Scrape outlines from the URL and accumulate them
            outlines_str += await scrape_url(url, session, unique_websites)

    # Return the accumulated outlines string
    return outlines_str


async def scrape_url(url, session, unique_websites) -> str:
    try:
        # Extract the main domain from the URL
        domain = urlparse(url).netloc.split('www.')[-1].split('/')[0]

        # Skip domains that have already been processed
        if domain in unique_websites:
            return ""

        # Fetch the HTML content asynchronously; give up on this URL if the fetch failed
        html_content = await fetch(url, session)
        if html_content is None:
            return ""
        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        # Extract outlines (headings) from the parsed HTML
        outlines = extract_outlines(soup)

        # Skip pages with fewer than three outlines
        if len(outlines) < 3:
            return ""

        # Filter out irrelevant outlines
        outlines = filter_outlines(outlines)
        outlines_str = ""

        # If outlines remain, accumulate them together with the website info
        if outlines:
            website_name = urlparse(url).netloc  # Extract the website name from the URL
            outlines_str += f"Website: {website_name}\n"
            outlines_str += f"URL: {url}\n"
            outlines_str += "Outlines:\n"
            for outline in outlines:
                outlines_str += ". " + outline + "\n"
            outlines_str += "----------------------------------------------------------------------------------------------\n"
            # Add the domain to the set of unique websites
            unique_websites.add(domain)

        return outlines_str
    except Exception as e:
        # Log the failure and skip this URL
        print("Error processing URL:", url)
        print(e)
        return ""


def is_irrelevant_outline(outline):
    # Minimum length threshold for a relevant outline
    min_length_threshold = 20
    return len(outline) < min_length_threshold


def filter_outlines(outlines):
    filtered_outlines = [outline for outline in outlines if not is_irrelevant_outline(outline)]
    filtered_outlines = list(set(filtered_outlines))
    return filtered_outlines


def extract_outlines(soup):
    outlines = []
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4'])
    for heading in headings:
        outlines.append(heading.text.strip())
    return outlines
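
The committed file defines only coroutines and helpers, with no entry point. A minimal sketch of how scrape_google might be driven from a separate script (the import assumes the file is on the path as scraperr, and the topic string is only a placeholder):

import asyncio

from scraperr import scrape_google

# Hypothetical driver: run one search-and-scrape pass and print the collected outlines.
if __name__ == "__main__":
    outlines = asyncio.run(scrape_google("web scraping best practices", num_results=10))
    print(outlines)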