Create scraperr.py
scraperr.py +109 -0
scraperr.py
ADDED
@@ -0,0 +1,109 @@
import asyncio
import random
from urllib.parse import urlparse

import aiohttp
from bs4 import BeautifulSoup
from googlesearch import search


async def fetch(url, session, retries=3):
    for attempt in range(retries):
        try:
            # raise_for_status=True turns HTTP error codes (e.g. 429) into ClientResponseError
            async with session.get(url, raise_for_status=True) as response:
                return await response.text()
        except aiohttp.ClientResponseError as e:
            if e.status == 429:  # HTTP 429: Too Many Requests
                delay = 2 ** attempt  # Exponential backoff
                print(f"Too many requests. Retrying in {delay} seconds...")
                await asyncio.sleep(delay)
            else:
                print(f"Error fetching URL: {e}")
                return None
        except aiohttp.ClientError as e:
            # Connection-level failures (DNS errors, timeouts, refused connections)
            print(f"Error fetching URL: {e}")
            return None
    print("Maximum retries exceeded. Failed to fetch URL.")
    return None


async def scrape_google(topic, num_results=20) -> str:
    # Set of domains that have already been scraped
    unique_websites = set()
    # Accumulated outlines from all scraped pages
    outlines_str = ""

    # Asynchronous HTTP session
    async with aiohttp.ClientSession() as session:
        # Perform the Google search (blocking call from the googlesearch package)
        search_results = list(search(topic, num=num_results, stop=num_results))
        # Sample at most 15 results, and never more than the search returned
        sample_size = min(num_results, 15, len(search_results))
        search_results = random.sample(search_results, sample_size)

        # Iterate through the search results
        for url in search_results:
            # Stop once the desired number of websites has been scraped
            if len(unique_websites) >= num_results:
                break
            # Scrape outlines from the URL and accumulate them
            outlines_str += await scrape_url(url, session, unique_websites)

    # Return the accumulated outlines
    return outlines_str


async def scrape_url(url, session, unique_websites) -> str:
    try:
        # Extract the main domain from the URL
        domain = urlparse(url).netloc.split('www.')[-1]

        # Skip domains that have already been processed
        if domain in unique_websites:
            return ""

        # Fetch the HTML content asynchronously
        html_content = await fetch(url, session)
        if html_content is None:
            return ""
        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        # Extract outlines (headings) from the parsed HTML
        outlines = extract_outlines(soup)

        # Skip URLs with fewer than three outlines
        if len(outlines) < 3:
            return ""

        # Filter out irrelevant outlines
        outlines = filter_outlines(outlines)
        outlines_str = ""

        # If outlines remain, accumulate them together with the website info
        if outlines:
            website_name = urlparse(url).netloc  # Website name taken from the URL
            outlines_str += f"Website: {website_name}\n"
            outlines_str += f"URL: {url}\n"
            outlines_str += "Outlines:\n"
            for outline in outlines:
                outlines_str += ". " + outline + "\n"
            outlines_str += "-" * 94 + "\n"
            # Mark the domain as processed
            unique_websites.add(domain)

        return outlines_str
    except Exception as e:
        # Log the failure and return an empty string so scraping can continue
        print("Error processing URL:", url)
        print(e)
        return ""


def is_irrelevant_outline(outline):
    # Minimum length threshold for a relevant outline
    min_length_threshold = 20
    return len(outline) < min_length_threshold


def filter_outlines(outlines):
    # Drop short outlines and deduplicate while preserving heading order
    filtered_outlines = [outline for outline in outlines if not is_irrelevant_outline(outline)]
    return list(dict.fromkeys(filtered_outlines))


def extract_outlines(soup):
    # Collect the text of every h1-h4 heading on the page
    outlines = []
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4'])
    for heading in headings:
        outlines.append(heading.text.strip())
    return outlines