Arafath10 committed on
Commit
8efa796
1 Parent(s): 9571a2a

Create scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +79 -0
scraper.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scraper.py
2
+
3
+ import asyncio
4
+ from playwright.async_api import async_playwright
5
+ from bs4 import BeautifulSoup
6
+ import requests
7
+
8
class Scraper:
    """Static helpers that extract a page's title, links and text.

    ``scrape`` is the entry point. It downloads the page with ``requests``
    and parses it with BeautifulSoup; when that static HTML contains no
    ``<a>`` elements it falls back to ``power_scrapper``, which renders the
    page in headless Chromium through Playwright.
    """

    @staticmethod
    async def power_scrapper(url):
        """Render *url* in headless Chromium and collect its links and text.

        Args:
            url: Address of the page to load.

        Returns:
            A ``(page_url, page_content)`` tuple. ``page_url`` holds the
            ``href`` attribute of every ``<a>`` element (``None`` where the
            attribute is missing). ``page_content`` holds the stripped,
            non-empty ``text_content()`` of every element below ``<body>``.
        """
        allowed_types = ("document", "script")

        async def _filter_request(route):
            # Let only the document and its scripts through; everything
            # else (images, fonts, stylesheets, ...) is aborted to speed
            # up loading.
            if route.request.resource_type in allowed_types:
                await route.continue_()
            else:
                await route.abort()

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.route("**/*", _filter_request)

                # Open the target website
                await page.goto(url, wait_until='domcontentloaded')

                # Wait a short time so scripts can populate the DOM
                await page.wait_for_timeout(1000)

                # Extract all links
                anchors = await page.query_selector_all('a')
                page_url = [
                    await anchor.get_attribute('href') for anchor in anchors
                ]

                # Extract all text content. 'body *' matches every
                # descendant, so a parent's entry also contains the text
                # of its children.
                page_content = []
                for element in await page.query_selector_all('body *'):
                    text_content = await element.text_content()
                    if text_content and text_content.strip():
                        page_content.append(text_content.strip())

                return page_url, page_content
            finally:
                # Close the browser even if navigation or extraction fails.
                await browser.close()

    @staticmethod
    def get_links(soup):
        """Return the ``href`` of every ``<a>`` element in *soup*.

        Anchors without an ``href`` attribute contribute ``None``, so the
        result has one entry per anchor.
        """
        return [anchor.get('href') for anchor in soup.find_all('a')]

    @staticmethod
    def get_text_content(soup):
        """Return the text of all paragraph, heading and span elements.

        Results are grouped by tag name in the order ``p``, ``h1`` ...
        ``h6``, ``span`` rather than in document order.
        """
        tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']
        return [
            element.get_text()
            for tag in tags
            for element in soup.find_all(tag)
        ]

    @staticmethod
    def get_title(soup):
        """Return the text of the page's ``<title>``.

        Returns an empty string when the document has no ``<title>``
        element instead of raising ``AttributeError``.
        """
        title_tag = soup.find('title')
        if title_tag is None:
            return ""
        return title_tag.get_text()

    @staticmethod
    async def scrape(url, timeout=30):
        """Scrape *url* and return its title, links and text content.

        Args:
            url: Address of the page to scrape.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                argument, so an unresponsive server cannot hang the call
                indefinitely.

        Returns:
            A dict with the keys ``"title"``, ``"URL"`` (list of hrefs) and
            ``"Content"`` (list of text fragments).
        """
        headers = {'User-Agent': 'Mozilla/5.0'}

        # requests is blocking; run it in the default executor so the event
        # loop stays responsive while the download is in flight.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: requests.get(url, headers=headers, timeout=timeout),
        )
        soup = BeautifulSoup(response.content, 'html.parser')

        title = Scraper.get_title(soup)
        links = Scraper.get_links(soup)
        text_content = Scraper.get_text_content(soup)

        # No anchors in the static HTML: retry with a real browser, since
        # the page may build its content with JavaScript.
        if not links:
            print("Running alternative scrapper")
            links, text_content = await Scraper.power_scrapper(url)

        return {"title": title, "URL": links, "Content": text_content}