import requests
import functools
import shutil
import codecs
import sys
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# URL of the web page you want to extract data from
url = "https://google.com"
use_tor_network = False
if len(sys.argv) > 1:
    url = sys.argv[1]
output_folder = urlparse(url).netloc
# initialize a session
session = requests.Session()
if use_tor_network:
    # apply a default timeout and route all traffic through the local Tor
    # SOCKS proxy (9050 is Tor's default port); socks5h resolves DNS
    # through the proxy as well
    session.request = functools.partial(session.request, timeout=30)
    session.proxies = {'http': 'socks5h://localhost:9050',
                       'https': 'socks5h://localhost:9050'}

# define the workspace from the script location
workspace = os.path.dirname(os.path.realpath(__file__))

class Extractor:
    def __init__(self, url):
        self.url = url
        self.soup = BeautifulSoup(self.get_page_content(url), "html.parser")
        self.scraped_urls = self.scrap_all_urls()

    def run(self):
        self.save_files(self.scraped_urls)
        self.save_html()

    def get_page_content(self, url):
        try:
            response = session.get(url)
            response.encoding = 'utf-8'
            return response.text
        except requests.RequestException:
            # fall back to an empty document so BeautifulSoup still gets a string
            return ""

    # collect and rewrite the script files
    def scrap_scripts(self):
        script_urls = []
        for script_tag in self.soup.find_all("script"):
            # only keep tags that have a 'src' attribute
            script_url = script_tag.attrs.get("src")
            if not script_url:
                continue
            if not script_url.startswith('http'):
                script_url = urljoin(self.url, script_url)
            new_url = self.url_to_local_path(script_url, keepQuery=True)
            if new_url:
                script_tag['src'] = new_url
                script_urls.append(script_url.split('?')[0])
        return list(dict.fromkeys(script_urls))

    # rewrite form 'action' attributes and collect their URLs
    def scrap_form_attr(self):
        urls = []
        for form_tag in self.soup.find_all("form"):
            # if the tag has the attribute 'action'
            form_url = form_tag.attrs.get("action")
            if form_url:
                if not form_url.startswith('http'):
                    form_url = urljoin(self.url, form_url)
                new_url = self.url_to_local_path(form_url, keepQuery=True)
                if new_url:
                    form_tag['action'] = new_url
                    urls.append(form_url.split('?')[0])
        return list(dict.fromkeys(urls))

    def scrap_a_attr(self):
        urls = []
        for link_tag in self.soup.find_all('a'):
            # if the tag has the attribute 'href'
            link_url = link_tag.attrs.get('href')
            if link_url:
                if not link_url.startswith('http'):
                    link_url = urljoin(self.url, link_url)
                new_url = self.url_to_local_path(link_url, keepQuery=True)
                if new_url:
                    link_tag['href'] = new_url
                    urls.append(link_url.split('?')[0])
        return list(dict.fromkeys(urls))

    def scrap_img_attr(self):
        urls = []
        for img_tag in self.soup.find_all('img'):
            # if the tag has the attribute 'src'
            img_url = img_tag.attrs.get('src')
            if img_url:
                if not img_url.startswith('http'):
                    img_url = urljoin(self.url, img_url)
                new_url = self.url_to_local_path(img_url, keepQuery=True)
                if new_url:
                    img_tag['src'] = new_url
                    urls.append(img_url.split('?')[0])
        return list(dict.fromkeys(urls))

    def scrap_link_attr(self):
        urls = []
        for link_tag in self.soup.find_all('link'):
            # if the tag has the attribute 'href'
            link_url = link_tag.attrs.get('href')
            if link_url:
                if not link_url.startswith('http'):
                    link_url = urljoin(self.url, link_url)
                new_url = self.url_to_local_path(link_url, keepQuery=True)
                if new_url:
                    link_tag['href'] = new_url
                    urls.append(link_url.split('?')[0])
        return list(dict.fromkeys(urls))

    # extract URLs from button onclick handlers like onclick="location.href='/page'"
    def scrap_btn_attr(self):
        urls = []
        for buttons in self.soup.find_all('button'):
            button_url = buttons.attrs.get('onclick')
            if not button_url:
                # skip buttons without an onclick handler instead of aborting
                continue
            # naive extraction of the target from a location.href assignment
            button_url = button_url.replace(' ', '')
            button_url = button_url[button_url.find('location.href='):].replace('location.href=', '')
            button_url = button_url.replace('\'', '').replace('"', '').replace('`', '')
            if button_url.startswith('/'):
                button_url = urljoin(self.url, button_url)
                new_url = self.url_to_local_path(button_url, keepQuery=True)
                if new_url:
                    buttons['onclick'] = new_url
                    urls.append(button_url.split('?')[0])
        return list(dict.fromkeys(urls))

    # collect asset URLs from forms, anchors, images, link tags and buttons
    def scrap_assets(self):
        assets_urls = []
        form_attr = self.scrap_form_attr()
        a_attr = self.scrap_a_attr()
        img_attr = self.scrap_img_attr()
        link_attr = self.scrap_link_attr()
        btn_attr = self.scrap_btn_attr()
        for found_urls in (form_attr, a_attr, img_attr, link_attr, btn_attr):
            if found_urls:
                assets_urls = list(set(assets_urls + found_urls))
        return assets_urls

    # scrape every URL referenced by the page
    def scrap_all_urls(self):
        urls = []
        urls.extend(self.scrap_scripts())
        urls.extend(self.scrap_assets())
        return list(dict.fromkeys(urls))

    # convert a URL into a local path relative to the output folder,
    # e.g. "https://example.com/static/app.js?v=2" -> "static/app.js?v=2"
    def url_to_local_path(self, url, keepQuery=False):
        try:
            parsed = urlparse(url)
            new_url = parsed.path
            if keepQuery and parsed.query:
                new_url += '?' + parsed.query
            # strip a leading slash or backslash so the path stays relative
            if new_url[0] in ('/', '\\'):
                new_url = new_url[1:]
        except (IndexError, ValueError):
            # empty or unparsable URLs are skipped by the callers
            return None
        return new_url

    # download a single file from a URL into output_path
    def download_file(self, url, output_path):
        # remove the query string from the URL
        url = url.split('?')[0]
        file_name = url.split('/')[-1]
        if len(file_name) == 0:
            return False
        # create the output directory if it does not exist yet
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # get the file content and save it
        response = session.get(url)
        with open(output_path, "wb") as file:
            file.write(response.content)
        print(f"Downloaded {file_name} to {os.path.relpath(output_path)}")
        return True

    def save_files(self, urls):
        # start from a clean output folder
        shutil.rmtree(os.path.join(workspace, output_folder), ignore_errors=True)
        for url in urls:
            output_path = self.url_to_local_path(url, keepQuery=False)
            if not output_path:
                continue
            output_path = os.path.join(workspace, output_folder, output_path)
            self.download_file(url, output_path)
        return True

    # save the rewritten HTML file
    def save_html(self):
        output_path = os.path.join(workspace, output_folder, 'index.html')
        # make sure the output folder exists even if no assets were downloaded
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        prettyHTML = self.soup.prettify()
        with codecs.open(output_path, 'w', 'utf-8') as file:
            file.write(prettyHTML)
        print(f"Saved index.html to {os.path.relpath(output_path)}")
        return True

extractor = Extractor(url)
print(f"Extracting files from {url}\n")
extractor.run()
print(f"\nTotal extracted files: {len(extractor.scraped_urls)}")