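"""Download web pages and feed their visible text into the superbooga document collector."""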
import concurrent.futures
import re

import requests
from bs4 import BeautifulSoup

import extensions.superboogav2.parameters as parameters

from .data_processor import process_and_add_to_collector
from .utils import create_metadata_source


def _download_single(url):
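    """Fetch a single URL and return its raw content, raising on any non-200 status."""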
    response = requests.get(url, timeout=5)
    if response.status_code == 200:
        return response.content
    else:
        raise Exception(f"Failed to download URL: {url}")


def _download_urls(urls, threads=1):
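    """Download the URLs concurrently, yielding (progress, results) after each completed download."""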
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = []
        for url in urls:
            future = executor.submit(_download_single, url)
            futures.append(future)

        results = []
        i = 0
        for future in concurrent.futures.as_completed(futures):
            try:
                result = future.result()
                results.append(result)
                i += 1
                yield f"{i}/{len(urls)}", results
            except Exception:
                # Skip URLs that failed to download or timed out.
                pass

        yield "Done", results


def feed_url_into_collector(urls, collector):
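    """Download the newline-separated URLs, strip their HTML down to text, and add it to the collector."""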
    all_text = ''
    cumulative = ''

    urls = urls.strip().split('\n')
    cumulative += f'Loading {len(urls)} URLs with {parameters.get_num_threads()} threads...\n\n'
    yield cumulative
    for update, contents in _download_urls(urls, threads=parameters.get_num_threads()):
        yield cumulative + update

    cumulative += 'Processing the HTML sources...'
    yield cumulative
    # `contents` now holds the results gathered by the final iteration of the download loop.
    for content in contents:
        soup = BeautifulSoup(content, features="lxml")

        # Strip script and style tags so only the visible text remains.
        for script in soup(["script", "style"]):
            script.extract()

        strings = soup.stripped_strings
        if parameters.get_is_strong_cleanup():
            # Keep only strings containing a letter followed by a space, which
            # filters out most navigation fragments and isolated labels.
            strings = [s for s in strings if re.search("[A-Za-z] ", s)]

        text = '\n'.join([s.strip() for s in strings])
        all_text += text

    process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download'))