Spaces:
Sleeping
Sleeping
import requests | |
from bs4 import BeautifulSoup | |
import json | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
import time | |
import re | |
import os | |
# URL of the website to scrape (placeholder value; replace before running).
url = 'https://XXX.com'

# Chrome options: run headless, with the sandbox and /dev/shm usage disabled.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Path to the chromedriver executable.
# NOTE(review): this value is never passed to webdriver.Chrome() below, so it
# currently has no effect; kept so the module-level name stays available.
chromedriver_path = '/usr/local/bin/chromedriver'

# Start Chrome and make sure it is always shut down, even when loading the
# page or reading its source raises.
driver = webdriver.Chrome(options=chrome_options)
try:
    # Load the website.
    driver.get(url)
    # Fixed pause to give client-side rendering time to finish.
    time.sleep(5)
    # Grab the HTML as rendered by the browser.
    html = driver.page_source
finally:
    # Close the Chrome instance.
    driver.quit()

# Parse the rendered HTML with BeautifulSoup.
soup = BeautifulSoup(html, 'html.parser')
# Maps a heading path ("h1 text-h2 text-...") to the text of the elements
# that follow that heading, up to the next heading sibling.
data = {}

# Collect every h1-h5 heading tag.
heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5']
headings = soup.find_all(heading_tags)

# levels[i] holds the text of the most recently seen heading of depth i + 1.
levels = [''] * len(heading_tags)

for heading in headings:
    depth = heading_tags.index(heading.name) + 1
    levels[depth - 1] = heading.text
    # A new heading starts a new section: forget deeper headings that belonged
    # to the previous section so they cannot leak into later keys.
    for deeper in range(depth, len(levels)):
        levels[deeper] = ''
    # The key is the full path from h1 down to this heading; an h5 key
    # therefore includes its h4 ancestor, like every other level.
    key = '-'.join(levels[:depth])

    # Gather the text of the following siblings until the next heading.
    parts = []
    sibling = heading.find_next_sibling()
    while sibling and sibling.name not in heading_tags:
        parts.append(sibling.text)
        sibling = sibling.find_next_sibling()

    # A repeated heading path overwrites the earlier entry.
    data[key] = ''.join(parts).strip()

print(len(data), (data.keys()))