Spaces:

kenken999
/

fastapi_django_main_live

Sleeping

App Files Files Community

fastapi_django_main_live / polls /crawl.py

kenken999

gpt

25e1c5e 5 months ago

raw

history blame

1.9 kB

	import requests
	from bs4 import BeautifulSoup
	import json
	from selenium import webdriver
	from selenium.webdriver.chrome.options import Options
	import time
	import re
	import os

	# URL of the website to crawl (placeholder value; replace with the real target)
	url = 'https://XXX.com'

	# Configure Chrome to run headless, with the extra flags passed below
	chrome_options = Options()
	chrome_options.add_argument('--headless')
	chrome_options.add_argument('--no-sandbox')
	chrome_options.add_argument('--disable-dev-shm-usage')

	# Path to the chromedriver executable.
	# NOTE(review): this value is not passed to webdriver.Chrome below, so it
	# currently has no effect on which driver binary is used -- confirm intent.
	chromedriver_path = '/usr/local/bin/chromedriver'

	# Create a new Chrome instance
	driver = webdriver.Chrome(options=chrome_options)
	try:
		# Load the website
		driver.get(url)

		# Fixed pause to give the page time to render before reading the DOM
		time.sleep(5)

		# Extract the rendered HTML
		html = driver.page_source
	finally:
		# Always shut the browser down, even if loading the page failed,
		# so a failed run does not leave a Chrome process behind
		driver.quit()

	# Parse the rendered HTML using BeautifulSoup
	soup = BeautifulSoup(html, 'html.parser')

	# Maps a hierarchical heading key ("<h1 text>-<h2 text>-...") to the text of
	# the elements that follow that heading. A repeated key overwrites the
	# earlier entry.
	data = {}

	# Heading tags that are collected, ordered from shallowest to deepest
	heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5']

	# Collect every h1-h5 tag
	headings = soup.find_all(heading_tags)

	# Text of the most recent heading seen at each level (index 0 is h1)
	heading_texts = [''] * len(heading_tags)

	for heading in headings:
		level = heading_tags.index(heading.name)
		heading_texts[level] = heading.text

		# A heading opens a new section at its level, so forget the deeper
		# headings of the previous section; otherwise their stale text would
		# leak into keys built later in this section.
		for deeper in range(level + 1, len(heading_texts)):
			heading_texts[deeper] = ''

		# The key is the chain of heading texts from h1 down to this level.
		# For h5 this includes the h4 text, which the previous hand-written
		# key left out.
		key = '-'.join(heading_texts[:level + 1])

		# Gather the text of the following sibling elements, stopping at the
		# next heading tag
		parts = []
		sibling = heading.find_next_sibling()
		while sibling is not None and sibling.name not in heading_tags:
			parts.append(sibling.text)
			sibling = sibling.find_next_sibling()

		data[key] = ''.join(parts).strip()

	print(len(data), data.keys())