Spaces:

hatamo
/

Antique_Auth_API

Running

App Files Files Community

Antique_Auth_API / code /web_scraper_allegro.py

hatamo

Modified scraper for Allegro

948dcae 10 days ago

raw

history blame contribute delete

3.74 kB

	import requests
	from bs4 import BeautifulSoup
	import re

	# Browser-like request headers sent with every listing fetch.
	_ALLEGRO_HEADERS = {
	    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
	    "Accept-Language": "pl-PL,pl;q=0.9",
	    # The catch-all media range has to be spelled "*/*"; a bare "/" is not
	    # a valid entry in an Accept header.
	    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	    "Referer": "https://allegro.pl/",
	    "DNT": "1",
	}

	# Size segment of an image URL path (e.g. "/s128/"), rewritten to "/original/".
	_SIZE_SEGMENT_RE = re.compile(r"/s\d+/")

	# Maximum number of characters of description text that is returned.
	_DESCRIPTION_LIMIT = 500


	def _extract_title(soup):
	    """Return the text of the first <h1>, or "untitled" when there is none."""
	    heading = soup.find("h1")
	    if heading is None:
	        return "untitled"
	    return heading.get_text(strip=True)


	def _iter_image_strings(image):
	    """Yield URL strings from a JSON-LD "image" value.

	    Accepts a plain string, an object carrying "url"/"contentUrl", or a
	    (possibly nested) list of those; anything else yields nothing.
	    """
	    if isinstance(image, str):
	        if image:
	            yield image
	    elif isinstance(image, dict):
	        yield from _iter_image_strings(image.get("url") or image.get("contentUrl"))
	    elif isinstance(image, list):
	        for item in image:
	            yield from _iter_image_strings(item)


	def _collect_jsonld_images(soup, found):
	    """Add image URLs declared in JSON-LD <script> tags to ``found``.

	    ``found`` is a dict used as an insertion-ordered set (URL -> None).
	    """
	    # Function-scope import: json is not among the module's top-level imports.
	    import json

	    for script in soup.find_all("script", type="application/ld+json"):
	        raw = script.string
	        if not raw:
	            # ``.string`` is None/empty when there is no single text child.
	            continue
	        try:
	            payload = json.loads(raw)
	        except ValueError:
	            # Malformed JSON-LD in one tag must not stop the other tags.
	            continue
	        # A top-level array may hold several nodes; inspect each of them
	        # instead of only the first one.
	        entries = payload if isinstance(payload, list) else [payload]
	        for entry in entries:
	            if isinstance(entry, dict) and "image" in entry:
	                for image_url in _iter_image_strings(entry["image"]):
	                    found.setdefault(image_url, None)


	def _collect_img_tag_images(soup, found):
	    """Add <img> sources hosted on allegroimg.com to ``found``, normalized to "/original/"."""
	    for img in soup.find_all("img"):
	        src = img.get("src") or img.get("data-src")
	        if src and "allegroimg.com" in src:
	            found.setdefault(_SIZE_SEGMENT_RE.sub("/original/", src), None)


	def _collect_parameters(soup, params):
	    """Append "name: value" strings to ``params`` for every table row with exactly two <td> cells."""
	    for row in soup.find_all("tr"):
	        cells = row.find_all("td")
	        if len(cells) != 2:
	            continue
	        name = cells[0].get_text(strip=True)
	        value = cells[1].get_text(strip=True)
	        if name and value:
	            params.append(f"{name}: {value}")


	def _extract_description(soup):
	    """Return up to ``_DESCRIPTION_LIMIT`` characters of description text.

	    Tries the ``itemprop="description"`` div first and then any div whose
	    class matches "description"; returns "No description" when neither
	    yields text.
	    """
	    lookups = (
	        lambda: soup.find("div", {"itemprop": "description"}),
	        lambda: soup.find("div", class_=re.compile("description")),
	    )
	    for lookup in lookups:
	        node = lookup()
	        if node is None:
	            continue
	        text = node.get_text(strip=True)
	        if text:
	            return text[:_DESCRIPTION_LIMIT]
	    return "No description"


	def scrape_allegro_with_bs4(url: str, timeout: float = 10):
	    """Scrape an Allegro listing with requests + BeautifulSoup (no Selenium).

	    The original note on this function states that this approach works on
	    HF Spaces.

	    Args:
	        url: Address of the listing page to fetch.
	        timeout: Timeout in seconds passed to ``requests.get``.

	    Returns:
	        On success, a dict with the keys "platform", "url", "title",
	        "description", "parameters" (list of "name: value" strings) and
	        "image_urls" (de-duplicated list in discovery order).
	        When the HTTP request fails, a dict with the keys "status"
	        ("error"), "error", "platform" and "url".

	    Each extraction step is best-effort: a failure in one step is printed
	    and leaves that field at its default instead of aborting the scrape.
	    """
	    try:
	        print(f"🔍 Scraping: {url}")
	        response = requests.get(url, headers=_ALLEGRO_HEADERS, timeout=timeout)
	        response.raise_for_status()
	        html = response.content
	    except requests.exceptions.RequestException as e:
	        return {
	            "status": "error",
	            "error": f"Request failed: {str(e)}",
	            "platform": "allegro",
	            "url": url
	        }

	    soup = BeautifulSoup(html, "html.parser")

	    # TITLE
	    title = "untitled"
	    try:
	        title = _extract_title(soup)
	    except Exception as e:
	        print(f"Title extraction failed: {e}")

	    # IMAGES (JSON-LD first, then <img> tags); the dict keeps discovery order
	    # so the returned list is deterministic between runs.
	    found_images = {}
	    try:
	        _collect_jsonld_images(soup, found_images)
	        _collect_img_tag_images(soup, found_images)
	    except Exception as e:
	        print(f"Błąd zdjęć: {e}")

	    # PARAMETERS (usually rendered as a table)
	    params = []
	    try:
	        _collect_parameters(soup, params)
	    except Exception as e:
	        print(f"Parameter extraction failed: {e}")

	    # DESCRIPTION
	    description = "No description"
	    try:
	        description = _extract_description(soup)
	    except Exception as e:
	        print(f"Description extraction failed: {e}")

	    return {
	        "platform": "allegro",
	        "url": url,
	        "title": title,
	        "description": description,
	        "parameters": params,
	        "image_urls": list(found_images)
	    }


	if __name__ == "__main__":
	    # Manual check: prompt for a listing URL and print the scraped result dict.
	    print(scrape_allegro_with_bs4(input("Allegro URL: ")))