Spaces:
Running
Running
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
# Matches the size segment of an image URL path (e.g. "/s512/") so it can be
# swapped for "/original/".
_SIZE_SEGMENT_RE = re.compile(r"/s\d+/")

# Maximum number of characters of description text kept in the result.
_DESCRIPTION_LIMIT = 500


def _image_field_urls(value):
    """Yield URL strings from the value of a JSON-LD ``image`` property.

    The value may be a plain string, an object carrying ``url`` or
    ``contentUrl``, or a list mixing both. Anything else is ignored.
    """
    if isinstance(value, str):
        if value:
            yield value
    elif isinstance(value, dict):
        yield from _image_field_urls(value.get("url") or value.get("contentUrl"))
    elif isinstance(value, list):
        for item in value:
            yield from _image_field_urls(item)


def _jsonld_image_urls(payload):
    """Yield image URLs from a decoded JSON-LD document (object or list of objects)."""
    if isinstance(payload, list):
        for entry in payload:
            yield from _jsonld_image_urls(entry)
    elif isinstance(payload, dict):
        yield from _image_field_urls(payload.get("image"))


def _iter_image_urls(soup: "BeautifulSoup"):
    """Yield candidate image URLs: JSON-LD ``image`` entries first, then ``<img>`` tags."""
    import json  # function-scope import, as in the original code

    # Method 1: structured data embedded in <script type="application/ld+json">.
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            continue
        try:
            payload = json.loads(raw)
        except (TypeError, ValueError):
            # Malformed JSON-LD is skipped; the remaining scripts are still tried.
            continue
        yield from _jsonld_image_urls(payload)

    # Method 2: <img> tags whose source contains "allegroimg.com".
    for img in soup.find_all("img"):
        src = img.get("src") or img.get("data-src")
        if src and "allegroimg.com" in src:
            # Normalize the size segment to the original-size variant.
            yield _SIZE_SEGMENT_RE.sub("/original/", src)


def _extract_parameters(soup: "BeautifulSoup") -> list:
    """Return ``"name: value"`` strings for every table row with exactly two non-empty ``<td>`` cells."""
    params = []
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) != 2:
            continue
        name = cells[0].get_text(strip=True)
        value = cells[1].get_text(strip=True)
        if name and value:
            params.append(f"{name}: {value}")
    return params


def _extract_description(soup: "BeautifulSoup") -> str:
    """Return the truncated description text, or ``"No description"`` when no matching div exists."""
    node = soup.find("div", {"itemprop": "description"})
    if node is None:
        # Fallback: any <div> whose class contains "description". The original
        # code only reached this lookup when the first one raised, so a page
        # without the itemprop div never got a description.
        node = soup.find("div", class_=re.compile("description"))
    if node is None:
        return "No description"
    return node.get_text(strip=True)[:_DESCRIPTION_LIMIT]


def scrape_allegro_with_bs4(url: str, *, timeout: float = 10):
    """Scrape a page with requests + BeautifulSoup (no Selenium), so it works on HF Spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout in seconds passed to ``requests.get`` (default 10,
            the value that was previously hard-coded).

    Returns:
        On success, a dict with the keys ``platform``, ``url``, ``title``,
        ``description``, ``parameters`` (list of ``"name: value"`` strings)
        and ``image_urls`` (de-duplicated list, in discovery order).
        If the HTTP request fails, a dict with the keys ``status``
        (``"error"``), ``error``, ``platform`` and ``url``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept-Language": "pl-PL,pl;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://allegro.pl/",
        "DNT": "1"
    }

    print(f"🔍 Scraping: {url}")
    # Only the HTTP calls can raise RequestException, so the try covers just them.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return {
            "status": "error",
            "error": f"Request failed: {str(e)}",
            "platform": "allegro",
            "url": url
        }

    soup = BeautifulSoup(response.content, "html.parser")

    # TITLE: text of the first <h1>, or a placeholder when there is none.
    heading = soup.find("h1")
    title = heading.get_text(strip=True) if heading is not None else "untitled"

    # IMAGES: dict keys de-duplicate while keeping discovery order, so the
    # returned list is deterministic (a set is not).
    image_urls = {}
    try:
        for image_url in _iter_image_urls(soup):
            image_urls[image_url] = None
    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        print(f"Błąd zdjęć: {e}")

    return {
        "platform": "allegro",
        "url": url,
        "title": title,
        "description": _extract_description(soup),
        "parameters": _extract_parameters(soup),
        "image_urls": list(image_urls)
    }
# Script entry point: read a URL from stdin, scrape it, and print the result dict.
if __name__ == "__main__":
    print(scrape_allegro_with_bs4(input("Allegro URL: ")))