# Install the Playwright browsers and the system libraries Chromium needs.
# This runs at import time so a working browser is available before the app starts.
import subprocess

subprocess.run(["playwright", "install"])
subprocess.run(["playwright", "install-deps"])
subprocess.run(
    "apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 "
    "libatk-bridge2.0-0 libcups2 libatspi2.0-0 libxcomposite1 libxdamage1",
    shell=True,
    check=True,
)

import asyncio
import os
import json
import gradio as gr
from playwright.async_api import async_playwright
from urllib.parse import quote_plus

# Desktop Chrome user-agent string so requests look like a regular browser.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)

def build_url(input_str):
    # Use the input directly if it is already a URL; otherwise treat it as a
    # search term and build a DeviantArt search URL from it.
    input_str = input_str.strip()
    if input_str.startswith("http://") or input_str.startswith("https://"):
        return input_str
    else:
        categoria = quote_plus(input_str)
        return f"https://www.deviantart.com/search?q={categoria}"

async def scrape_images(url, max_imgs):
    # Keep the requested image count within a sane range.
    max_imgs = max(10, min(max_imgs, 1300))
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent=USER_AGENT,
            viewport={"width": 1366, "height": 768},
        )
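        # Optional login cookies: the COOKIES_JSON environment variable may hold a
        # JSON array of cookie objects accepted by context.add_cookies. Illustrative
        # value (names and values are made up):
        #   [{"name": "auth", "value": "abc123", "domain": ".deviantart.com",
        #     "path": "/", "sameSite": "Lax"}]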

        cookies_env = os.getenv("COOKIES_JSON")
        if cookies_env:
            try:
                cookies = json.loads(cookies_env)
                for cookie in cookies:
                    # Playwright expects sameSite to be "Strict", "Lax" or "None".
                    cookie["sameSite"] = "None" if cookie.get("sameSite") is None else cookie["sameSite"].capitalize()
                await context.add_cookies(cookies)
                print("✅ Cookies loaded from environment variable")
            except Exception as e:
                print(f"⚠️ Error loading cookies from environment variable: {e}")

        page = await context.new_page()
        await page.set_extra_http_headers({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.deviantart.com/",
        })

        await page.goto(url, timeout=60000)
        # Wait until network activity settles so the first batch of results is rendered.
        await page.wait_for_load_state("networkidle")

        collected_data = []
        seen_urls = set()
        scroll_attempts = 0
        max_scroll_attempts = 30

        # Scroll and harvest image data until we have enough or run out of attempts.
        while len(collected_data) < max_imgs and scroll_attempts < max_scroll_attempts:
            # Collect every <img> with a srcset, keeping the last (usually largest) candidate.
            new_items = await page.evaluate("""() => Array.from(document.querySelectorAll('img[srcset]')).map(img => ({
                img_url: img.srcset.split(', ').pop().split(' ')[0],
                user: img.alt || "Unknown"
            }))""")

            # Deduplicate by image URL.
            for item in new_items:
                if item["img_url"] not in seen_urls:
                    collected_data.append(item)
                    seen_urls.add(item["img_url"])

            if len(collected_data) < max_imgs:
                # Scroll down to trigger lazy loading of more results.
                await page.evaluate("""window.scrollBy({top: window.innerHeight * 1.5, behavior: 'smooth'});""")
                await page.wait_for_timeout(3000)
                scroll_attempts += 1
                try:
                    # Wait briefly for new images to appear; ignore timeouts.
                    await page.wait_for_selector('img[srcset]:not([data-loaded])', timeout=5000)
                except Exception:
                    pass

        await browser.close()
        return collected_data[:max_imgs]

def run_scraper(user_input, max_imgs):
    url = build_url(user_input)
    print(f"Using URL: {url}")
    return asyncio.run(scrape_images(url, int(max_imgs)))

def interface_fn(user_input, max_imgs):
    # Return the scraped data as a pretty-printed JSON string for the UI.
    results = run_scraper(user_input, max_imgs)
    return json.dumps(results, indent=2, ensure_ascii=False)
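# Illustrative output of interface_fn (URLs and alt texts are made up):
# [
#   {"img_url": "https://example.com/art-large.jpg", "user": "Sample artwork title"},
#   {"img_url": "https://example.com/other-art.jpg", "user": "Unknown"}
# ]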

demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.Textbox(label="DeviantArt URL or Category", lines=1,
                   placeholder="Paste a URL or type a category or username"),
        gr.Slider(minimum=10, maximum=300, step=1, value=30, label="Maximum number of images")
    ],
    outputs=gr.Code(label="Resulting JSON", language="json"),
    title="Image Scraper - JSON Only",
    description="Enter a URL or category. Returns JSON with the data of the images found."
)

if __name__ == "__main__":
    demo.launch()