| import asyncio |
| import json |
| import os |
| import base64 |
| import sys |
| from pathlib import Path |
| from fastapi import FastAPI, WebSocket, WebSocketDisconnect, UploadFile, Form |
| from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse |
| from playwright.async_api import async_playwright, Page |
|
|
| app = FastAPI() |
|
|
# HTTP port: taken from the PORT environment variable, 7860 when it is unset.
PORT = int(os.environ.get("PORT", 7860))

# Working directories are created under the process's current directory.
BASE_DIR = Path.cwd()
SCRIPTS_DIR = BASE_DIR / "scripts"
RESULTS_DIR = BASE_DIR / "results"
for _work_dir in (SCRIPTS_DIR, RESULTS_DIR):
    _work_dir.mkdir(exist_ok=True)
|
|
| |
# Playwright handles shared by the whole process; they stay None until
# start_browser() assigns them.
pw = browser = context = None

pages = []         # open Page objects, in tab order
active_tab = 0     # index into `pages` of the tab currently shown
console_logs = {}  # id(page) -> list of pending console entries
network_logs = {}  # id(page) -> list of pending request/response entries
|
|
| |
| |
| |
def resolve_url(q: str) -> str:
    """Turn address-bar input into a URL the browser can navigate to.

    Rules, applied in order:

    * blank input -> ``https://example.com``
    * input already starting with ``http://`` or ``https://`` -> returned
      unchanged (apart from surrounding whitespace)
    * ``localhost`` with optional port/path -> prefixed with ``http://``
    * any other single token containing a dot -> prefixed with ``https://``
    * everything else -> a Google search URL

    Args:
        q: Raw text typed by the user.

    Returns:
        An absolute URL.
    """
    from urllib.parse import quote_plus

    q = q.strip()
    if not q:
        return "https://example.com"
    if q.startswith(("http://", "https://")):
        return q
    if " " not in q:
        # Host part is whatever precedes the first "/" or ":".
        host = q.split("/", 1)[0].split(":", 1)[0]
        if host == "localhost":
            # Local dev servers normally speak plain HTTP.
            return "http://" + q
        if "." in q:
            return "https://" + q
    # quote_plus escapes "&", "+", "#", "=" etc. so they stay inside the
    # query value instead of being parsed as URL syntax.
    return f"https://www.google.com/search?q={quote_plus(q)}"
|
|
async def setup_page(page: Page, page_id: int):
    """Attach console and network listeners to a page.

    Events are buffered in the module-level ``console_logs`` and
    ``network_logs`` dicts under the key ``id(page)``. Each buffer keeps only
    the most recent entries so it cannot grow without bound while nothing is
    draining it, and both buffers are dropped when the page closes.

    Args:
        page: The Playwright page to instrument.
        page_id: Tab index supplied by callers; not used here, kept for
            interface compatibility.
    """
    max_entries = 500  # per-page cap for each buffer
    pid = id(page)
    console_logs[pid] = []
    network_logs[pid] = []

    def record(store, entry):
        # Look the list up on every event: other code rebinds store[pid] to
        # a fresh list when it drains or clears the logs.
        bucket = store.get(pid)
        if bucket is None:
            # Buffers were released by handle_close(); ignore late events.
            return
        bucket.append(entry)
        if len(bucket) > max_entries:
            # Trim in place so the buffer stays a plain list.
            del bucket[:-max_entries]

    def handle_console(msg):
        record(console_logs, {
            "type": msg.type,
            "text": msg.text,
            "args": [str(a) for a in msg.args[:5]],
        })

    def handle_request(req):
        record(network_logs, {
            "type": "request",
            "url": req.url,
            "method": req.method,
            "resourceType": req.resource_type,
        })

    def handle_response(res):
        record(network_logs, {
            "type": "response",
            "url": res.url,
            "status": res.status,
            "ok": res.ok,
        })

    def handle_close(*_args):
        # Release the buffers so entries for closed pages do not pile up.
        console_logs.pop(pid, None)
        network_logs.pop(pid, None)

    page.on("console", handle_console)
    page.on("request", handle_request)
    page.on("response", handle_response)
    page.on("close", handle_close)
    # Pass-through route: every request is continued unmodified.
    # NOTE(review): Playwright documents that enabling routing disables the
    # HTTP cache - confirm this interception is actually wanted.
    await page.route("**/*", lambda route: route.continue_())
|
|
async def send_tabs(ws: WebSocket):
    """Send the current tab list to the client.

    Emits one JSON message ``{"type": "tabs", "tabs": [...], "active": n}``
    where each tab is ``{"url", "title"}``. Closed pages are reported with
    the title ``"Closed"``; pages whose state cannot be read are reported
    with the title ``"Error"``.

    Args:
        ws: The client WebSocket to send to.
    """
    tabs = []
    # Iterate over a snapshot: `pages` can be mutated by another task while
    # this coroutine is suspended in `await p.title()`.
    for p in list(pages):
        try:
            if p.is_closed():
                tabs.append({"url": "about:blank", "title": "Closed"})
            else:
                tabs.append({"url": p.url, "title": await p.title()})
        except Exception:
            # `Exception`, not a bare except: task cancellation
            # (asyncio.CancelledError) must propagate to the caller.
            tabs.append({"url": "about:blank", "title": "Error"})
    await ws.send_json({"type": "tabs", "tabs": tabs, "active": active_tab})
|
|
def list_scripts():
    """Return the file names of the ``*.js`` files stored in SCRIPTS_DIR."""
    names = []
    for candidate in SCRIPTS_DIR.glob("*.js"):
        # Skip anything matching the pattern that is not a regular file.
        if candidate.is_file():
            names.append(candidate.name)
    return names
|
|
| |
| |
| |
async def start_browser():
    """Launch headless Chromium and open the first tab.

    Assigns the module-level ``pw``, ``browser`` and ``context`` handles and
    appends the first page to ``pages``. The page is registered before the
    initial navigation, so a failed load of the start URL (for example no
    network) still leaves the server with a usable tab instead of aborting
    startup.
    """
    global pw, browser, context, pages
    print("π Starting Playwright browser...")
    pw = await async_playwright().start()
    browser = await pw.chromium.launch(
        headless=True,
        args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"],
    )
    context = await browser.new_context(
        viewport={"width": 1280, "height": 800},
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    )
    page = await context.new_page()
    await setup_page(page, 0)
    pages.append(page)
    try:
        await page.goto("https://example.com")
    except Exception as e:
        # Best effort: the tab stays open and the user can navigate later.
        print(f"Initial navigation failed: {e}")
    print("✅ Browser ready with 1 page")
|
|
@app.on_event("startup")
async def startup():
    """Application startup hook: bring up the shared browser."""
    await start_browser()
|
|
@app.on_event("shutdown")
async def shutdown():
    """Application shutdown hook: close the browser, then stop Playwright.

    Stopping Playwright is done in a ``finally`` so it still happens when
    closing the browser raises.
    """
    print("π Shutting down browser...")
    try:
        if browser:
            await browser.close()
    finally:
        if pw:
            await pw.stop()
|
|
| |
| |
| |
# Single-page control UI returned by the "/" route: toolbar, a canvas that
# displays the streamed screenshots, a tab bar, and a sidebar with console,
# network and script panels. The embedded script talks to the /ws endpoint.
HTML = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>π·οΈ HF Scraping Browser</title>
<style>
*{box-sizing:border-box;margin:0;padding:0}
body{font-family:system-ui,sans-serif;background:#0f0f13;color:#e0e0e0;overflow:hidden;height:100vh;display:flex;flex-direction:column}
#toolbar{display:flex;gap:8px;padding:8px;background:#1a1a24;border-bottom:1px solid #333;align-items:center;flex-wrap:wrap}
#toolbar input{flex:1;min-width:200px;padding:8px 12px;background:#252535;border:1px solid #444;border-radius:4px;color:white;font-size:14px}
#toolbar button{padding:8px 14px;background:#3a3a5a;border:none;border-radius:4px;color:white;cursor:pointer;font-size:13px}
#toolbar button:hover{background:#4a4a6a}
#toolbar button.primary{background:#5d5dff}
#toolbar button.primary:hover{background:#7070ff}
#main{display:flex;flex:1;overflow:hidden}
#browser{flex:1;display:flex;flex-direction:column}
#canvas{flex:1;background:#000;cursor:crosshair}
#sidebar{width:380px;background:#15151e;border-left:1px solid #333;display:flex;flex-direction:column;overflow:hidden}
.sidebar-tab{padding:10px 14px;background:#1e1e2a;border-bottom:1px solid #333;cursor:pointer;font-size:13px}
.sidebar-tab.active{background:#2a2a3a;border-bottom-color:#5d5dff;color:#5d5dff}
.sidebar-content{flex:1;overflow-y:auto;padding:12px;display:none}
.sidebar-content.active{display:block}
.log-entry{padding:6px 8px;margin:4px 0;background:#1e1e2a;border-radius:3px;font-size:12px;font-family:monospace;white-space:pre-wrap;word-break:break-all}
.log-entry.console{border-left:3px solid #5d5dff}
.log-entry.network{border-left:3px solid #4caf50}
#js-editor{width:100%;height:180px;background:#1e1e2a;border:1px solid #444;border-radius:4px;color:#e0e0e0;padding:10px;font-family:monospace;font-size:13px;resize:vertical}
#js-output{background:#0d0d12;padding:10px;border-radius:4px;margin-top:8px;font-family:monospace;font-size:12px;max-height:180px;overflow-y:auto;white-space:pre-wrap}
#file-list{max-height:120px;overflow-y:auto}
.file-item{padding:6px 8px;margin:3px 0;background:#1e1e2a;border-radius:3px;font-size:12px;display:flex;justify-content:space-between}
.file-item button{padding:2px 6px;font-size:11px}
#tabs-bar{display:flex;gap:2px;padding:4px 8px;background:#1a1a24;border-top:1px solid #333;overflow-x:auto}
.tab{padding:6px 12px;background:#252535;border-radius:4px 4px 0 0;cursor:pointer;font-size:12px;white-space:nowrap;display:flex;align-items:center;gap:6px}
.tab.active{background:#3a3a5a;color:#5d5dff}
.tab-close{cursor:pointer;opacity:0.7}
.tab-close:hover{opacity:1;color:#f44}
.status{font-size:11px;color:#888;padding:4px 8px}
.hidden{display:none!important}
::-webkit-scrollbar{width:6px}
::-webkit-scrollbar-track{background:#1a1a24}
::-webkit-scrollbar-thumb{background:#444;border-radius:3px}
</style>
</head>
<body>

<div id="toolbar">
  <button onclick="nav('back')">β</button>
  <button onclick="nav('forward')">β</button>
  <button onclick="nav('reload')">β³</button>
  <input id="url" placeholder="Enter URL or search..." onkeydown="if(event.key==='Enter')goto()">
  <button class="primary" onclick="goto()">Go</button>
  <button onclick="toggleSidebar()">β° Panels</button>
  <button onclick="executeJS()">βΆ Run JS</button>
  <button onclick="downloadPage()">πΎ Save HTML</button>
  <div class="status" id="status">Connecting...</div>
</div>

<div id="main">
  <div id="browser">
    <canvas id="canvas"></canvas>
    <div id="tabs-bar"></div>
  </div>
  <div id="sidebar">
    <div style="display:flex">
      <div class="sidebar-tab active" data-panel="console" onclick="showPanel('console')">Console</div>
      <div class="sidebar-tab" data-panel="network" onclick="showPanel('network')">Network</div>
      <div class="sidebar-tab" data-panel="scripts" onclick="showPanel('scripts')">Scripts</div>
    </div>
    <div id="panel-console" class="sidebar-content active"></div>
    <div id="panel-network" class="sidebar-content"><div id="network-list"></div></div>
    <div id="panel-scripts" class="sidebar-content">
      <textarea id="js-editor" placeholder="// Write JS here return document.title;">return document.title;</textarea>
      <div style="display:flex;gap:8px;margin:8px 0">
        <button onclick="executeJS()" style="flex:1" class="primary">Execute</button>
        <button onclick="loadSample('scraper')">Sample</button>
      </div>
      <div id="js-output">Results here...</div>
      <hr style="border-color:#333;margin:12px 0">
      <div style="display:flex;gap:4px;margin-bottom:8px">
        <input type="file" id="js-file" accept=".js" style="display:none" onchange="uploadJS(this)">
        <button onclick="document.getElementById('js-file').click()">π Upload</button>
        <button onclick="saveJS()">πΎ Save</button>
      </div>
      <div id="file-list"></div>
    </div>
  </div>
</div>

<script>
const protocol = location.protocol === 'https:' ? 'wss:' : 'ws:';
const ws = new WebSocket(`${protocol}//${location.host}/ws`);
const canvas = document.getElementById('canvas');
const ctx = canvas.getContext('2d');
const SIDEBAR_WIDTH = 380;

// Pixel size of the most recent screenshot; used to map canvas clicks
// back to page coordinates.
let frameW = 0, frameH = 0;
// True while a load-file request is outstanding. The server answers it
// with a js-result message carrying the file text.
let pendingScriptLoad = false;

function resize(){
  // Only reserve room for the sidebar while it is visible.
  const sidebarHidden = document.getElementById('sidebar').classList.contains('hidden');
  canvas.width = Math.max(300, window.innerWidth - (sidebarHidden ? 0 : SIDEBAR_WIDTH));
  canvas.height = window.innerHeight - 80;
}
window.onresize = resize;
resize();

ws.onopen = () => {
  console.log('✅ WS connected');
  setStatus('Connected');
  send({t:'init'});
};
ws.onclose = () => { setStatus('Disconnected'); setTimeout(()=>location.reload(),2000); };
ws.onerror = (e) => { console.error('WS error',e); setStatus('Connection error'); };

ws.onmessage = (e) => {
  if(e.data instanceof Blob){
    // Binary frames are JPEG screenshots. Each object URL is released as
    // soon as the image has been drawn, otherwise every frame would leak.
    const frameUrl = URL.createObjectURL(e.data);
    const img = new Image();
    img.onload = () => {
      frameW = img.naturalWidth;
      frameH = img.naturalHeight;
      ctx.drawImage(img,0,0,canvas.width,canvas.height);
      URL.revokeObjectURL(frameUrl);
    };
    img.onerror = () => URL.revokeObjectURL(frameUrl);
    img.src = frameUrl;
  } else {
    try{ handleWSMessage(JSON.parse(e.data)); }
    catch(err){ console.error('Parse error',err); }
  }
};

function send(data){
  if(ws.readyState===WebSocket.OPEN) ws.send(JSON.stringify(data));
  // Retry only while the socket is still connecting. Once it is closing
  // or closed the message is dropped; onclose reloads the page.
  else if(ws.readyState===WebSocket.CONNECTING) setTimeout(()=>send(data),100);
}

function handleWSMessage(msg){
  if(msg.type==='console') appendLog('panel-console',msg.data,'console');
  else if(msg.type==='network') appendLog('panel-network',msg.data,'network');
  else if(msg.type==='js-result'){
    if(pendingScriptLoad && typeof msg.data==='string'){
      // Reply to a Load click: put the script text into the editor.
      pendingScriptLoad=false;
      document.getElementById('js-editor').value=msg.data;
      setStatus('Script loaded into editor');
      return;
    }
    const out=document.getElementById('js-output');
    out.textContent = typeof msg.data==='object'?JSON.stringify(msg.data,null,2):String(msg.data);
    setStatus('✅ JS done');
  }
  else if(msg.type==='html') downloadBlob(msg.data,'page.html','text/html');
  else if(msg.type==='file-list') renderFileList(msg.files);
  else if(msg.type==='tabs') renderTabs(msg.tabs,msg.active);
  else if(msg.type==='status') setStatus(msg.text);
  else if(msg.type==='error'){
    // "Not found" is the server's reply to a failed load-file request.
    if(msg.message==='Not found') pendingScriptLoad=false;
    setStatus('β '+msg.message);
  }
}

function appendLog(panelId,entry,cls){
  const panel=document.getElementById(panelId);
  const div=document.createElement('div');
  div.className=`log-entry ${cls}`;
  div.textContent=`[${entry.type?.toUpperCase()||'LOG'}] ${entry.text||entry.url||JSON.stringify(entry)}`;
  panel.insertBefore(div,panel.firstChild);
  if(panel.children.length>100) panel.removeChild(panel.lastChild);
}

function renderFileList(files){
  // Built with DOM nodes and textContent so file names are never parsed
  // as markup or script.
  const list=document.getElementById('file-list');
  list.textContent='';
  if(!files||!files.length){
    const empty=document.createElement('div');
    empty.style.cssText='color:#666;font-size:12px';
    empty.textContent='No scripts';
    list.appendChild(empty);
    return;
  }
  for(const f of files){
    const row=document.createElement('div');
    row.className='file-item';
    const label=document.createElement('span');
    label.textContent=`π ${f}`;
    const btn=document.createElement('button');
    btn.textContent='Load';
    btn.onclick=()=>loadScript(f);
    row.append(label,btn);
    list.appendChild(row);
  }
}

function renderTabs(tabs,active){
  // Page titles come from arbitrary websites, so they are inserted as
  // text, never as markup.
  const bar=document.getElementById('tabs-bar');
  bar.textContent='';
  (tabs||[]).forEach((t,i)=>{
    const tab=document.createElement('div');
    tab.className = i===active ? 'tab active' : 'tab';
    tab.onclick=()=>switchTab(i);
    const title=t.title||'';
    const label=document.createElement('span');
    label.textContent=(title.slice(0,15)||'Tab')+(title.length>15?'...':'');
    const close=document.createElement('span');
    close.className='tab-close';
    close.textContent='Γ';
    close.onclick=(ev)=>{ ev.stopPropagation(); closeTab(i); };
    tab.append(label,close);
    bar.appendChild(tab);
  });
}

function showPanel(name){
  // Tabs are matched through their data-panel attribute, so this also
  // works when it is not called from a click handler.
  document.querySelectorAll('.sidebar-tab').forEach(t=>t.classList.toggle('active',t.dataset.panel===name));
  document.querySelectorAll('.sidebar-content').forEach(c=>c.classList.toggle('active',c.id===`panel-${name}`));
  if(name==='scripts') send({t:'list-files'});
}

function setStatus(t){ document.getElementById('status').textContent=t; console.log('[Status]',t); }

function goto(){
  const q=document.getElementById('url').value.trim();
  if(!q){ setStatus('β οΈ Enter URL'); return; }
  setStatus(`π Loading: ${q.slice(0,40)}...`);
  send({t:'goto',q});
}
function nav(a){ setStatus(`π ${a}...`); send({t:a}); }
function toggleSidebar(){
  const sb=document.getElementById('sidebar');
  sb.classList.toggle('hidden');
  resize();
}
canvas.onclick=(e)=>{
  // The screenshot is stretched to fill the canvas, so convert from
  // on-screen canvas pixels to screenshot pixels before sending.
  const r=canvas.getBoundingClientRect();
  const sx = frameW && r.width ? frameW/r.width : 1;
  const sy = frameH && r.height ? frameH/r.height : 1;
  send({t:'click',x:(e.clientX-r.left)*sx,y:(e.clientY-r.top)*sy});
};
document.onkeydown=(e)=>{
  if(['INPUT','TEXTAREA'].includes(e.target.tagName)) return;
  send({t:'key',k:e.key});
};
function executeJS(){
  const code=document.getElementById('js-editor').value;
  send({t:'execute-js',code});
  setStatus('β³ Running JS...');
}
function loadSample(type){
  const s={scraper:`// Extract links\nreturn Array.from(document.querySelectorAll('a')).map(a=>({text:a.innerText?.trim().slice(0,30),href:a.href})).filter(l=>l.href).slice(0,20);`};
  document.getElementById('js-editor').value=s[type]||'';
}
function uploadJS(input){
  const f=input.files[0]; if(!f) return;
  const r=new FileReader();
  r.onload=(e)=>{ document.getElementById('js-editor').value=e.target.result; setStatus('π Loaded'); };
  r.readAsText(f);
}
function saveJS(){
  const name=prompt('Script name:','myscript'); if(!name) return;
  send({t:'save-file',name:name+'.js',content:document.getElementById('js-editor').value});
  setStatus('πΎ Saved');
}
function loadScript(name){
  pendingScriptLoad=true;
  send({t:'load-file',name});
  setStatus(`π₯ Loading ${name}...`);
}
function downloadPage(){ send({t:'get-html'}); setStatus('β³ Preparing HTML...'); }
function downloadBlob(b64,filename,mime){
  const bytes=Uint8Array.from(atob(b64),c=>c.charCodeAt(0));
  const a=document.createElement('a');
  const href=URL.createObjectURL(new Blob([bytes],{type:mime}));
  a.href=href;
  a.download=filename; a.click();
  // Release the object URL once the download has been handed off.
  setTimeout(()=>URL.revokeObjectURL(href),1000);
}
function switchTab(i){ send({t:'switch-tab',index:i}); }
function closeTab(i){ send({t:'close-tab',index:i}); }

window.onload=()=>document.getElementById('url').focus();
</script>
</body>
</html>
"""
|
|
| |
| |
| |
def _safe_script_name(raw) -> str:
    """Reduce a client-supplied script name to a plain ``*.js`` file name.

    Keeps only alphanumerics and ``._-``, strips leading dots (no hidden
    files, no ``..``) and appends ``.js`` when missing. Returns ``""`` when
    nothing usable is left.
    """
    if not isinstance(raw, str):
        return ""
    cleaned = "".join(c for c in raw if c.isalnum() or c in "._-").lstrip(".")
    if not cleaned:
        return ""
    if not cleaned.endswith(".js"):
        cleaned += ".js"
    return cleaned


@app.websocket("/ws")
async def websocket_endpoint(ws: WebSocket):
    """Serve one browser-control client over a WebSocket.

    A background task streams JPEG screenshots of the active tab as binary
    frames. The main loop reads JSON commands (``{"t": <command>, ...}``) and
    answers with JSON messages. Commands handled: ``init``, ``goto``,
    ``back``, ``forward``, ``reload``, ``click``, ``key``, ``new-tab``,
    ``switch-tab``, ``close-tab``, ``execute-js``, ``save-file``,
    ``load-file``, ``list-files``, ``get-html`` and ``clear-logs``.

    After every command the most recent buffered console (up to 3) and
    network (up to 5) entries of the page in use are forwarded and the
    buffers are reset.

    Args:
        ws: The WebSocket connection for this client.
    """
    global active_tab
    await ws.accept()
    print(f"✅ WS accepted from {ws.client}")

    await send_tabs(ws)

    async def stream():
        """Push screenshots of the active tab until cancelled."""
        # Without this declaration the assignment below would make
        # `active_tab` local to stream() and every read would raise
        # UnboundLocalError, so no frame would ever be sent.
        global active_tab
        while True:
            try:
                if not pages or active_tab >= len(pages):
                    await asyncio.sleep(0.1)
                    continue
                page = pages[active_tab]
                if page.is_closed():
                    active_tab = max(0, active_tab - 1)
                    await send_tabs(ws)
                    # Avoid a busy loop when only closed pages remain.
                    await asyncio.sleep(0.1)
                    continue
                img = await asyncio.wait_for(
                    page.screenshot(type="jpeg", quality=65, scale="css"),
                    timeout=5.0,
                )
                await ws.send_bytes(img)
                await asyncio.sleep(0.033)
            except asyncio.CancelledError:
                raise
            except asyncio.TimeoutError:
                await asyncio.sleep(0.1)
            except WebSocketDisconnect:
                # Client is gone; nothing left to stream to.
                return
            except Exception as e:
                print(f"β Stream error: {e}")
                await asyncio.sleep(0.2)

    stream_task = asyncio.create_task(stream())

    try:
        while True:
            raw = await ws.receive_text()
            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                # A malformed frame should not tear down the session.
                await ws.send_json({"type": "error", "message": "Invalid JSON"})
                continue
            if not isinstance(msg, dict):
                await ws.send_json({"type": "error", "message": "Invalid message"})
                continue
            t = msg.get("t")

            if not pages:
                await ws.send_json({"type": "error", "message": "No pages"})
                continue
            page = pages[active_tab] if active_tab < len(pages) else pages[0]

            print(f"π¨ {t} | tab={active_tab}")

            if t == "init":
                await send_tabs(ws)

            elif t == "goto":
                q = str(msg.get("q", "")).strip()
                if not q:
                    await ws.send_json({"type": "status", "text": "β οΈ Empty URL"})
                    continue
                url = resolve_url(q)
                print(f"π Going to: {url}")
                try:
                    await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                    title = await page.title()
                    await ws.send_json({"type": "status", "text": f"✅ {title[:40]}"})
                    await send_tabs(ws)
                except Exception as e:
                    await ws.send_json({"type": "error", "message": f"Nav failed: {str(e)[:80]}"})

            elif t == "back":
                try:
                    await page.go_back(timeout=15000)
                except Exception as e:
                    await ws.send_json({"type": "error", "message": f"Back: {e}"})
            elif t == "forward":
                try:
                    await page.go_forward(timeout=15000)
                except Exception as e:
                    await ws.send_json({"type": "error", "message": f"Forward: {e}"})
            elif t == "reload":
                try:
                    await page.reload(wait_until="domcontentloaded", timeout=30000)
                except Exception as e:
                    await ws.send_json({"type": "error", "message": f"Reload: {e}"})

            elif t == "click":
                # Input events are best effort: failures are logged, not
                # reported to the client.
                try:
                    await page.mouse.click(msg["x"], msg["y"], delay=10)
                except Exception as e:
                    print(f"Click ignored: {e}")
            elif t == "key":
                try:
                    await page.keyboard.press(msg["k"])
                except Exception as e:
                    print(f"Key ignored: {e}")

            elif t == "new-tab":
                try:
                    np = await context.new_page()
                    await setup_page(np, len(pages))
                    pages.append(np)
                    active_tab = len(pages) - 1
                    await send_tabs(ws)
                except Exception as e:
                    await ws.send_json({"type": "error", "message": f"Tab: {e}"})
            elif t == "switch-tab":
                i = msg.get("index", 0)
                if isinstance(i, int) and 0 <= i < len(pages):
                    active_tab = i
                    try:
                        await pages[active_tab].bring_to_front()
                    except Exception as e:
                        await ws.send_json({"type": "error", "message": f"Tab: {e}"})
                    await send_tabs(ws)
            elif t == "close-tab":
                i = msg.get("index", 0)
                if isinstance(i, int) and len(pages) > 1 and 0 <= i < len(pages):
                    # Remove the page from the list first so the stream task
                    # never picks a page that is in the middle of closing.
                    closing = pages.pop(i)
                    # Keep the same page active when an earlier tab goes
                    # away, and stay in range when the last one does.
                    if i < active_tab or active_tab >= len(pages):
                        active_tab = max(0, active_tab - 1)
                    try:
                        await closing.close()
                    except Exception as e:
                        print(f"Tab close failed: {e}")
                    await send_tabs(ws)

            elif t == "execute-js":
                code = str(msg.get("code", ""))
                # The newline before the closing brace keeps a trailing
                # `//` comment in the user's code from commenting out the
                # wrapper's `catch`.
                wrapped = (
                    "(async()=>{try{" + code
                    + "\n}catch(e){return{__err__:e.message}}})()"
                )
                try:
                    result = await page.evaluate(wrapped)
                    if isinstance(result, dict) and result.get("__err__"):
                        await ws.send_json({"type": "js-result", "data": {"error": result["__err__"]}})
                    else:
                        await ws.send_json({"type": "js-result", "data": result})
                except Exception as e:
                    await ws.send_json({"type": "js-result", "data": {"error": str(e)}})

            elif t == "save-file":
                name = _safe_script_name(msg.get("name", "script.js"))
                if not name:
                    await ws.send_json({"type": "error", "message": "Invalid script name"})
                else:
                    try:
                        (SCRIPTS_DIR / name).write_text(
                            str(msg.get("content", "")), encoding="utf-8"
                        )
                    except OSError as e:
                        await ws.send_json({"type": "error", "message": f"Save: {e}"})
                    else:
                        await ws.send_json({"type": "file-list", "files": list_scripts()})
            elif t == "load-file":
                # Sanitising the name keeps the lookup inside SCRIPTS_DIR.
                name = _safe_script_name(msg.get("name", ""))
                path = SCRIPTS_DIR / name if name else None
                text = None
                if path is not None and path.is_file():
                    try:
                        text = path.read_text(encoding="utf-8")
                    except (OSError, UnicodeDecodeError) as e:
                        print(f"Script read failed: {e}")
                if text is not None:
                    await ws.send_json({"type": "js-result", "data": text})
                else:
                    await ws.send_json({"type": "error", "message": "Not found"})
            elif t == "list-files":
                await ws.send_json({"type": "file-list", "files": list_scripts()})

            elif t == "get-html":
                try:
                    html = await page.content()
                    b64 = base64.b64encode(html.encode("utf-8", errors="ignore")).decode()
                    await ws.send_json({"type": "html", "data": b64})
                except Exception as e:
                    await ws.send_json({"type": "error", "message": f"HTML: {e}"})

            elif t == "clear-logs":
                pid = id(page)
                console_logs[pid] = []
                network_logs[pid] = []

            # Forward the newest buffered log entries for the page this
            # command ran against, then reset the buffers.
            pid = id(page)
            if console_logs.get(pid):
                for log in console_logs[pid][-3:]:
                    await ws.send_json({"type": "console", "data": log})
                console_logs[pid] = []
            if network_logs.get(pid):
                for log in network_logs[pid][-5:]:
                    await ws.send_json({"type": "network", "data": log})
                network_logs[pid] = []

    except WebSocketDisconnect:
        print("π Client disconnected")
    except Exception as e:
        print(f"π₯ WS error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        stream_task.cancel()
        try:
            await stream_task
        except asyncio.CancelledError:
            # Expected: the task was just cancelled above.
            pass
|
|
| |
| |
| |
@app.get("/", response_class=HTMLResponse)
async def home():
    """Serve the single-page control UI."""
    return HTMLResponse(content=HTML)
|
|
@app.get("/health")
async def health():
    """Report liveness plus tab counts.

    Returns a dict with ``status``, the number of tracked tabs, the active
    tab index, and how many tracked pages are not closed.
    """
    open_pages = [p for p in pages if not p.is_closed()]
    report = {"status": "ok"}
    report["tabs"] = len(pages)
    report["active"] = active_tab
    report["alive"] = len(open_pages)
    return report
|
|
@app.get("/scripts/{filename}")
async def get_script(filename: str):
    """Serve a saved script from SCRIPTS_DIR as JavaScript.

    Only regular ``.js`` files that resolve to a location directly inside
    SCRIPTS_DIR are served; anything else gets a 404 JSON response.

    Args:
        filename: File name taken from the URL path.
    """
    path = SCRIPTS_DIR / filename
    # Resolve symlinks and ".." so the check applies to the real location.
    inside_scripts = path.resolve().parent == SCRIPTS_DIR.resolve()
    # is_file() rather than exists(): a directory named "x.js" must not be
    # read as a file.
    if inside_scripts and path.suffix == ".js" and path.is_file():
        return PlainTextResponse(path.read_text(), media_type="application/javascript")
    return JSONResponse({"error": "Not found"}, status_code=404)
|
|
@app.post("/upload-script")
async def upload_script(file: UploadFile = None, name: str = Form(None)):
    """Store an uploaded file in SCRIPTS_DIR as ``<name>.js``.

    The target name comes from the ``name`` form field, falling back to the
    uploaded file's own name. It is reduced to alphanumerics and ``._-``,
    and exactly one ``.js`` suffix is applied.

    Args:
        file: The uploaded file.
        name: Optional name to save under instead of the upload's file name.

    Returns:
        ``{"status": "saved", "file": <name>}`` on success, or a 400 JSON
        response when no file was sent or no usable name remains.
    """
    if not file:
        return JSONResponse({"error": "No file"}, status_code=400)
    # file.filename can be missing, so fall back to a fixed default.
    requested = name or file.filename or "script"
    # Strip only a trailing ".js"; replacing every ".js" occurrence would
    # mangle names such as "a.json".
    stem = requested[:-3] if requested.endswith(".js") else requested
    safe_stem = "".join(c for c in stem if c.isalnum() or c in "._-").strip(".")
    if not safe_stem:
        return JSONResponse({"error": "Invalid name"}, status_code=400)
    safe = safe_stem + ".js"
    content = await file.read()
    (SCRIPTS_DIR / safe).write_bytes(content)
    return {"status": "saved", "file": safe}
|
|
| |
| |
| |
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=PORT, log_level="info")