from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from scraper import Scraper
# import nest_asyncio
import asyncio
from playwright.async_api import async_playwright
import random
import requests
from bs4 import BeautifulSoup

# Allow nested use of asyncio.run() in Jupyter
# nest_asyncio.apply()

# Print the installed packages at startup (handy for debugging the environment)
try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10.0
    from pip.operations import freeze

for pkg in freeze.freeze():
    print(pkg)
app = FastAPI()

# Allow cross-origin requests from any frontend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
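# To restrict access later, replace the wildcard with explicit origins,
# e.g. allow_origins=["https://my-frontend.example"] (a hypothetical origin).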
# FastAPI route to scrape a URL with the project's Scraper helper
@app.get("/get_scraped_data")
async def get_data(url: str):
    try:
        data = await Scraper.scrape(url)
        return data
    except Exception:
        # Keep the response shape stable even when scraping fails
        return {"title": "error", "URL": url, "Content": "none"}
@app.get("/fast_scrape")
async def fast_scrape(url: str):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Step 1: Send a request to the webpage
response = requests.get(url, headers=headers)
# Step 2: Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Step 3: Extract the body tag and get all text within it
body = soup.find('body')
body_text = body.get_text(separator=' ', strip=True) if body else ''
# Step 4: Output the body text
return body_text
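# A non-blocking variant is possible with an async HTTP client. A minimal
# sketch, assuming httpx is installed (it is not among this file's imports,
# and the endpoint name is hypothetical):
#
#   import httpx
#
#   @app.get("/fast_scrape_async")
#   async def fast_scrape_async(url: str):
#       headers = {'User-Agent': 'Mozilla/5.0'}
#       async with httpx.AsyncClient(headers=headers, timeout=30) as client:
#           response = await client.get(url)
#       soup = BeautifulSoup(response.content, 'html.parser')
#       body = soup.find('body')
#       return body.get_text(separator=' ', strip=True) if body else ''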
# FastAPI route to scrape a website with a headless browser
@app.get("/scrape")
async def scrape_website(url: str):
    async with async_playwright() as p:
        # Launch Chromium in headless mode with custom args to reduce
        # automation detection
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",  # hide the automation flag
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-web-security",
                "--disable-setuid-sandbox",
                "--disable-features=IsolateOrigins,site-per-process"
            ]
        )
        # Create a new browser context with a desktop user agent
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
        # Set additional headers to mimic real browsing
        await context.set_extra_http_headers({
            "Accept-Language": "en-US,en;q=0.9",
            "Upgrade-Insecure-Requests": "1",
            "Referer": "https://www.nasdaq.com"
        })
        # Create a new page
        page = await context.new_page()
        # Hide WebDriver and other automation-related properties
        await page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            window.navigator.chrome = { runtime: {} };
            Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
            Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
        """)
        # Block heavy resources to speed up loading. Note that aborting "xhr"
        # also blocks API calls, so JS-rendered content may come back empty.
        await page.route(
            "**/*",
            lambda route: route.abort()
            if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"]
            else route.continue_()
        )
        try:
            # Navigate to the page after a random delay
            await asyncio.sleep(random.uniform(1, 5))
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
            # Randomized mouse movement and scrolling to mimic human interaction
            await page.mouse.move(random.uniform(0, 100), random.uniform(0, 100))
            await page.mouse.wheel(0, random.uniform(200, 400))
            await asyncio.sleep(random.uniform(1, 5))  # another random delay
            # Get the title of the page
            title = await page.title()
            # Get all links on the page
            links = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a')).map(a => a.href);
            }""")
            # Get page content (text from paragraphs, headers, and similar elements)
            content = await page.evaluate("""() => {
                let elements = Array.from(document.querySelectorAll('body *'));
                return elements
                    .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
                    .map(element => element.innerText.trim())
                    .join('\\n');
            }""")
            return {
                "title": title,
                "links": links,
                "content": content
            }
        except Exception as e:
            return {"error": str(e)}
        finally:
            # Close the browser whether scraping succeeded or failed
            await browser.close()