import asyncio
import random

import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from playwright.async_api import async_playwright

from scraper import Scraper  # local module (not shown in this file)


# Debug aid: print the installed packages at startup so deployment logs show
# the exact dependency set.
try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10.0
    from pip.operations import freeze

for pkg in freeze.freeze():
    print(pkg)

app = FastAPI()

# Allow cross-origin requests from any origin so the API can be called
# directly from browser frontends.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/get_scraped_data")
async def get_data(url: str):
        try:
            data = await Scraper.scrape(url)
            return data
        except:
            return {"title": "error", "URL": url, "Content": "none"}



@app.get("/fast_scrape")
async def fast_scrape(url: str):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Step 1: Send a request to the webpage
    response = requests.get(url, headers=headers)
    
    # Step 2: Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Step 3: Extract the body tag and get all text within it
    body = soup.find('body')
    body_text = body.get_text(separator=' ', strip=True) if body else ''
    
    # Step 4: Output the body text
    return body_text
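
# Because `requests` blocks, the route above relies on FastAPI's threadpool.
# An async alternative would use httpx (an assumed extra dependency, not
# imported above); a minimal sketch, with a hypothetical route name:
#
#     import httpx
#
#     @app.get("/fast_scrape_async")
#     async def fast_scrape_async(url: str):
#         async with httpx.AsyncClient() as client:
#             response = await client.get(url, timeout=30)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         body = soup.find('body')
#         return body.get_text(separator=' ', strip=True) if body else ''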

# Full-browser scrape: a Playwright route with basic anti-bot-detection measures
@app.get("/scrape")
async def scrape_website(url: str):
    async with async_playwright() as p:
        # Launch browser in headless mode with custom args to bypass detection
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",  # Disable automation features
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-web-security",
                "--disable-setuid-sandbox",
                "--disable-features=IsolateOrigins,site-per-process"
            ]
        )

        # Create a new browser context
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )

        # Set additional headers to mimic real browsing
        await context.set_extra_http_headers({
            "Accept-Language": "en-US,en;q=0.9",
            "Upgrade-Insecure-Requests": "1",
            "Referer": "https://www.nasdaq.com"
        })

        # Create a new page
        page = await context.new_page()

        # Hide WebDriver and other automation-related properties
        await page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            window.navigator.chrome = { runtime: {} };
            Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
            Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
        """)

        # Block heavy resource types (images, media, styles, fonts) to speed up
        # loads. Note that blocking "xhr" can also suppress content that the
        # page loads dynamically.
        async def block_heavy_resources(route):
            if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", block_heavy_resources)

        try:
            # Navigate to the page after a short random delay
            await asyncio.sleep(random.uniform(1, 5))
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)

            # Randomized mouse movement and scrolling to mimic human interaction
            await page.mouse.move(random.uniform(0, 100), random.uniform(0, 100))
            await page.mouse.wheel(0, random.uniform(200, 400))
            await asyncio.sleep(random.uniform(1, 5))  # Another random delay

            # Get the title of the page
            title = await page.title()

            # Get all links on the page
            links = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a')).map(a => a.href);
            }""")

            # Get page content (visible text from paragraphs, headers, lists, etc.)
            content = await page.evaluate("""() => {
                let elements = Array.from(document.querySelectorAll('body *'));
                return elements
                    .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
                    .map(element => element.innerText.trim())
                    .join('\\n');
            }""")

            return {
                "title": title,
                "links": links,
                "content": content
            }

        except Exception as e:
            return {"error": str(e)}

        finally:
            # Close the browser whether or not the scrape succeeded
            await browser.close()