# webscrapper / main.py
# Last commit: "Update main.py" (e84fb4f, verified), by Arafath10.
from fastapi import FastAPI, HTTPException
import asyncio
from playwright.async_api import async_playwright
from fastapi.responses import HTMLResponse
from fastapi.responses import StreamingResponse
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from io import StringIO
import os
# ASGI application object; the route decorators below register on it.
app = FastAPI()
# Enable CORS for every origin, HTTP method and request header, with
# credentials (cookies / Authorization headers) allowed.
# NOTE(review): FastAPI's CORS documentation says wildcard origins, methods
# and headers should not be combined with allow_credentials=True. Confirm
# whether credentials are actually needed, or list the allowed origins
# explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Page scraped when ``scrape_links`` is called without an explicit URL.
_DEFAULT_URL = 'https://www.fool.com/earnings/call-transcripts/2024/01/24/tesla-tsla-q4-2023-earnings-call-transcript/'

# Only these resource types are fetched; every other request is aborted.
_ALLOWED_RESOURCE_TYPES = ("document", "script")

# Pause after navigation. Playwright's wait_for_timeout takes milliseconds,
# so this is a very short pause; raise it if dynamic content is missing.
_SETTLE_DELAY_MS = 10


async def _filter_resources(route):
    """Let documents and scripts through and abort every other request."""
    if route.request.resource_type in _ALLOWED_RESOURCE_TYPES:
        await route.continue_()
    else:
        await route.abort()


async def scrape_links(url: str = _DEFAULT_URL) -> list:
    """Scrape link targets and text content from a web page.

    Launches headless Chromium, blocks every resource type other than
    documents and scripts to speed up loading, opens ``url`` and collects
    the page content.

    Args:
        url: Address of the page to scrape. Defaults to the page that was
            previously hard-coded, so calls without arguments keep working.

    Returns:
        A list of dicts: first one ``{'href': value}`` per ``<a>`` element
        (``value`` is ``None`` when the attribute is absent), followed by
        one ``{'text': stripped_text}`` per element under ``<body>`` whose
        text content is non-blank. Because every element under ``<body>``
        is visited, the text of a nested element also appears in the
        entries of its ancestors.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.route("**/*", _filter_resources)
            await page.goto(url, wait_until='domcontentloaded')
            await page.wait_for_timeout(_SETTLE_DELAY_MS)

            # Link targets come first, in document order.
            result = [
                {'href': await link.get_attribute('href')}
                for link in await page.query_selector_all('a')
            ]

            # Then the text of every element under <body>.
            for element in await page.query_selector_all('body *'):
                text_content = await element.text_content()
                stripped = text_content.strip() if text_content else ''
                if stripped:
                    result.append({'text': stripped})
            return result
        finally:
            # Close the browser even when navigation or extraction fails.
            await browser.close()


@app.post("/get_webscrapet_data")
async def get_webscrapet_data(url: str):
    """Scrape ``url`` and return its link targets and text content.

    Args:
        url: Query parameter holding the http(s) address to scrape.

    Returns:
        The list produced by ``scrape_links`` for ``url``.

    Raises:
        HTTPException: 400 when ``url`` is not an http(s) address, 500
            (with the error message as detail) when scraping fails.
    """
    url = url.strip()
    # Validate outside the try block so the 400 is not rewritten as a 500.
    # NOTE(review): the server fetches whatever host the caller names, which
    # allows requests to internal addresses. Restrict the allowed hosts if
    # this endpoint is reachable by untrusted clients.
    if not url.lower().startswith(("http://", "https://")):
        raise HTTPException(
            status_code=400,
            detail="url must start with http:// or https://",
        )
    try:
        return await scrape_links(url)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e