# Scrap / main.py
# (scraped from a Hugging Face file page; original page chrome:
#  rkihacker's picture · Create main.py · 4b17916 verified · raw ·
#  history blame · 3.42 kB)
import asyncio
import os

import aiohttp
import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
# --- Configuration ---
# Sensitive values must come from environment variables; never commit keys.
LLM_API_URL = os.getenv("LLM_API_URL", "https://api.inference.net/v1/chat/completions")
# SECURITY FIX: the previous fallback embedded a real-looking API key in
# source control. That credential must be considered compromised and rotated;
# the default is now empty so a missing key fails loudly at the API call
# instead of silently using a leaked secret.
LLM_API_KEY = os.getenv("LLM_API_KEY", "")
# Model identifier sent with every chat-completion request.
LLM_MODEL = "meta-llama/llama-3.1-8b-instruct/fp-8"
# Application object; this metadata feeds the auto-generated OpenAPI docs.
app = FastAPI(
    title="Web Scraper and AI Processor",
    description="An API to scrape web content and process it with a large language model.",
    version="1.0.0",
)
async def scrape_url(session, url: str):
    """Asynchronously fetch *url* and return its visible text content.

    Args:
        session: An open ``aiohttp.ClientSession`` used for the request.
        url: The page to fetch.

    Returns:
        The page text with script/style elements removed and whitespace
        collapsed to single spaces.

    Raises:
        HTTPException: 400 when the request fails or times out.
    """
    try:
        async with session.get(url, timeout=10) as response:
            response.raise_for_status()
            html_content = await response.text()
            soup = BeautifulSoup(html_content, "html.parser")
            # Script/style bodies are not human-visible content.
            for script_or_style in soup(["script", "style"]):
                script_or_style.decompose()
            text = soup.get_text()
            # Collapse whitespace: strip each line, split on spaces,
            # and re-join only the non-empty fragments.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            return " ".join(chunk for chunk in chunks if chunk)
    # BUG FIX: the request is made with aiohttp, so aiohttp's exceptions
    # (plus asyncio.TimeoutError for the timeout) must be caught here.
    # The original caught requests.exceptions.RequestException, which
    # aiohttp never raises — the handler was dead code and any network
    # failure escaped as an unhandled 500.
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        raise HTTPException(status_code=400, detail=f"Error fetching the URL: {e}")
async def process_with_llm(session, content: str, query: str):
    """Ask the configured LLM a question about scraped page content.

    Posts a single (non-streamed) chat-completion request containing
    *content* and *query* to ``LLM_API_URL`` and returns the decoded
    JSON response body.

    Raises:
        HTTPException: 500 when the LLM API cannot be reached.
    """
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LLM_API_KEY}",
    }
    user_prompt = (
        f"Based on the following content, please answer this question: '{query}'"
        f"\n\nContent:\n{content}"
    )
    payload = {
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant that analyzes web content.",
            },
            {"role": "user", "content": user_prompt},
        ],
        "model": LLM_MODEL,
        "stream": False,  # single response rather than a token stream
    }
    try:
        async with session.post(
            LLM_API_URL, headers=request_headers, json=payload, timeout=30
        ) as response:
            response.raise_for_status()
            return await response.json()
    except aiohttp.ClientError as e:
        raise HTTPException(status_code=500, detail=f"Error communicating with the LLM API: {e}")
@app.post("/scrape-and-process/")
async def scrape_and_process(url: str, query: str):
"""
Scrapes a URL, sends the content to a large language model with a query,
and returns the model's response.
"""
async with aiohttp.ClientSession() as session:
scraped_content = await scrape_url(session, url)
if not scraped_content:
raise HTTPException(status_code=404, detail="Could not scrape any content from the URL.")
llm_response = await process_with_llm(session, scraped_content, query)
return llm_response
@app.get("/")
def read_root():
return {"message": "Welcome to the Web Scraper and AI Processor API."}
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)