|
|
import asyncio
import os

import aiohttp
import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- LLM configuration (all values overridable via environment variables) ---

# Chat-completions endpoint of the LLM backend.
LLM_API_URL = os.getenv("LLM_API_URL", "https://api.inference.net/v1/chat/completions")

# SECURITY FIX: the original committed a live-looking API key as the fallback
# value. Secrets must come from the environment only; the leaked key should be
# rotated. With no env var set the app now sends an empty bearer token and the
# LLM call fails loudly instead of silently using a checked-in credential.
LLM_API_KEY = os.getenv("LLM_API_KEY", "")

# Model identifier; env-overridable so deployments can switch models
# without a code change (default preserves the original behavior).
LLM_MODEL = os.getenv("LLM_MODEL", "meta-llama/llama-3.1-8b-instruct/fp-8")
|
|
|
|
|
# FastAPI application instance; this metadata feeds the generated OpenAPI docs.
app = FastAPI(
    version="1.0.0",
    title="Web Scraper and AI Processor",
    description="An API to scrape web content and process it with a large language model.",
)
|
|
|
|
|
async def scrape_url(session, url: str):
    """Asynchronously fetch *url* and return its visible text content.

    Args:
        session: An open ``aiohttp.ClientSession`` used for the request.
        url: The page to download.

    Returns:
        The page text with ``<script>``/``<style>`` content removed and
        whitespace collapsed to single spaces (may be empty).

    Raises:
        HTTPException: 400 when the URL cannot be fetched — connection
            failure, non-2xx status, or the 10-second timeout.
    """
    try:
        async with session.get(url, timeout=10) as response:
            response.raise_for_status()
            html_content = await response.text()
    # BUG FIX: the original caught requests.exceptions.RequestException, but
    # the request is made with aiohttp, whose errors derive from
    # aiohttp.ClientError (timeouts raise asyncio.TimeoutError). Fetch
    # failures therefore escaped as unhandled 500s instead of a clean 400.
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        raise HTTPException(status_code=400, detail=f"Error fetching the URL: {e}")

    # Parsing happens outside the try so only network errors map to 400.
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove non-visible content before extracting text.
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()

    text = soup.get_text()
    # Collapse whitespace: strip each line, split into fragments, and rejoin
    # the non-empty fragments with single spaces.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    return " ".join(chunk for chunk in chunks if chunk)
|
|
|
|
|
async def process_with_llm(session, content: str, query: str):
    """Send scraped *content* plus a user *query* to the LLM API.

    Args:
        session: An open ``aiohttp.ClientSession`` used for the request.
        content: The scraped page text the model should analyze.
        query: The user's question about that content.

    Returns:
        The decoded JSON body returned by the chat-completions endpoint.

    Raises:
        HTTPException: 500 when the LLM API cannot be reached, returns a
            non-2xx status, or exceeds the 30-second timeout.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LLM_API_KEY}",
    }
    # Standard OpenAI-style chat-completions payload.
    data = {
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant that analyzes web content."
            },
            {
                "role": "user",
                "content": f"Based on the following content, please answer this question: '{query}'\n\nContent:\n{content}"
            }
        ],
        "model": LLM_MODEL,
        "stream": False
    }
    try:
        async with session.post(LLM_API_URL, headers=headers, json=data, timeout=30) as response:
            response.raise_for_status()
            return await response.json()
    # BUG FIX: a timeout raises asyncio.TimeoutError, which is NOT a subclass
    # of aiohttp.ClientError, so the 30s timeout previously escaped as an
    # unhandled 500 instead of this explicit HTTPException.
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        raise HTTPException(status_code=500, detail=f"Error communicating with the LLM API: {e}")
|
|
|
|
|
@app.post("/scrape-and-process/")
async def scrape_and_process(url: str, query: str):
    """
    Scrape a URL, forward the extracted text to the large language model
    together with a query, and return the model's response unchanged.
    """
    # One session is shared by both the scrape and the LLM call.
    async with aiohttp.ClientSession() as session:
        page_text = await scrape_url(session, url)

        # An empty page leaves the model with nothing to analyze.
        if not page_text:
            raise HTTPException(status_code=404, detail="Could not scrape any content from the URL.")

        return await process_with_llm(session, page_text, query)
|
|
|
|
|
@app.get("/")
def read_root():
    """Landing endpoint returning a static welcome payload."""
    welcome = {"message": "Welcome to the Web Scraper and AI Processor API."}
    return welcome
|
|
|
|
|
if __name__ == "__main__":
    # Local development entry point; the deferred import keeps uvicorn
    # optional when the app is served by an external ASGI runner.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)