# Scrap / main.py
# (scraped from a Hugging Face file page; original page chrome:
#  rkihacker's picture · Create main.py · 4b17916 verified · raw ·
#  history blame · 3.42 kB)
import asyncio
import os

import aiohttp
import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
# --- Configuration ---
# Sensitive values must come from environment variables; never commit keys.
LLM_API_URL = os.getenv("LLM_API_URL", "https://api.inference.net/v1/chat/completions")
# SECURITY FIX: the previous fallback embedded a real-looking API key in
# source control. That credential must be considered compromised and rotated;
# the default is now empty so a missing key fails loudly at the API call
# instead of silently using a leaked secret.
LLM_API_KEY = os.getenv("LLM_API_KEY", "")
# Model identifier sent with every chat-completion request.
LLM_MODEL = "meta-llama/llama-3.1-8b-instruct/fp-8"
# Application object; this metadata feeds the auto-generated OpenAPI docs.
app = FastAPI(
    title="Web Scraper and AI Processor",
    description="An API to scrape web content and process it with a large language model.",
    version="1.0.0",
)
async def scrape_url(session, url: str):
    """Asynchronously fetch *url* and return its visible text content.

    Args:
        session: An open ``aiohttp.ClientSession`` used for the request.
        url: The page to fetch.

    Returns:
        The page text with script/style elements removed and whitespace
        collapsed to single spaces.

    Raises:
        HTTPException: 400 when the request fails or times out.
    """
    try:
        async with session.get(url, timeout=10) as response:
            response.raise_for_status()
            html_content = await response.text()
            soup = BeautifulSoup(html_content, "html.parser")
            # Script/style bodies are not human-visible content.
            for script_or_style in soup(["script", "style"]):
                script_or_style.decompose()
            text = soup.get_text()
            # Collapse whitespace: strip each line, split on spaces,
            # and re-join only the non-empty fragments.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            return " ".join(chunk for chunk in chunks if chunk)
    # BUG FIX: the request is made with aiohttp, so aiohttp's exceptions
    # (plus asyncio.TimeoutError for the timeout) must be caught here.
    # The original caught requests.exceptions.RequestException, which
    # aiohttp never raises — the handler was dead code and any network
    # failure escaped as an unhandled 500.
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        raise HTTPException(status_code=400, detail=f"Error fetching the URL: {e}")
async def process_with_llm(session, content: str, query: str):
    """Ask the configured LLM a question about scraped page content.

    Posts a single (non-streamed) chat-completion request containing
    *content* and *query* to ``LLM_API_URL`` and returns the decoded
    JSON response body.

    Raises:
        HTTPException: 500 when the LLM API cannot be reached.
    """
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LLM_API_KEY}",
    }
    user_prompt = (
        f"Based on the following content, please answer this question: '{query}'"
        f"\n\nContent:\n{content}"
    )
    payload = {
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant that analyzes web content.",
            },
            {"role": "user", "content": user_prompt},
        ],
        "model": LLM_MODEL,
        "stream": False,  # single response rather than a token stream
    }
    try:
        async with session.post(
            LLM_API_URL, headers=request_headers, json=payload, timeout=30
        ) as response:
            response.raise_for_status()
            return await response.json()
    except aiohttp.ClientError as e:
        raise HTTPException(status_code=500, detail=f"Error communicating with the LLM API: {e}")
@app.post("/scrape-and-process/")
async def scrape_and_process(url: str, query: str):
"""
Scrapes a URL, sends the content to a large language model with a query,
and returns the model's response.
"""
async with aiohttp.ClientSession() as session:
scraped_content = await scrape_url(session, url)
if not scraped_content:
raise HTTPException(status_code=404, detail="Could not scrape any content from the URL.")
llm_response = await process_with_llm(session, scraped_content, query)
return llm_response
@app.get("/")
def read_root():
return {"message": "Welcome to the Web Scraper and AI Processor API."}
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)