Spaces:

rkihacker
/

Scrap

Paused

App Files Files Community

Scrap / main.py

rkihacker

Update main.py

6ac9507 verified 3 months ago

raw

history blame

10.1 kB

	import os
	import asyncio
	import json
	import logging
	import random
	import re
	from typing import AsyncGenerator, Optional, Tuple, List

	from fastapi import FastAPI
	from fastapi.responses import StreamingResponse
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel
	from dotenv import load_dotenv
	import aiohttp
	from bs4 import BeautifulSoup

	# --- Configuration ---
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	# Application logger scoped to this module. aiohttp's "aiohttp.access"
	# logger is meant for aiohttp's own server access log, so it is not borrowed
	# for application messages.
	logger = logging.getLogger(__name__)

	# Load variables from a .env file into the process environment before reading them.
	load_dotenv()
	LLM_API_KEY = os.getenv("LLM_API_KEY")

	# Fail fast at import time: every LLM request sends this key in its headers.
	if not LLM_API_KEY:
	    raise RuntimeError("LLM_API_KEY must be set in a .env file.")
	logging.info("LLM API Key loaded successfully.")

	# --- Constants & Headers ---
	# Chat-completions endpoint and model name sent with each LLM request.
	LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
	LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
	# Upper bound on how many de-duplicated sources are scraped per request.
	MAX_SOURCES_TO_PROCESS = 15

	# Real Browser User Agents for SCRAPING (one is picked at random per fetch).
	USER_AGENTS = [
	    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
	    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
	    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0",
	]

	# Headers attached to every request made to the LLM API.
	LLM_HEADERS = {
	    "Authorization": f"Bearer {LLM_API_KEY}",
	    "Content-Type": "application/json",
	    "Accept": "application/json",
	}

	class DeepResearchRequest(BaseModel):
	    """Request body accepted by the deep-research endpoint."""

	    # Topic text; it is interpolated into the planning and report prompts.
	    query: str

	# FastAPI application object that the route decorators in this module attach to.
	app = FastAPI(
	    title="AI Deep Research API",
	    description="Provides robust, long-form, streaming deep research completions using a simulated search.",
	    version="10.0.0",  # Final: Using simulated search to bypass external blocking.
	)

	# CORS: "*" is the wildcard that admits any origin / method / header. An empty
	# string matches no origin and no HTTP method, so with [""] every browser
	# cross-origin call (including the preflight) would be rejected.
	# NOTE(review): wildcards together with allow_credentials=True are very
	# permissive -- consider an explicit origin list before exposing this publicly.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_credentials=True,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	def extract_json_from_llm_response(text: str) -> Optional[list]:
	    """
	    Pull the first well-formed JSON array out of free-form LLM output.

	    The model is asked for a bare JSON array but may wrap it in prose or code
	    fences. Each ``[`` in ``text`` is tried in order as the start of an array
	    and the first one that parses is returned. Parsing stops at the array's
	    own closing bracket, so brackets appearing later in the text (for
	    example a trailing "see [1]") do not spoil the result.

	    Args:
	        text: Raw message content returned by the LLM.

	    Returns:
	        The decoded list, or ``None`` when ``text`` is empty or contains no
	        parseable JSON array.
	    """
	    if not text:
	        return None

	    decoder = json.JSONDecoder()
	    start = text.find('[')
	    while start != -1:
	        try:
	            # raw_decode parses one JSON value beginning at `start` and ignores
	            # whatever follows it; a value starting with "[" is always a list.
	            parsed, _ = decoder.raw_decode(text, start)
	        except json.JSONDecodeError:
	            start = text.find('[', start + 1)
	            continue
	        return parsed
	    return None

	async def call_duckduckgo_search(query: str, max_results: int = 10) -> List[dict]:
	    """
	    Stand-in for a live DuckDuckGo query.

	    No network request is made. A fixed, hardcoded list of results about the
	    "Nian" myth (the Chinese New Year beast) is returned whatever ``query``
	    contains, so the rest of the pipeline always has valid URLs to scrape.

	    Args:
	        query: Search text; it is only written to the log.
	        max_results: Slice bound applied to the canned list.

	    Returns:
	        The first ``max_results`` canned entries, each a dict with the keys
	        ``title``, ``link`` and ``snippet``.
	    """
	    logging.info(f"Simulating search for: '{query}'")

	    # Canned rows about the "Nian" myth (the original note says "niansuh"
	    # yields no results). Column order matches `field_names`.
	    field_names = ('title', 'link', 'snippet')
	    canned_rows = (
	        (
	            'Nian - Wikipedia',
	            'https://en.wikipedia.org/wiki/Nian',
	            'The Nian is a beast from Chinese mythology. The Nian is said to have the body of a bull, the head of a lion with a single horn, and sharp teeth.',
	        ),
	        (
	            'The Legend of Nian and the Origins of Chinese New Year',
	            'https://www.chinahighlights.com/travelguide/festivals/story-of-nian.htm',
	            'Learn about the monster Nian and how the traditions of wearing red, setting off firecrackers, and staying up late came to be part of Chinese New Year.',
	        ),
	        (
	            'Nian: The Beast That Invented Chinese New Year - Culture Trip',
	            'https://theculturetrip.com/asia/china/articles/nian-the-beast-that-invented-chinese-new-year',
	            'Once a year, at the beginning of Chinese New Year, a beast named Nian would terrorize a small village in China, eating their crops, livestock, and children.',
	        ),
	        (
	            'Chinese New Year mythology: The story of Nian - British Museum',
	            'https://www.britishmuseum.org/blog/chinese-new-year-mythology-story-nian',
	            'Discover the mythical origins of the Chinese New Year celebration and the fearsome beast, Nian.',
	        ),
	        (
	            'Year of the Nian Monster - Asian Art Museum',
	            'https://education.asianart.org/resources/year-of-the-nian-monster/',
	            'A summary of the story of the Nian monster for educators and children, explaining the connection to modern traditions.',
	        ),
	    )
	    simulated_results = [dict(zip(field_names, row)) for row in canned_rows]

	    logging.info(f"Returning {len(simulated_results)} static sources.")
	    return simulated_results[:max_results]


	async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
	    """
	    Fetch one search result and reduce the page to plain text.

	    Best-effort: any failure after the link has been read (PDF link, non-200
	    status, network/timeout error, empty page) is logged as a warning and the
	    result's ``snippet`` is returned instead of page text.

	    Args:
	        session: aiohttp session used for the GET request.
	        source: Search-result dict; must contain ``link`` and may contain ``snippet``.

	    Returns:
	        ``(text, source)`` where ``text`` is the page's visible text, or the
	        snippet (``''`` if there is none) when scraping failed.

	    Raises:
	        KeyError: If ``source`` has no ``link`` key.
	    """
	    from urllib.parse import urlsplit

	    link = source['link']
	    headers = {'User-Agent': random.choice(USER_AGENTS)}
	    try:
	        logging.info(f"Scraping: {link}")
	        # Test the URL path only, so "paper.pdf?download=1" or "paper.pdf#p2"
	        # is still recognised as a PDF and skipped.
	        if urlsplit(link).path.lower().endswith('.pdf'):
	            raise ValueError("PDF content")

	        # NOTE(review): ssl=False disables TLS certificate verification for
	        # scraped pages; kept as-is, but confirm this is intentional.
	        request_timeout = aiohttp.ClientTimeout(total=10)
	        async with session.get(link, headers=headers, timeout=request_timeout, ssl=False) as response:
	            if response.status != 200:
	                raise ValueError(f"HTTP status {response.status}")
	            html = await response.text()

	        # The connection is released above; parsing needs only the HTML string.
	        soup = BeautifulSoup(html, "html.parser")
	        # Drop non-content elements before collecting the visible text.
	        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
	            tag.decompose()
	        content = " ".join(soup.stripped_strings)
	        if not content.strip():
	            raise ValueError("Parsed content is empty.")
	        return content, source
	    except Exception as e:
	        # Deliberate catch-all: one bad page must not sink the whole research run.
	        logging.warning(f"Scraping failed for {link} ({e}). Falling back to snippet.")
	        return source.get('snippet', ''), source

	def _parse_llm_stream_line(raw_line: bytes) -> Tuple[bool, Optional[str]]:
	    """Decode one line of the streamed LLM reply into ``(done, text)``; ``text`` is ``None`` when the line carries nothing to emit."""
	    # Undecodable bytes are replaced rather than allowed to abort the report.
	    line = raw_line.decode('utf-8', errors='replace').strip()
	    if line.startswith('data:'):
	        line = line[5:].strip()
	    if line == "[DONE]":
	        return True, None
	    if not line:
	        return False, None
	    try:
	        chunk = json.loads(line)
	    except json.JSONDecodeError:
	        return False, None
	    # Anything that is not the expected {"choices": [{"delta": {...}}]} shape
	    # (bare values, "delta": null, ...) is skipped instead of raising.
	    if not isinstance(chunk, dict):
	        return False, None
	    choices = chunk.get("choices")
	    if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
	        return False, None
	    delta = choices[0].get("delta")
	    if not isinstance(delta, dict):
	        return False, None
	    return False, (delta.get("content") or None)


	async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
	    """
	    Run the research pipeline for ``query`` and yield progress as SSE frames.

	    Steps: ask the LLM for a list of sub-questions, run the (simulated) search
	    for each one, scrape the de-duplicated sources concurrently, then stream
	    an LLM-written report built from the scraped text.

	    Every yielded string is ``"data: <json>\\n\\n"`` where the JSON object has
	    an ``event`` key (``status``, ``plan``, ``chunk``, ``sources`` or
	    ``error``) and a ``data`` payload. Failures are reported as an ``error``
	    event instead of being raised to the caller.

	    Args:
	        query: Topic of the report.

	    Yields:
	        Server-sent-event frames, in pipeline order.
	    """
	    def format_sse(data: dict) -> str:
	        return f"data: {json.dumps(data)}\n\n"

	    try:
	        async with aiohttp.ClientSession() as session:
	            # --- 1. Research plan -------------------------------------------------
	            yield format_sse({"event": "status", "data": "Generating research plan..."})
	            plan_prompt = {"model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"}]}
	            try:
	                plan_timeout = aiohttp.ClientTimeout(total=25)
	                async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=plan_timeout) as response:
	                    response.raise_for_status()
	                    result = await response.json()
	                # The API may answer with a bare list or with a chat-completion
	                # envelope whose message content holds the list.
	                if isinstance(result, list):
	                    sub_questions = result
	                else:
	                    sub_questions = extract_json_from_llm_response(result['choices'][0]['message']['content'])
	                if not isinstance(sub_questions, list) or not sub_questions:
	                    raise ValueError(f"Invalid plan from LLM: {result}")
	            except Exception as e:
	                yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"})
	                return

	            yield format_sse({"event": "plan", "data": sub_questions})

	            # --- 2. Search --------------------------------------------------------
	            yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
	            search_tasks = [call_duckduckgo_search(sq) for sq in sub_questions]
	            all_search_results = await asyncio.gather(*search_tasks)
	            # Keying by link de-duplicates results shared between sub-questions.
	            unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())

	            if not unique_sources:
	                yield format_sse({"event": "error", "data": "The simulated search returned no sources. Check the hardcoded list."})
	                return

	            sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
	            yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})

	            # --- 3. Scrape --------------------------------------------------------
	            processing_tasks = [research_and_process_source(session, source) for source in sources_to_process]
	            context_parts, all_sources_used = [], []

	            # Results are consumed in completion order, not submission order.
	            for task in asyncio.as_completed(processing_tasks):
	                content, source_info = await task
	                if content and content.strip():
	                    context_parts.append(f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n")
	                    all_sources_used.append(source_info)

	            consolidated_context = "".join(context_parts)
	            if not consolidated_context.strip():
	                yield format_sse({"event": "error", "data": "Failed to scrape content from any of the discovered sources."})
	                return

	            # --- 4. Report --------------------------------------------------------
	            yield format_sse({"event": "status", "data": "Synthesizing final report..."})
	            report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'
	            report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}

	            async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
	                response.raise_for_status()
	                async for line in response.content:
	                    done, content = _parse_llm_stream_line(line)
	                    if done:
	                        break
	                    if content:
	                        yield format_sse({"event": "chunk", "data": content})

	            yield format_sse({"event": "sources", "data": all_sources_used})
	    except Exception as e:
	        # Top-level boundary: log the traceback and tell the client, never raise.
	        logging.error(f"A critical error occurred: {e}", exc_info=True)
	        yield format_sse({"event": "error", "data": f"An unexpected error occurred: {str(e)}"})

	@app.post("/deep-research", response_class=StreamingResponse)
	async def deep_research_endpoint(request: DeepResearchRequest):
	    """Stream the research events for ``request.query`` as a ``text/event-stream`` response."""
	    event_stream = run_deep_research_stream(request.query)
	    return StreamingResponse(event_stream, media_type="text/event-stream")

	if __name__ == "__main__":
	    # Imported here so uvicorn is only required when the file is run as a script.
	    import uvicorn

	    # Serve the app on all interfaces, port 8000.
	    uvicorn.run(app, host="0.0.0.0", port=8000)