|
|
|
|
|
import os |
|
import requests |
|
import time |
|
from typing import List |
|
from boilerpy3 import extractors |
|
|
|
# Single shared extractor instance, reused across all requests.
# (The original module created this twice; the duplicate assignment
# has been removed.)
article_extractor = extractors.ArticleExtractor()

# Browser-like User-Agent so news sites serve the full page instead of
# blocking the default python-requests client string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

# Google Custom Search credentials, read from the environment.
# Either may be None if unset — searches will then fail at request time.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
|
|
|
|
|
def extract_full_text(url: str) -> str:
    """Fetch *url* and return its main article text, or "" on any failure.

    Uses boilerpy3's ArticleExtractor to strip boilerplate (navigation,
    ads, footers) from the fetched HTML.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        The extracted article body, or an empty string if the fetch or
        extraction fails for any reason (errors are logged, not raised).
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        # Fail fast on HTTP errors (4xx/5xx) — otherwise an error page
        # would be extracted and silently returned as "article text".
        resp.raise_for_status()
        return article_extractor.get_content(resp.text) or ""
    except Exception as e:  # broad by design: scraping is best-effort
        print(f"[SCRAPER ERROR] {url}: {e}")
        return ""
|
|
|
|
|
def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
    """
    Run a Google Custom Search and return a list of dicts with:
    title, link, snippet, content (full article text)

    Args:
        keywords: Search terms; joined with spaces into one query.
        num_results: Maximum results to request (API caps this at 10).

    Returns:
        A list of result dicts, or [] if the search fails (errors are
        logged, not raised).
    """
    # Let requests build and percent-encode the query string. The old
    # f-string interpolation broke on spaces/special characters and
    # sent the literal text "None" when the env vars were unset.
    params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_CX_ID,
        "q": " ".join(keywords),
        # The Custom Search JSON API rejects num > 10.
        "num": min(num_results, 10),
    }

    try:
        res = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params=params,
            timeout=10,
        )
        res.raise_for_status()
        data = res.json()

        results = []
        for item in data.get("items", []):
            link = item.get("link")
            # Fetch the full article body for each hit (best-effort;
            # extract_full_text returns "" on failure).
            results.append({
                "title": item.get("title"),
                "link": link,
                "snippet": item.get("snippet"),
                "content": extract_full_text(link),
            })

        return results

    except Exception as e:  # boundary: log and return a safe default
        print(f"[ERROR] Google search failed: {e}")
        return []
|
|