Spaces:
Running
Running
from contextlib import asynccontextmanager | |
from typing import Optional | |
from duckduckgo_search import DDGS | |
from pydantic import BaseModel | |
from playwright.async_api import Browser, BrowserContext, Page, TimeoutError | |
from urllib.parse import quote_plus | |
import logging | |
import re | |
class APIPatentResults(BaseModel):
    """Response of /search_patents endpoint."""

    # Error message, or None (presumably None when the search succeeded --
    # confirm against the endpoint handler).
    error: Optional[str]
    # Patent hits as plain dicts, or None (presumably None when `error` is
    # set -- confirm against the endpoint handler).
    results: Optional[list[dict]]
class APISearchResults(BaseModel):
    """Response model carrying either an error message or a list of results.

    NOTE(review): presumably the response body of the web-search endpoints;
    confirm against the route definitions.
    """

    # Error message, or None.
    error: Optional[str]
    # Search hits as plain dicts, or None.
    results: Optional[list[dict]]
class BraveSearchBlockedException(Exception):
    """Signals that Brave Search flagged the headless browser as suspicious.

    The exception always carries the same fixed message; positional
    arguments given to the constructor are accepted but ignored.
    """

    def __init__(self, *args):
        message = "Brave Search blocked the request, likely due to flagging browser as suspicious"
        super().__init__(message)
@asynccontextmanager
async def playwright_open_page(browser: Browser):
    """Async context manager yielding a page in a fresh browser context.

    A new context is created on ``browser`` and a page is opened in it.
    Both are closed when the ``async with`` block exits, whether it
    finishes normally or raises.

    Args:
        browser: Playwright browser on which the context is created.

    Yields:
        The newly opened page.
    """
    context: BrowserContext = await browser.new_context()
    try:
        page: Page = await context.new_page()
        try:
            yield page
        finally:
            await page.close()
    finally:
        # Nested so the context is released even when opening or closing
        # the page fails.
        await context.close()
# TODO: return results in the same {"title", "body", "href"} format as the other query_* functions
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
    """Query Google Patents and return links to the matching patents.

    The search page is opened in a throwaway page and polled until
    ``n_results`` result items have rendered, or 30 seconds have passed.
    The first patent id found in each item's text is collected.

    Args:
        browser: Playwright browser used to open the search page.
        q: Search query; it is URL-encoded before being sent.
        n_results: Maximum number of patents to return.

    Returns:
        A list of at most ``n_results`` dicts with keys ``"href"`` (the
        patent URL) and ``"id"`` (the patent id).

    Raises:
        TimeoutError: No result item rendered within 30 seconds. This is
            the ``TimeoutError`` name imported at the top of the file.
    """
    # Two capital letters, at least six digits, then an optional kind code
    # made of a letter and an optional digit.
    patent_id_pattern = re.compile(r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b")

    async with playwright_open_page(browser) as page:
        async def _block_resources(route, request):
            # Only the text of the page is read, so stylesheets and images
            # are not fetched.
            if request.resource_type in ["stylesheet", "image"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        url = f"https://patents.google.com/?q=({quote_plus(q)})&num={n_results}"
        await page.goto(url)

        result_items = page.locator("search-result-item")
        try:
            await page.wait_for_function(
                f"""() => document.querySelectorAll('search-result-item').length >= {n_results}""",
                timeout=30_000
            )
        except TimeoutError:
            # A query with fewer than n_results hits can never satisfy the
            # wait above. Keep what did render; fail only when nothing did.
            if not await result_items.all():
                raise
            logging.warning(
                "Google Patents rendered fewer than %s results for query %r",
                n_results, q)

        patent_ids = []
        for item in await result_items.all():
            all_text = " ".join(await item.locator("span").all_inner_texts())
            match = patent_id_pattern.search(all_text)
            if match:
                patent_ids.append(match.group(0))

        patents = [
            {"href": f"https://patents.google.com/patent/{patent_id}/en", "id": patent_id}
            for patent_id in patent_ids
        ]
        # max() keeps a negative n_results from slicing off the tail instead
        # of returning nothing.
        return patents[:max(n_results, 0)]
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
    """Query Brave Search and return the parsed result cards.

    Args:
        browser: Playwright browser used to open the search page.
        q: Search query; it is URL-encoded before being sent.
        n_results: Maximum number of results to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of at most ``n_results`` dicts with keys ``"title"``,
        ``"body"`` and ``"href"``. If a selector times out while parsing,
        the results collected so far are returned.

    Raises:
        BraveSearchBlockedException: The page has no result cards and its
            content contains the word "suspicious".
    """
    async with playwright_open_page(browser) as page:
        async def _block_resources(route, request):
            # Only the text of the page is read, so stylesheets and images
            # are not fetched.
            if request.resource_type in ["stylesheet", "image"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        search_url = f"https://search.brave.com/search?q={quote_plus(q)}"
        await page.goto(search_url)

        results_cards = await page.locator('.snippet').all()
        if not results_cards:
            page_content = await page.content()
            if "suspicious" in page_content:
                raise BraveSearchBlockedException()

        results = []
        try:
            for card in results_cards:
                # Checked before processing, so n_results <= 0 returns
                # nothing rather than one result.
                if len(results) >= n_results:
                    break

                title = await card.locator('.title').all_inner_texts()
                description = await card.locator('.snippet-description').all_inner_texts()
                href = await card.locator('a').nth(0).get_attribute('href')

                # Filter out results with no URL or brave-specific URLs
                if href is None or href.startswith('/'):
                    continue

                results.append({
                    "title": title[0] if title else "",
                    "body": description[0] if description else "",
                    "href": href
                })
        except TimeoutError as e:
            # Best effort: a timeout ends parsing and keeps what was found.
            logging.warning(
                "Timeout on selector while parsing Brave Search SERP: %s", e)

        return results
async def query_bing_search(browser: Browser, q: str, n_results: int = 10):
    """Query Bing and return the parsed organic results.

    Args:
        browser: Playwright browser used to open the search page.
        q: Search query; it is URL-encoded before being sent.
        n_results: Maximum number of results to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of at most ``n_results`` dicts with keys ``"title"``,
        ``"href"`` and ``"body"``, all stripped of surrounding whitespace.
        Items with no title link, title text or URL are skipped and do not
        count towards the limit.
    """
    # Snippet locations tried in order; the first with non-blank text wins.
    snippet_selectors = (
        "div.b_caption p",  # typical snippet
        "div.b_caption",    # sometimes snippet is here
        "div.b_snippet",    # used in some result types
        "div.b_text",       # used in some panels
        "p",                # fallback to any paragraph
    )

    async with playwright_open_page(browser) as page:
        async def _block_resources(route, request):
            # Only the text of the page is read, so stylesheets and images
            # are not fetched.
            if request.resource_type in ["stylesheet", "image"]:
                await route.abort()
            else:
                await route.continue_()

        await page.route("**/*", _block_resources)

        url = f"https://www.bing.com/search?q={quote_plus(q)}"
        await page.goto(url)
        await page.wait_for_selector("li.b_algo")

        results = []
        for item in await page.query_selector_all("li.b_algo"):
            # Limit the results kept, not the items inspected, so skipped
            # items do not shrink the output.
            if len(results) >= n_results:
                break

            title_el = await item.query_selector("h2 > a")
            if title_el is None:
                continue
            href = await title_el.get_attribute("href")
            title = await title_el.inner_text()
            if not (title and href):
                continue

            snippet = ""
            for selector in snippet_selectors:
                snippet_el = await item.query_selector(selector)
                if snippet_el:
                    snippet = await snippet_el.inner_text()
                    if snippet.strip():
                        break

            results.append({
                "title": title.strip(),
                "href": href.strip(),
                "body": snippet.strip()
            })

        return results
async def query_ddg_search(q: str, n_results: int = 10):
    """Query DuckDuckGo text search for ``q``.

    Args:
        q: Search query.
        n_results: Passed to the client as ``max_results``.

    Returns:
        A list of dicts with keys ``"title"``, ``"body"`` and ``"href"``,
        one per hit returned by the client.
    """
    # NOTE(review): text() is called without await, so it appears to run
    # synchronously inside this coroutine -- confirm whether it should be
    # offloaded to a worker thread.
    hits = DDGS().text(q, max_results=n_results)
    return [
        {"title": hit["title"], "body": hit["body"], "href": hit["href"]}
        for hit in hits
    ]