|
from asyncio import Semaphore |
|
import logging |
|
from urllib.parse import quote_plus |
|
from serp.base import PlaywrightSerpBackendBase, SerpQuery, SerpResultItem, playwright_open_page |
|
|
|
|
|
class BingSerpBackend(PlaywrightSerpBackendBase):
    """SERP backend that scrapes Bing's web search result page via Playwright."""

    # Requests of these resource types are aborted instead of being fetched.
    _BLOCKED_RESOURCE_TYPES = ("stylesheet", "image")

    # Snippet selectors, tried in this order for every result entry; the
    # first one whose element has non-blank text supplies the snippet.
    _SNIPPET_SELECTORS = (
        "div.b_caption p",
        "div.b_caption",
        "div.b_snippet",
        "div.b_text",
        "p",
    )

    def __init__(self):
        """Initialise the base backend and create this backend's semaphore."""
        super().__init__()
        # Handed to playwright_open_page on every query.
        # NOTE(review): presumably caps the number of concurrently open
        # pages at 4 -- confirm against serp.base.
        self.MAX_CONCURRENCY_SEMAPHORE = Semaphore(4)

    @property
    def name(self) -> str:
        """Identifier of this backend: ``"bing"``."""
        return "bing"

    @property
    def category(self):
        """Category label of this backend: ``"general"``."""
        return "general"

    async def _first_snippet(self, entry):
        """Return the stripped text of the first selector with non-blank text.

        Walks ``_SNIPPET_SELECTORS`` in order against *entry* and stops at
        the first matching element whose inner text is not blank. Returns
        ``""`` when no selector yields any text.
        """
        for css in self._SNIPPET_SELECTORS:
            node = await entry.query_selector(css)
            if not node:
                continue
            text = await node.inner_text()
            if text.strip():
                return text.strip()
        return ""

    async def query_serp_page(self, browser, query: SerpQuery):
        """Run *query* against Bing and collect the organic results.

        Args:
            browser: Browser object forwarded to ``playwright_open_page``.
            query: Supplies the search terms (``query.query``) and the slice
                bound applied to the result entries (``query.n_results``).

        Returns:
            A list of ``SerpResultItem`` built from the ``li.b_algo`` entries
            of the page. Entries without both a title and a link are skipped,
            so the list can be shorter than ``query.n_results``.

        Nothing is caught here: any exception raised by the Playwright calls
        (navigation, waiting for the result selector, ...) propagates.
        """
        async with playwright_open_page(browser, self.MAX_CONCURRENCY_SEMAPHORE) as page:

            async def _filter_request(route, request):
                # Abort stylesheet/image requests, let everything else pass.
                if request.resource_type in self._BLOCKED_RESOURCE_TYPES:
                    await route.abort()
                    return
                await route.continue_()

            await page.route("**/*", _filter_request)

            search_url = "https://www.bing.com/search?q=" + quote_plus(query.query)
            logging.info(search_url)
            await page.goto(search_url)

            # Wait until at least one organic result entry is present.
            await page.wait_for_selector("li.b_algo")

            entries = await page.query_selector_all("li.b_algo")
            hits = []
            for entry in entries[:query.n_results]:
                link = await entry.query_selector("h2 > a")
                if link:
                    href = await link.get_attribute("href")
                    title = await link.inner_text()
                else:
                    href, title = None, ""

                body = await self._first_snippet(entry)

                # Keep only entries that have both a title and a target URL.
                if not (title and href):
                    continue
                hits.append(
                    SerpResultItem(title=title.strip(), href=href.strip(), body=body)
                )

            return hits
|
|
|
|