# NOTE(review): removed notebook/UI artifact lines ("Spaces:" / "Running" / "Running")
# that were captured by the export and are not part of the program.
import nltk
import pandas as pd
from langchain_community.utilities import GoogleSerperAPIWrapper
from newspaper import Article

# NOTE(review): `use_inf_as_na` was deprecated in pandas 2.1 and removed in
# pandas 3.0. Kept here to preserve current behavior (inf treated as NaN by
# dropna); when upgrading pandas, replace with an explicit
# df.replace([np.inf, -np.inf], np.nan) at the call sites.
pd.options.mode.use_inf_as_na = True
class GoogleSerperSearchAgent:
    """Search a fixed list of sites via the Google Serper API and enrich
    each hit with the article content parsed by `newspaper`.

    The sites are read from a text file with one domain per line; results
    are returned as a pandas DataFrame with one row per article whose
    content could be fetched successfully.
    """

    def __init__(self, sources_path: str) -> None:
        """Load the source-site list and prepare the search API.

        Args:
            sources_path: Path to a text file with one site domain per line.
        """
        with open(sources_path) as f:
            # Skip blank lines so they don't become empty "site:" filters.
            self._sources = [line.strip() for line in f if line.strip()]
        self._api = GoogleSerperAPIWrapper()
        # newspaper's Article.nlp() needs the punkt tokenizer data.
        nltk.download("punkt")

    def run(self, query: str, limit_per_source: int = 3) -> pd.DataFrame:
        """Search every configured source for `query` and fetch each hit.

        Args:
            query: Free-text search query.
            limit_per_source: Maximum number of hits requested per site.

        Returns:
            DataFrame with columns title/url/date/author/content; rows whose
            article content could not be retrieved are dropped.
        """
        results: list[dict[str, str]] = []
        for site in self._sources:
            results.extend(self._search(query, site, limit_per_source))
        for result in results:
            try:
                fetched = self._fetch(result["url"])
            except Exception:
                # Best-effort: a single unreachable/unparsable article should
                # not abort the whole run. Mark content missing so the row is
                # dropped below.
                result["content"] = None
                continue
            # Article fields overwrite the search hit's (often truncated) title.
            result.update(fetched)
        df = pd.DataFrame(results)
        # Guard: an empty result set yields a DataFrame without a "content"
        # column, and dropna(subset=...) would raise KeyError on it.
        if "content" in df.columns:
            df.dropna(subset=["content"], inplace=True)
        return df

    def _search(self, query: str, site: str | None = None, limit: int = 3) -> list[dict[str, str]]:
        """Run one Serper search, optionally restricted to a single site.

        Args:
            query: Free-text search query.
            site: Domain to restrict the search to via a "site:" filter.
            limit: Maximum number of organic results to request.

        Returns:
            List of dicts with keys "title", "url" and "date" (empty string
            when Serper reports no date).
        """
        if site is not None:
            query += f" site:{site}"
        self._api.k = limit
        # Serper omits the "organic" key when there are no hits; treat that
        # as an empty result list instead of raising KeyError.
        organic = self._api.results(query).get("organic", [])
        return [
            {"title": hit["title"], "url": hit["link"], "date": hit.get("date", "")}
            for hit in organic
        ]

    def _fetch(self, url: str) -> dict[str, str]:
        """Download and parse one article.

        Args:
            url: Article URL to fetch.

        Returns:
            Dict with "title", "author" (first listed author, or "" when none
            are detected) and "content" (full article text).

        Raises:
            Exception: newspaper raises (e.g. ArticleException) on download
            or parse failures; callers are expected to handle this.
        """
        article = Article(url=url)
        article.download()
        article.parse()
        # nlp() populates summary/keywords; kept for parity even though only
        # title/authors/text are returned here.
        article.nlp()
        return {
            "title": article.title,
            "author": article.authors[0] if article.authors else "",
            "content": article.text,
        }