Spaces:
Running
Running
File size: 1,560 Bytes
c5f0fb9 845e223 c5f0fb9 845e223 e57e05b d74c328 845e223 c5f0fb9 a70af92 c5f0fb9 a70af92 c5f0fb9 a70af92 845e223 a70af92 845e223 a70af92 c5f0fb9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import nltk
import pandas as pd
from langchain_community.utilities import GoogleSerperAPIWrapper
from newspaper import Article
# Treat +/-inf as missing values so dropna() also filters them out.
# NOTE(review): `use_inf_as_na` was deprecated in pandas 2.1 and removed in
# pandas 3.0 — when upgrading, replace with an explicit
# df.replace([np.inf, -np.inf], np.nan) before the dropna call. TODO confirm
# the installed pandas version still supports this option.
pd.options.mode.use_inf_as_na = True
class GoogleSerperSearchAgent:
    """Search a fixed list of news sites via Google Serper and scrape each hit.

    The list of site domains is read from a text file (one domain per line);
    every query is fanned out across all sites and the scraped articles are
    returned as a pandas DataFrame.
    """

    def __init__(self, sources_path: str) -> None:
        """Load site domains and prepare the search API.

        Args:
            sources_path: Path to a text file with one site domain per line.
        """
        with open(sources_path) as f:
            # Skip blank lines so trailing newlines don't turn into empty
            # "site:" filters in _search().
            self._sources = [line.strip() for line in f if line.strip()]
        self._api = GoogleSerperAPIWrapper()
        # newspaper's Article.nlp() needs the punkt tokenizer; quiet=True
        # avoids re-download chatter on every construction.
        nltk.download("punkt", quiet=True)

    def run(self, query: str, limit_per_source: int = 3) -> pd.DataFrame:
        """Search every configured site for *query* and scrape each result.

        Args:
            query: Free-text search query.
            limit_per_source: Max results requested per site.

        Returns:
            DataFrame with columns title/url/date/author/content. Rows whose
            article body could not be fetched are dropped. Empty DataFrame
            when nothing was found.
        """
        results = []
        for site in self._sources:
            results.extend(self._search(query, site, limit_per_source))
        for result in results:
            # Merge scraped fields (title/author/content) into the search hit.
            result.update(self._fetch(result["url"]))
        df = pd.DataFrame(results)
        # Guard: with zero hits the frame has no "content" column and
        # dropna(subset=...) would raise a KeyError.
        if "content" in df.columns:
            df = df.dropna(subset=["content"])
        return df

    def _search(self, query: str, site: str | None = None, limit: int = 3) -> list[dict[str, str]]:
        """Run one Serper query, optionally restricted to a single site.

        Returns a list of {"title", "url", "date"} dicts ("date" may be "").
        """
        if site is not None:
            query += f" site:{site}"
        self._api.k = limit
        # "organic" is absent from the response when Serper finds no web
        # results for the query — default to an empty hit list.
        organic = self._api.results(query).get("organic", [])
        return [
            {"title": hit["title"], "url": hit["link"], "date": hit.get("date", "")}
            for hit in organic
        ]

    def _fetch(self, url: str) -> dict[str, str | None]:
        """Download and parse one article.

        Returns {"title", "author", "content"}; on any fetch/parse failure
        returns {"content": None} so run() drops the row instead of the
        whole run aborting on a single unreachable or malformed page.
        """
        article = Article(url=url)
        try:
            article.download()
            article.parse()
            article.nlp()
        except Exception:
            # newspaper raises ArticleException (and friends) for network
            # errors and unparseable pages; treat all of them as "no content".
            return {"content": None}
        return {
            "title": article.title,
            "author": article.authors[0] if article.authors else "",
            "content": article.text,
        }
|