Spaces:
Running
Running
File size: 1,905 Bytes
c5f0fb9 845e223 c5f0fb9 25c72f2 c5f0fb9 845e223 e57e05b d74c328 845e223 c5f0fb9 ad719fb a70af92 c5f0fb9 25c72f2 c5f0fb9 a70af92 c5f0fb9 a70af92 f32c67b a70af92 845e223 a70af92 845e223 a70af92 c5f0fb9 25c72f2 c5f0fb9 25c72f2 c5f0fb9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import nltk
import pandas as pd
from langchain_community.utilities import GoogleSerperAPIWrapper
from newspaper import Article
from newspaper.article import ArticleException
# Treat +/-inf as missing values globally, so dropna() in run() also discards
# rows whose scraped fields came back as inf.
# NOTE(review): `use_inf_as_na` is deprecated in pandas 2.1+ and removed in
# pandas 3 — confirm the pinned pandas version before upgrading.
pd.options.mode.use_inf_as_na = True
class GoogleSerperSearchAgent:
def __init__(self, sources_path: str) -> None:
with open(sources_path) as f:
self._sources = [line.strip() for line in f.readlines()]
self._api = GoogleSerperAPIWrapper()
nltk.download("punkt")
def run(self, query: str, limit_per_source: int = 3) -> tuple[pd.DataFrame, dict[str, float]]:
results = []
for site in self._sources:
results.extend(self._search(query, site, limit_per_source))
for result in results:
article = self._fetch(result["url"])
if article is None:
continue
for key, value in article.items():
result[key] = value
df = pd.DataFrame(results)
df.dropna(subset=["content"], inplace=True)
df = df[["title", "author", "date", "url", "content"]]
return df, {"cost": 0.001 * len(self._sources)}
def _search(self, query: str, site: str | None = None, limit: int = 3) -> list[dict[str, str]]:
if site is not None:
query += f" site:{site}"
self._api.k = limit
results = self._api.results(query)["organic"]
results = [{"title": result["title"], "url": result["link"], "date": result.get("date", "")} for result in results]
return results
def _fetch(self, url: str) -> dict[str, str] | None:
article = Article(url=url)
try:
article.download()
except ArticleException:
return None
article.parse()
article.nlp()
return {"title": article.title, "author": article.authors[0] if article.authors else "", "content": article.text}
|