File size: 1,905 Bytes
c5f0fb9
845e223
 
c5f0fb9
25c72f2
c5f0fb9
 
845e223
 
e57e05b
d74c328
 
 
 
845e223
 
c5f0fb9
 
ad719fb
a70af92
 
 
 
c5f0fb9
25c72f2
 
 
 
 
 
c5f0fb9
 
a70af92
c5f0fb9
a70af92
f32c67b
 
 
a70af92
 
 
 
845e223
 
 
 
a70af92
845e223
a70af92
c5f0fb9
25c72f2
c5f0fb9
 
25c72f2
 
 
 
 
c5f0fb9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import nltk
import pandas as pd
from langchain_community.utilities import GoogleSerperAPIWrapper
from newspaper import Article
from newspaper.article import ArticleException

# Treat +/-inf values as NA so that dropna() in run() also removes rows
# whose scraped fields came back infinite.
# NOTE(review): ``use_inf_as_na`` is deprecated since pandas 2.1 and removed
# in pandas 3.0 — confirm the pinned pandas version still supports it.
pd.options.mode.use_inf_as_na = True


class GoogleSerperSearchAgent:
    """Search a fixed list of news sites via Google Serper and scrape the hits.

    The sites are read from a text file (one domain per line); each query is
    issued once per site with a ``site:`` filter, and every organic hit is
    downloaded and parsed with ``newspaper``.
    """

    def __init__(self, sources_path: str) -> None:
        """Load the source list and initialize the Serper API client.

        Args:
            sources_path: Path to a text file with one site/domain per line.
        """
        with open(sources_path) as f:
            # Strip whitespace and drop blank lines: an empty entry would
            # otherwise produce a broken " site:" suffix in _search().
            self._sources = [line.strip() for line in f if line.strip()]

        self._api = GoogleSerperAPIWrapper()

        # newspaper's Article.nlp() relies on NLTK's punkt tokenizer.
        nltk.download("punkt")

    def run(self, query: str, limit_per_source: int = 3) -> tuple[pd.DataFrame, dict[str, float]]:
        """Search every source for ``query`` and return the scraped articles.

        Args:
            query: Free-text search query.
            limit_per_source: Maximum number of hits requested per source.

        Returns:
            A ``(df, meta)`` tuple: ``df`` has columns
            title/author/date/url/content with unfetchable articles dropped;
            ``meta`` holds a cost estimate of one Serper call per source.
        """
        results = []
        for site in self._sources:
            results.extend(self._search(query, site, limit_per_source))

        for result in results:
            article = self._fetch(result["url"])

            if article is None:
                continue

            result.update(article)

        columns = ["title", "author", "date", "url", "content"]
        cost = {"cost": 0.001 * len(self._sources)}

        df = pd.DataFrame(results)

        # Guard: when there are no results, or no article could be fetched,
        # the frame lacks a "content" column and dropna()/column selection
        # would raise KeyError. Return an empty frame with the same schema.
        if "content" not in df.columns:
            return pd.DataFrame(columns=columns), cost

        df = df.dropna(subset=["content"])
        df = df[columns]

        return df, cost

    def _search(self, query: str, site: str | None = None, limit: int = 3) -> list[dict[str, str]]:
        """Run one Serper search, optionally restricted to a single site.

        Returns:
            One ``{"title", "url", "date"}`` dict per organic hit; ``date``
            falls back to "" when Serper omits it.
        """
        if site is not None:
            query += f" site:{site}"

        # GoogleSerperAPIWrapper exposes the result count as the mutable ``k``.
        self._api.k = limit

        organic = self._api.results(query)["organic"]
        return [
            {"title": hit["title"], "url": hit["link"], "date": hit.get("date", "")}
            for hit in organic
        ]

    def _fetch(self, url: str) -> dict[str, str] | None:
        """Download and parse a single article.

        Returns:
            A ``{"title", "author", "content"}`` dict, or None when the
            article cannot be retrieved or parsed.
        """
        article = Article(url=url)

        try:
            article.download()
            # parse() can also raise ArticleException (e.g. on empty or
            # invalid HTML); the original code left it outside the try and
            # crashed the whole run() on a single bad page.
            article.parse()
        except ArticleException:
            return None

        # NOTE(review): nlp() only computes summary/keywords, none of which
        # are used below — kept for behavioral parity; candidate for removal.
        article.nlp()

        return {
            "title": article.title,
            "author": article.authors[0] if article.authors else "",
            "content": article.text,
        }