Spaces:
Running
Running
abdoh-alkhateeb
committed on
Commit
•
c5f0fb9
1
Parent(s):
cb980d4
Improve serper agent
Browse files
agents/google_serper_search_agent.py
CHANGED
@@ -1,5 +1,9 @@
|
|
|
|
1 |
import pandas as pd
|
2 |
from langchain_community.utilities import GoogleSerperAPIWrapper
|
|
|
|
|
|
|
3 |
|
4 |
|
5 |
class GoogleSerperSearchAgent:
|
@@ -9,12 +13,19 @@ class GoogleSerperSearchAgent:
|
|
9 |
|
10 |
self._api = GoogleSerperAPIWrapper()
|
11 |
|
|
|
|
|
12 |
def run(self, query: str, limit_per_source: int = 3) -> pd.DataFrame:
|
13 |
results = []
|
14 |
for site in self._sources:
|
15 |
results.extend(self._search(query, site, limit_per_source))
|
16 |
|
|
|
|
|
|
|
|
|
17 |
df = pd.DataFrame(results)
|
|
|
18 |
|
19 |
return df
|
20 |
|
@@ -28,3 +39,12 @@ class GoogleSerperSearchAgent:
|
|
28 |
results = [{"title": result["title"], "url": result["link"], "date": result.get("date", "")} for result in results]
|
29 |
|
30 |
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
import pandas as pd
|
3 |
from langchain_community.utilities import GoogleSerperAPIWrapper
|
4 |
+
from newspaper import Article
|
5 |
+
|
6 |
+
pd.options.mode.use_inf_as_na = True
|
7 |
|
8 |
|
9 |
class GoogleSerperSearchAgent:
|
|
|
13 |
|
14 |
self._api = GoogleSerperAPIWrapper()
|
15 |
|
16 |
+
nltk.download("punkt")
|
17 |
+
|
18 |
def run(self, query: str, limit_per_source: int = 3) -> pd.DataFrame:
    """Search all configured sources and return enriched article rows.

    Runs *query* against every site in ``self._sources`` via the Serper
    API, fetches the full article behind each hit with ``self._fetch``,
    and keeps only rows whose fetched ``content`` is non-null.

    Args:
        query: Free-text search query.
        limit_per_source: Maximum number of hits kept per source site.

    Returns:
        DataFrame with the search-hit columns (``title``, ``url``,
        ``date``) merged with the fields returned by ``self._fetch``
        (e.g. ``content``, ``author``); rows missing ``content`` are
        dropped. Empty when no source produced any hit.
    """
    results = []
    for site in self._sources:
        results.extend(self._search(query, site, limit_per_source))

    # Enrich each search hit in place with the fetched article fields;
    # dict.update replaces the original per-key copy loop.
    for result in results:
        result.update(self._fetch(result["url"]))

    df = pd.DataFrame(results)

    # Guard: with zero hits the DataFrame has no columns at all, and
    # dropna(subset=["content"]) would raise KeyError on the missing
    # column. Only filter when the column actually exists.
    if "content" in df.columns:
        df = df.dropna(subset=["content"])

    return df
|
31 |
|
|
|
39 |
results = [{"title": result["title"], "url": result["link"], "date": result.get("date", "")} for result in results]
|
40 |
|
41 |
return results
|
42 |
+
|
43 |
+
def _fetch(self, url: str) -> dict[str, str]:
    """Download and parse the article at *url*.

    Returns a dict with keys ``title``, ``author`` (first extracted
    author, or the empty string when none were found) and ``content``
    (the article's full text).
    """
    page = Article(url=url)

    # newspaper's required call order: fetch the HTML, extract the
    # article fields, then run the NLP pass.
    page.download()
    page.parse()
    page.nlp()

    authors = page.authors
    return {
        "title": page.title,
        "author": authors[0] if authors else "",
        "content": page.text,
    }
|