raghavNCI committed
Commit 2f96339 · 1 Parent(s): 0a59790

google search once again

Files changed (3):
  1. Dockerfile +0 -2
  2. nuse_modules/google_search.py +20 -12
  3. requirements.txt +2 -1
Dockerfile CHANGED
@@ -11,8 +11,6 @@ WORKDIR /app
 
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
-RUN pip install --no-cache-dir trafilatura
-RUN pip install --no-cache-dir "lxml[html_clean]"
 
 COPY --chown=user . /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
nuse_modules/google_search.py CHANGED
@@ -4,27 +4,35 @@ import os
 import requests
 import time
 from typing import List
-from trafilatura import fetch_url, extract
+from boilerpy3 import extractors  # ← switched library
 
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
-GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
+GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
+
+# initialise once (thread-safe)
+article_extractor = extractors.ArticleExtractor()
 
 
 def extract_full_text(url: str) -> str:
+    """
+    Download a page and return its readable main text.
+    Falls back to empty string on any failure.
+    """
     try:
-        downloaded = fetch_url(url)
-        if downloaded:
-            content = extract(downloaded, include_comments=False, include_tables=False)
-            return content or ""
+        return article_extractor.get_content_from_url(url) or ""
     except Exception as e:
         print(f"[SCRAPER ERROR] {url}: {e}")
-    return ""
+        return ""
 
 
 def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
+    """
+    Run a Google Custom Search and return a list of dicts with:
+    title, link, snippet, content (full article text)
+    """
     query = " ".join(keywords)
     url = (
-        f"https://www.googleapis.com/customsearch/v1"
+        "https://www.googleapis.com/customsearch/v1"
         f"?key={GOOGLE_API_KEY}&cx={GOOGLE_CX_ID}"
         f"&q={query}&num={num_results}"
     )
@@ -33,17 +41,17 @@ def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
     res = requests.get(url, timeout=10)
     res.raise_for_status()
     data = res.json()
-    results = []
 
+    results = []
     for item in data.get("items", []):
         link = item.get("link")
         article_text = extract_full_text(link)
 
         results.append({
-            "title": item.get("title"),
-            "link": link,
+            "title": item.get("title"),
+            "link": link,
             "snippet": item.get("snippet"),
-            "content": article_text
+            "content": article_text,
         })
 
     return results
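
For context, a minimal usage sketch of the updated module (not part of the commit); it assumes GOOGLE_API_KEY and GOOGLE_CX_ID are set in the environment, and the example keywords are placeholders:

# hypothetical caller, for illustration only (not part of this commit)
from nuse_modules.google_search import search_google_news

# placeholder keywords; any list of strings works
articles = search_google_news(["climate", "policy"], num_results=3)
for article in articles:
    print(article["title"], article["link"])
    print(article["content"][:200])  # first 200 characters of the extracted body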
requirements.txt CHANGED
@@ -6,4 +6,5 @@ redis
 transformers
 accelerate
 torch
-huggingface_hub
+huggingface_hub
+boilerpy3==1.0.6