raghavNCI committed
Commit · 2f96339 · 1 Parent(s): 0a59790

google search once again

Files changed:
- Dockerfile +0 -2
- nuse_modules/google_search.py +20 -12
- requirements.txt +2 -1
Dockerfile CHANGED
@@ -11,8 +11,6 @@ WORKDIR /app
 
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
-RUN pip install --no-cache-dir trafilatura
-RUN pip install --no-cache-dir "lxml[html_clean]"
 
 COPY --chown=user . /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
nuse_modules/google_search.py CHANGED
@@ -4,27 +4,35 @@ import os
 import requests
 import time
 from typing import List
-from …
+from boilerpy3 import extractors  # ← switched library
 
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
-GOOGLE_CX_ID …
+GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
+
+# initialise once (thread-safe)
+article_extractor = extractors.ArticleExtractor()
 
 
 def extract_full_text(url: str) -> str:
+    """
+    Download a page and return its readable main text.
+    Falls back to empty string on any failure.
+    """
     try:
-        …
-        if downloaded:
-            content = extract(downloaded, include_comments=False, include_tables=False)
-            return content or ""
+        return article_extractor.get_content_from_url(url) or ""
     except Exception as e:
         print(f"[SCRAPER ERROR] {url}: {e}")
-        …
+        return ""
 
 
 def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
+    """
+    Run a Google Custom Search and return a list of dicts with:
+        title, link, snippet, content (full article text)
+    """
     query = " ".join(keywords)
     url = (
-        …
+        "https://www.googleapis.com/customsearch/v1"
         f"?key={GOOGLE_API_KEY}&cx={GOOGLE_CX_ID}"
         f"&q={query}&num={num_results}"
     )
@@ -33,17 +41,17 @@ def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
     res = requests.get(url, timeout=10)
     res.raise_for_status()
     data = res.json()
-    results = []
 
+    results = []
     for item in data.get("items", []):
         link = item.get("link")
         article_text = extract_full_text(link)
 
         results.append({
-            "title": …
-            "link": …
+            "title": item.get("title"),
+            "link": link,
             "snippet": item.get("snippet"),
-            "content": article_text
+            "content": article_text,
         })
 
     return results
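A minimal usage sketch (not part of the commit) of the updated module, assuming GOOGLE_API_KEY and GOOGLE_CX_ID are exported in the environment and that nuse_modules is importable as a package; the keywords and printed fields are illustrative only and mirror the dict keys the new code returns.

# sketch only: call the rewritten search_google_news and inspect its output
from nuse_modules.google_search import search_google_news

if __name__ == "__main__":
    # hypothetical keywords; num_results maps to the API's &num= parameter
    articles = search_google_news(["nvidia", "earnings"], num_results=3)
    for a in articles:
        print(a["title"], a["link"])
        print(a["snippet"])
        print(a["content"][:200])  # first 200 chars of the extracted article body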
requirements.txt CHANGED
@@ -6,4 +6,5 @@ redis
 transformers
 accelerate
 torch
-huggingface_hub
+huggingface_hub
+boilerpy3==1.0.6
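For reference, a small sketch (also not in the commit) of the newly pinned boilerpy3 extractor being fed HTML fetched separately with requests, which is an alternative to get_content_from_url when you want to control the HTTP request yourself; the URL below is a placeholder.

# sketch only: boilerpy3 can parse an HTML string you fetched yourself
import requests
from boilerpy3 import extractors

extractor = extractors.ArticleExtractor()

html = requests.get("https://example.com/some-article", timeout=10).text  # placeholder URL
text = extractor.get_content(html)  # extract the readable main text from the HTML string
print(text[:300])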