Add Gradio web search application and update README with usage instructions
- README.md +28 -1
- app.py +104 -0
- requirements.txt +4 -0
README.md
CHANGED
@@ -9,4 +9,31 @@ app_file: app.py
pinned: false
---

# Gradio News-to-Context Service

## Prerequisites

`$ pip install gradio httpx trafilatura python-dateutil`

## Environment

`export SERPER_API_KEY="YOUR-KEY-HERE"`

## How it works – design notes

| Step | Technique | Why it matters |
|---|---|---|
| API search | Serper's Google News JSON endpoint | Fast, cost-effective, and immune to Google's bot-blocking. |
| Concurrency | `httpx.AsyncClient` + `asyncio.gather` | Fetches 10 articles in under 2 s on typical broadband. |
| Extraction | Trafilatura | Consistently tops accuracy comparisons for main-content extraction and needs no browser or heavy ML models. |
| Date parsing | `python-dateutil` | Converts fuzzy strings ("16 hours ago") into ISO YYYY-MM-DD dates so the LLM sees absolute dates. |
| LLM-friendly output | Markdown headings and horizontal rules | Chunk boundaries are explicit; hyperlinks are preserved for optional citation. |

## Extending in production

* **Caching** – add `aiocache` or Redis to avoid re-fetching identical URLs within a TTL (sketch below).
* **Long-content trimming** – if an article can exceed your LLM's context window, pipe `body` through a sentence ranker or GPT-based summariser before concatenation (sketch below).
* **Paywalls / PDFs** – guard `extract_main_text` with fallback libraries (e.g. `readability-lxml` or `pymupdf`) for unusual formats (sketch below).
* **Rate-limiting** – Serper's free tier allows 100 requests/day; wrap the call with exponential backoff on HTTP 429 (sketch below).
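As a rough illustration of the caching bullet, here is a minimal in-memory TTL cache; in production you would swap the dict for `aiocache` or Redis, and `CACHE_TTL`, `_cache` and `fetch_html_cached` are illustrative names that do not exist in `app.py`:

```python
import time
import httpx

CACHE_TTL = 900  # seconds; illustrative value, tune to your freshness needs
_cache: dict[str, tuple[float, str]] = {}  # url -> (fetched_at, html)

async def fetch_html_cached(url: str, client: httpx.AsyncClient) -> str:
    """Return cached HTML when it is younger than CACHE_TTL, otherwise refetch."""
    hit = _cache.get(url)
    if hit and time.time() - hit[0] < CACHE_TTL:
        return hit[1]
    resp = await client.get(url)
    _cache[url] = (time.time(), resp.text)
    return resp.text
```

Wiring this into `fetch_html_many` means gathering `fetch_html_cached(u, client)` coroutines and appending the returned strings directly instead of `r.text`.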
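For long-content trimming, a naive character budget can stand in until a real sentence ranker or summariser is wired in; `MAX_CHARS` and `trim_body` are illustrative, not part of `app.py`:

```python
MAX_CHARS = 4_000  # illustrative per-article budget

def trim_body(body: str, limit: int = MAX_CHARS) -> str:
    """Naively cap an article's text, cutting at the last paragraph break before the limit."""
    if len(body) <= limit:
        return body
    cut = body.rfind("\n", 0, limit)
    return body[: cut if cut > 0 else limit].rstrip() + "\n[truncated]"
```

In `build_context`, `body = trim_body(extract_main_text(html))` would apply the cap before the chunk is assembled.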
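For awkward markup, one possible fallback shape puts `readability-lxml` behind Trafilatura (an assumed extra dependency; PDFs would still need their own branch, e.g. `pymupdf`, keyed on the response content type):

```python
import trafilatura
from readability import Document  # pip install readability-lxml

def extract_main_text_with_fallback(html: str) -> str:
    """Try Trafilatura first; fall back to readability-lxml when it returns nothing."""
    if not html:
        return ""
    text = trafilatura.extract(html, include_formatting=False, include_comments=False)
    if text:
        return text
    try:
        # readability returns cleaned HTML; run it through Trafilatura again for plain text.
        cleaned = Document(html).summary()
        return trafilatura.extract(cleaned) or ""
    except Exception:
        return ""
```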
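And for rate-limiting, a retry wrapper around the Serper call might look like the sketch below; it assumes `get_serper_news` from `app.py` is in scope, and the retry count and delays are arbitrary:

```python
import asyncio
import httpx

async def get_serper_news_with_backoff(query: str, num: int = 10, retries: int = 4) -> list[dict]:
    """Retry the Serper call with exponential backoff when it answers HTTP 429."""
    delay = 1.0
    for attempt in range(retries):
        try:
            return await get_serper_news(query, num=num)
        except httpx.HTTPStatusError as exc:
            if exc.response.status_code != 429 or attempt == retries - 1:
                raise
            await asyncio.sleep(delay)
            delay *= 2  # 1 s, 2 s, 4 s, ...
    return []  # unreachable; keeps type checkers happy
```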
Drop this file into any Python 3.10+ environment, set `SERPER_API_KEY`, pip install the four libraries, and you have a ready-to-embed "query → context" micro-service for your LLM pipeline.
app.py
ADDED
@@ -0,0 +1,104 @@
"""
Web Search - Feed LLMs with fresh sources
==========================================

Prerequisites
-------------
$ pip install gradio httpx trafilatura python-dateutil

Environment
-----------
export SERPER_API_KEY="YOUR-KEY-HERE"
"""

import os, json, asyncio, httpx, trafilatura, gradio as gr
from dateutil import parser as dateparser
from pathlib import Path

SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_ENDPOINT = "https://google.serper.dev/news"
HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}


### 1 ─ Serper call -------------------------------------------------------------
async def get_serper_news(query: str, num: int = 10) -> list[dict]:
    payload = {"q": query, "type": "news", "num": num, "page": 1}
    async with httpx.AsyncClient(timeout=15) as client:
        resp = await client.post(SERPER_ENDPOINT, headers=HEADERS, json=payload)
        resp.raise_for_status()
        return resp.json()["news"]


### 2 ─ Concurrent HTML downloads ----------------------------------------------
async def fetch_html_many(urls: list[str]) -> list[str]:
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        tasks = [client.get(u) for u in urls]
        responses = await asyncio.gather(*tasks, return_exceptions=True)
    html_pages = []
    for r in responses:
        if isinstance(r, Exception):
            html_pages.append("")  # keep positions aligned
        else:
            html_pages.append(r.text)
    return html_pages


### 3 ─ Main-content extraction -------------------------------------------------
def extract_main_text(html: str) -> str:
    if not html:
        return ""
    # Trafilatura auto-detects language, removes boilerplate & returns plain text.
    return (
        trafilatura.extract(html, include_formatting=False, include_comments=False)
        or ""
    )


### 4 ─ Orchestration -----------------------------------------------------------
async def build_context(query: str, k: int = 10) -> str:
    news_items = await get_serper_news(query, num=k)
    urls = [n["link"] for n in news_items]
    raw_pages = await fetch_html_many(urls)

    chunks = []
    for meta, html in zip(news_items, raw_pages):
        body = extract_main_text(html)
        if not body:
            continue  # skip if extraction failed
        # Normalise Serper's relative date ("21 hours ago") to an ISO date
        try:
            date_iso = dateparser.parse(meta.get("date", ""), fuzzy=True).strftime(
                "%Y-%m-%d"
            )
        except Exception:
            date_iso = meta.get("date", "")
        chunk = (
            f"## {meta['title']}\n"
            f"**Source:** {meta['source']} "
            f"**Date:** {date_iso}\n"
            f"{meta['link']}\n\n"
            f"{body.strip()}\n"
        )
        chunks.append(chunk)

    return "\n---\n".join(chunks) or "No extractable content found."


### 5 ─ Gradio user interface ---------------------------------------------------
async def handler(user_query: str, k: int) -> str:
    if not SERPER_API_KEY:
        return "✖️ SERPER_API_KEY is not set."
    return await build_context(user_query, k)


with gr.Blocks(title="WebSearch") as demo:
    gr.Markdown("# 🔍 Web Search\n" "Feed LLMs with fresh sources.")
    query = gr.Textbox(label="Query", placeholder='e.g. "apple inc"')
    top_k = gr.Slider(1, 20, value=10, step=1, label="How many results?")
    out = gr.Textbox(label="Extracted Context", lines=25)
    run = gr.Button("Fetch")
    run.click(handler, inputs=[query, top_k], outputs=out)

if __name__ == "__main__":
    # Pass share=True to launch() when running on Colab/VMs; edit as you wish.
    demo.launch()
requirements.txt
ADDED
@@ -0,0 +1,4 @@
gradio
httpx
trafilatura
python-dateutil