# WebGPT / app.py
# LejobuildYT's picture
# Update app.py
# b3ba34d verified
import logging
import re

import gradio as gr
import requests
from bs4 import BeautifulSoup
# DuckDuckGo Search
from duckduckgo_search import DDGS
from rake_nltk import Rake
# SerpAPI client for various search engines (Google, Bing, etc.)
from serpapi import GoogleSearch
from transformers import pipeline
# 1) Keyword extraction (RAKE)
# A single module-level Rake instance is reused by every call below.
rake = Rake()


def extract_keywords(text: str) -> list[str]:
    """Return at most five key phrases extracted from *text*.

    The phrases come from ``Rake.get_ranked_phrases()`` and keep the order
    in which that method returns them; only the first five are kept.
    """
    rake.extract_keywords_from_text(text)
    ranked_phrases = rake.get_ranked_phrases()
    top_phrases = ranked_phrases[:5]
    return top_phrases
# 2) Search back-ends
def ddg_search_links(query: str, num: int = 5) -> list[str]:
    """Search DuckDuckGo for *query* and return the result URLs.

    Args:
        query: Search string passed to ``DDGS.text``.
        num: Value forwarded as ``max_results``.

    Returns:
        The ``'href'`` entry of every result, in the order received.
    """
    hits = DDGS().text(query, max_results=num)
    links = []
    for hit in hits:
        links.append(hit['href'])
    return links
def google_search_links(query: str, num: int = 5) -> list[str]:
    """Search Google for *query* and return up to *num* result URLs.

    Two different PyPI packages install a module named ``googlesearch``
    and their ``search`` signatures are incompatible:

    * ``googlesearch-python``: ``search(term, num_results=..., sleep_interval=...)``
    * legacy ``google``: ``search(query, num=..., stop=..., pause=...)``

    The previous call mixed ``num_results`` with ``pause`` and therefore
    raised ``TypeError`` with either package. The modern signature is tried
    first; a ``TypeError`` at call time falls back to the legacy one.

    Args:
        query: Search string.
        num: Maximum number of links to request.

    Returns:
        The result URLs as a list.
    """
    # Imported lazily so the app still starts when the package is missing
    # and this back-end is never reached.
    from googlesearch import search

    try:
        results = search(query, num_results=num, sleep_interval=2)
    except TypeError:
        # Argument binding failed: this is the legacy ``google`` package.
        results = search(query, num=num, stop=num, pause=2)
    # Materialise outside the try so a TypeError raised while iterating is
    # not mistaken for a signature mismatch.
    return list(results)
def serpapi_search_links(query: str, api_key: str, engine: str = 'bing', num: int = 5) -> list[str]:
    """Run a SerpAPI search and return the non-sponsored result links.

    Args:
        query: Search string sent as ``q``.
        api_key: SerpAPI key sent as ``api_key``.
        engine: Value sent as ``engine``.
        num: Maximum number of links returned.

    Returns:
        The ``'link'`` of each entry under ``'organic_results'`` whose
        ``'sponsored'`` value is missing or falsy, cut to *num* items.
        An absent ``'organic_results'`` key yields an empty list.
    """
    search_params = {"engine": engine, "q": query, "api_key": api_key}
    response = GoogleSearch(search_params).get_dict()
    links = []
    for entry in response.get('organic_results', []):
        if entry.get('sponsored'):
            continue
        links.append(entry['link'])
    return links[:num]
# 3) Fetch page text for summarization
def fetch_text(url: str) -> str:
    """Download *url* and return the text of its paragraphs and headings.

    This is a best-effort helper: any failure while downloading or parsing
    is logged and reported as an empty string instead of being raised.

    Args:
        url: Address of the page to download (3 second timeout).

    Returns:
        The text of all ``<p>``, ``<h1>``, ``<h2>`` and ``<h3>`` elements
        joined by single spaces, or ``''`` when the page could not be
        fetched, answered with an HTTP error status, or could not be parsed.
    """
    try:
        resp = requests.get(url, timeout=3)
        # Without this, the body of a 4xx/5xx error page would be treated
        # as real content and handed to the summarizer.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
    except Exception as exc:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        logging.getLogger(__name__).warning("Could not fetch %s: %s", url, exc)
        return ''
    elements = soup.find_all(['p', 'h1', 'h2', 'h3'])
    return ' '.join(element.get_text() for element in elements)
# 4) Model loader: lightweight HF model
# FLAN-T5 is an encoder-decoder (sequence-to-sequence) model, so it has to be
# loaded through the 'text2text-generation' task; 'text-generation' expects a
# decoder-only causal language model. ``trust_remote_code`` is deliberately
# not set: the checkpoint uses the built-in T5 implementation, and enabling
# the flag would only allow execution of code shipped with a model repo.
generator = pipeline('text2text-generation', model='google/flan-t5-small')


def model_answer(prompt: str) -> str:
    """Generate the model's answer for *prompt*.

    Generation is greedy (``do_sample=False``) and limited by
    ``max_length=256``.

    Args:
        prompt: Input text passed to the pipeline unchanged.

    Returns:
        The ``'generated_text'`` field of the first pipeline result.
    """
    outputs = generator(prompt, max_length=256, do_sample=False)
    return outputs[0]['generated_text']
# 5) Phrases with which the user forbids a web search
FORBID_PATTERNS = [
    "bitte nicht im internet suchen", "keine websuche", "mach das ohne web",
    "ohne online", "nur dein wissen", "nicht googeln", "such nicht"
]


def search_forbidden(prompt: str) -> bool:
    """Tell whether *prompt* contains one of the ``FORBID_PATTERNS``.

    The prompt is lower-cased first and every pattern is tested as a plain
    substring, so a pattern also matches inside a longer word.
    """
    lowered = prompt.lower()
    for phrase in FORBID_PATTERNS:
        if phrase in lowered:
            return True
    return False
# 6) Check if answer is uncertain
# NOTE(review): all markers are German; presumably the model is expected to
# answer in German - confirm, otherwise these markers may never match.
UNCERTAIN_MARKERS = [
    "ich weiß nicht", "nicht in meinen daten", "keine information", "ich bin mir nicht sicher"
]


def is_uncertain(answer: str) -> bool:
    """Tell whether *answer* contains one of the ``UNCERTAIN_MARKERS``.

    Both the answer and the markers are compared after ``str.casefold()``.
    Unlike ``str.lower()``, case folding maps "ß" to "ss", so the marker
    "ich weiß nicht" also matches "ICH WEISS NICHT" and the spelling
    "ich weiss nicht". Everything that matched with ``lower()`` still does.

    Args:
        answer: Text produced by the model.

    Returns:
        ``True`` when any marker occurs as a substring of *answer*.
    """
    folded = answer.casefold()
    return any(marker.casefold() in folded for marker in UNCERTAIN_MARKERS)
# 7) Core processing logic
# Upper bound on the number of characters of fetched page text handed to the
# model, so several full web pages are never pushed through it at once.
# NOTE(review): 2000 is a rough guess for a small model - tune as needed.
_MAX_CONTEXT_CHARS = 2000


def _collect_links(query: str, serpapi_key: str) -> list[str]:
    """Ask every available search back-end for *query* and merge the links.

    A back-end that raises is logged and skipped, so one failing engine does
    not abort the whole request. The SerpAPI back-ends are only used when
    *serpapi_key* is non-empty. Duplicates are removed, first occurrence wins.
    """
    backends = [
        ("google", lambda: google_search_links(query)),
        ("duckduckgo", lambda: ddg_search_links(query)),
    ]
    if serpapi_key:
        backends.append(
            ("serpapi-bing", lambda: serpapi_search_links(query, serpapi_key, engine='bing'))
        )
        backends.append(
            ("serpapi-google", lambda: serpapi_search_links(query, serpapi_key, engine='google'))
        )

    links = []
    for name, run_search in backends:
        try:
            links.extend(run_search())
        except Exception as exc:
            # Boundary to third-party search clients with unrelated
            # exception types; degrade to the remaining engines.
            logging.getLogger(__name__).warning("Search via %s failed: %s", name, exc)
    # dict.fromkeys keeps insertion order while dropping duplicates.
    return list(dict.fromkeys(links))


def process(prompt: str, web_enabled: bool, serpapi_key: str) -> str:
    """Answer *prompt*, falling back to a web search when allowed and needed.

    Flow:
      1. A blank prompt returns ``''`` without calling the model.
      2. If the prompt forbids searching, the model answer is returned; an
         uncertain answer is prefixed with a warning.
      3. Otherwise the model answers first. That answer is returned when
         web search is disabled or the answer is not uncertain.
      4. Otherwise keywords are extracted from the prompt, all search
         back-ends are queried, the first three unique pages are fetched
         and their text (capped at ``_MAX_CONTEXT_CHARS``) is passed to the
         model. If no page text could be obtained, the model's own answer
         is returned.

    Args:
        prompt: The user's question.
        web_enabled: Whether web search may be used.
        serpapi_key: Optional SerpAPI key; empty or ``None`` disables SerpAPI.

    Returns:
        The text to show to the user.
    """
    if not prompt or not prompt.strip():
        return ''

    # If user forbids search
    if search_forbidden(prompt):
        ans = model_answer(prompt)
        if is_uncertain(ans):
            return (
                "Ich weiß leider nichts über das Thema aus meinem Training. "
                "Da du Websuche verboten hast, versuche ich es trotzdem, "
                "aber es kann ungenau sein.\n\n" + ans
            )
        return ans

    # Model first; stop here when searching is disabled or unnecessary.
    ans = model_answer(prompt)
    if not web_enabled or not is_uncertain(ans):
        return ans

    # Keywords are only needed on the search path. Fall back to the raw
    # prompt when no key phrase was found, to avoid an empty query.
    keywords = extract_keywords(prompt)
    query = ' '.join(keywords) or prompt

    unique_links = _collect_links(query, (serpapi_key or '').strip())

    # Fetch top 3 pages and summarize
    page_texts = [fetch_text(link) for link in unique_links[:3]]
    combined = '\n'.join(text for text in page_texts if text).strip()
    if not combined:
        # Nothing usable was retrieved; the model's answer is all we have.
        return ans
    return model_answer(combined[:_MAX_CONTEXT_CHARS])
# 8) Gradio UI
# NOTE(review): the nesting below is reconstructed; the three input widgets
# are assumed to share one row - confirm against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# Intelligente KI mit Multi-Engine-Websuche")
    with gr.Row():
        prompt_input = gr.Textbox(label="Dein Prompt", lines=3)
        web_switch = gr.Checkbox(label="Websuche aktivieren", value=False)
        # Masked input: the API key is a secret and should not be displayed
        # in clear text on screen.
        serp_input = gr.Textbox(
            label="SerpAPI Key (optional für SerpAPI-Suche)",
            placeholder="API Key einfügen",
            type="password",
        )
    btn = gr.Button("Antwort generieren")
    output = gr.Textbox(label="Antwort", lines=10)
    # Input order must match the parameter order of process().
    btn.click(
        fn=process,
        inputs=[prompt_input, web_switch, serp_input],
        outputs=output
    )
    # The former ``gr.Spacer()`` call was removed: Gradio has no such
    # component, so it raised AttributeError while building the UI. The
    # horizontal rule below already separates the footer note.
    gr.Markdown("---")
    gr.Markdown("*Hinweis: Suche nur bei aktivierter Websuche und nicht bei verbotenen Phrasen.*")
demo.launch()