"""Gradio demo: a small instruction-tuned model with optional web-search fallback.

The model answers the prompt first.  Only when web search is enabled, the
user has not forbidden it in the prompt, and the model's answer contains one
of the uncertainty markers, the app queries several search engines, scrapes
the top pages, and lets the model condense the scraped text.
"""

import logging
import re

import requests
from bs4 import BeautifulSoup
# DuckDuckGo Search
from duckduckgo_search import DDGS
# SerpAPI client for various search engines (Google, Bing, etc.)
from serpapi import GoogleSearch
from rake_nltk import Rake
import gradio as gr
from transformers import pipeline

logger = logging.getLogger(__name__)

# 1) Keyword extractor using RAKE
# NOTE(review): rake_nltk presumably needs the NLTK stopword/tokenizer data
# to be installed beforehand -- confirm for the deployment environment.
rake = Rake()


def extract_keywords(text: str) -> list[str]:
    """Return up to five of the highest-ranked RAKE phrases found in *text*."""
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()[:5]


# 2) Search functions
def ddg_search_links(query: str, num: int = 5) -> list[str]:
    """Return the result URLs of a DuckDuckGo text search for *query*."""
    ddgs = DDGS()
    results = ddgs.text(query, max_results=num)
    # Skip malformed entries instead of raising KeyError on a missing 'href'.
    return [r['href'] for r in results if r.get('href')]


def google_search_links(query: str, num: int = 5) -> list[str]:
    """Return the result URLs of a Google search for *query*.

    Uses the optional ``googlesearch`` package, imported lazily so the rest
    of the app still loads when it is not installed.
    """
    from googlesearch import search

    # ``num_results`` belongs to the googlesearch-python API, whose delay
    # keyword is ``sleep_interval``; ``pause`` is from a different package
    # and raises TypeError when combined with ``num_results``.
    return list(search(query, num_results=num, sleep_interval=2))


def serpapi_search_links(
    query: str, api_key: str, engine: str = 'bing', num: int = 5
) -> list[str]:
    """Return up to *num* non-sponsored organic result links from SerpAPI.

    Args:
        query: Search query string.
        api_key: SerpAPI key.
        engine: SerpAPI engine name, e.g. ``'bing'`` or ``'google'``.
        num: Maximum number of links to return.
    """
    params = {"engine": engine, "q": query, "api_key": api_key}
    client = GoogleSearch(params)
    data = client.get_dict()
    results = data.get('organic_results', [])
    links = [
        r['link'] for r in results
        if r.get('link') and not r.get('sponsored')
    ]
    return links[:num]


def _safe_search(search_fn, *args, **kwargs) -> list[str]:
    """Run one search backend, returning ``[]`` (and logging) if it fails."""
    try:
        return list(search_fn(*args, **kwargs))
    except Exception:
        # Boundary handler: search backends fail in many unrelated ways
        # (rate limits, network errors, missing optional packages); a single
        # broken engine must not abort the whole request.
        logger.warning(
            "Search via %s failed", search_fn.__name__, exc_info=True
        )
        return []


# 3) Fetch page text for summarization
def fetch_text(url: str) -> str:
    """Download *url* and return the text of its headings and paragraphs.

    Returns an empty string when the request fails or the server answers
    with an HTTP error status.
    """
    try:
        resp = requests.get(url, timeout=3)
        # Treat 4xx/5xx responses as failures so error pages are not
        # summarized as if they were content.
        resp.raise_for_status()
    except requests.RequestException:
        logger.warning("Could not fetch %s", url, exc_info=True)
        return ''
    soup = BeautifulSoup(resp.text, 'html.parser')
    texts = soup.find_all(['p', 'h1', 'h2', 'h3'])
    return ' '.join(t.get_text() for t in texts)


# 4) Model loader: lightweight HF model
# FLAN-T5 is an encoder-decoder model, so it must be served through the
# 'text2text-generation' task rather than the causal-LM 'text-generation'
# task.  It is a stock architecture, so trust_remote_code is not needed and
# is left at its safe default.
generator = pipeline('text2text-generation', model='google/flan-t5-small')


def model_answer(prompt: str) -> str:
    """Return the model's greedy-decoded output for *prompt*."""
    # truncation=True keeps over-long inputs (e.g. scraped pages) within the
    # model's maximum input length.
    outputs = generator(
        prompt, max_length=256, do_sample=False, truncation=True
    )
    return outputs[0]['generated_text']


# 5) Detect forbidden search phrases
FORBID_PATTERNS = [
    "bitte nicht im internet suchen",
    "keine websuche",
    "mach das ohne web",
    "ohne online",
    "nur dein wissen",
    "nicht googeln",
    "such nicht",
]


def search_forbidden(prompt: str) -> bool:
    """Return True if *prompt* contains a phrase that forbids web search."""
    pl = prompt.lower()
    return any(phrase in pl for phrase in FORBID_PATTERNS)


# 6) Check if answer is uncertain
UNCERTAIN_MARKERS = [
    "ich weiß nicht",
    "nicht in meinen daten",
    "keine information",
    "ich bin mir nicht sicher",
]


def is_uncertain(answer: str) -> bool:
    """Return True if *answer* contains one of the uncertainty markers."""
    al = answer.lower()
    return any(marker in al for marker in UNCERTAIN_MARKERS)


# 7) Core processing logic
def process(prompt: str, web_enabled: bool, serpapi_key: str) -> str:
    """Answer *prompt*, falling back to a web search when allowed and needed.

    Args:
        prompt: The user's question.
        web_enabled: Whether the web-search fallback may be used.
        serpapi_key: Optional SerpAPI key; empty disables the SerpAPI engines.

    Returns:
        The model's answer, or a model-generated summary of scraped pages
        when the model was uncertain and pages could be retrieved.
    """
    ans = model_answer(prompt)

    # If user forbids search
    if search_forbidden(prompt):
        if is_uncertain(ans):
            return (
                "Ich weiß leider nichts über das Thema aus meinem Training. "
                "Da du Websuche verboten hast, versuche ich es trotzdem, "
                "aber es kann ungenau sein.\n\n" + ans
            )
        return ans

    # Websearch disabled, or the model is confident: use the model's answer.
    if not web_enabled or not is_uncertain(ans):
        return ans

    # Model uncertain: perform multi-engine search.  Keywords are extracted
    # only here because no other path needs them; fall back to the raw
    # prompt if RAKE finds no phrases.
    keywords = extract_keywords(prompt)
    query = ' '.join(keywords) or prompt

    links = []
    links += _safe_search(google_search_links, query)
    links += _safe_search(ddg_search_links, query)
    api_key = (serpapi_key or '').strip()
    if api_key:
        links += _safe_search(
            serpapi_search_links, query, api_key, engine='bing'
        )
        links += _safe_search(
            serpapi_search_links, query, api_key, engine='google'
        )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_links = list(dict.fromkeys(links))

    # Fetch top 3 pages and summarize
    pages = (fetch_text(u) for u in unique_links[:3])
    texts = [t for t in pages if t]
    if not texts:
        # Nothing retrieved: the model's own answer is the best available.
        return ans
    combined = '\n'.join(texts)
    return model_answer(combined)


# 8) Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Intelligente KI mit Multi-Engine-Websuche")
    with gr.Row():
        prompt_input = gr.Textbox(label="Dein Prompt", lines=3)
        web_switch = gr.Checkbox(label="Websuche aktivieren", value=False)
    serp_input = gr.Textbox(
        label="SerpAPI Key (optional für SerpAPI-Suche)",
        placeholder="API Key einfügen",
        type="password",  # do not show the API key in clear text
    )
    btn = gr.Button("Antwort generieren")
    output = gr.Textbox(label="Antwort", lines=10)
    btn.click(
        fn=process,
        inputs=[prompt_input, web_switch, serp_input],
        outputs=output,
    )
    # The original called gr.Spacer() here; Gradio appears to have no such
    # component, so the horizontal rule below provides the visual separation.
    gr.Markdown("---")
    gr.Markdown(
        "*Hinweis: Suche nur bei aktivierter Websuche und nicht bei "
        "verbotenen Phrasen.*"
    )

if __name__ == "__main__":
    demo.launch()