import requests
from bs4 import BeautifulSoup
# DuckDuckGo Search
from duckduckgo_search import DDGS
# SerpAPI client for various search engines (Google, Bing, etc.)
from serpapi import GoogleSearch
from rake_nltk import Rake
import gradio as gr
from transformers import pipeline
import nltk

# rake_nltk relies on NLTK's stopword list and sentence tokenizer; fetch them once at startup
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# 1) Keyword extractor using RAKE
rake = Rake()
def extract_keywords(text: str) -> list[str]:
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()[:5]
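
# Example (illustrative; the exact phrases depend on RAKE's stopword-based scoring):
#   extract_keywords("What is the current population of Berlin?")
#   -> ['current population', 'berlin']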

# 2) Search functions
def ddg_search_links(query: str, num: int = 5) -> list[str]:
    # DDGS can be used as a context manager, which closes its HTTP session cleanly
    with DDGS() as ddgs:
        results = ddgs.text(query, max_results=num)
    return [r['href'] for r in results if 'href' in r]

def google_search_links(query: str, num: int = 5) -> list[str]:
    # googlesearch-python throttles via `sleep_interval`; the `pause` keyword belongs
    # to the older `googlesearch` package and would raise a TypeError here
    from googlesearch import search
    return list(search(query, num_results=num, sleep_interval=2))

def serpapi_search_links(query: str, api_key: str, engine: str = 'bing', num: int = 5) -> list[str]:
    params = {"engine": engine, "q": query, "api_key": api_key}
    client = GoogleSearch(params)
    data = client.get_dict()
    results = data.get('organic_results', [])
    return [r['link'] for r in results if not r.get('sponsored')][:num]
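
# Example (illustrative; live results vary):
#   ddg_search_links("gradio tutorial", num=3)
#   -> ['https://www.gradio.app/guides/...', ...]
#   serpapi_search_links("gradio tutorial", api_key="YOUR_SERPAPI_KEY", engine="google")
#   -> up to 5 non-sponsored links taken from SerpAPI's 'organic_results'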

# 3) Fetch page text for summarization
def fetch_text(url: str) -> str:
    try:
        resp = requests.get(url, timeout=3)
        soup = BeautifulSoup(resp.text, 'html.parser')
        texts = soup.find_all(['p', 'h1', 'h2', 'h3'])
        return ' '.join(t.get_text() for t in texts)
    except:
        return ''
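
# Example (illustrative):
#   fetch_text("https://example.com")
#   -> 'Example Domain This domain is for use in illustrative examples in documents. ...'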

# 4) Model loader: lightweight HF model. flan-t5 is a seq2seq (encoder-decoder)
# model, so the matching pipeline task is 'text2text-generation', not 'text-generation'.
generator = pipeline('text2text-generation', model='google/flan-t5-small')

def model_answer(prompt: str) -> str:
    return generator(prompt, max_length=256, do_sample=False)[0]['generated_text']
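
# Example (output sketch; flan-t5-small is tiny, so expect terse and sometimes wrong answers):
#   model_answer("What is the capital of France?")
#   -> 'paris'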

# 5) Detect phrases that forbid a web search
FORBID_PATTERNS = [
    "please do not search the internet", "no web search", "do this without the web",
    "without going online", "only your own knowledge", "do not google", "do not search"
]
def search_forbidden(prompt: str) -> bool:
    pl = prompt.lower()
    return any(phrase in pl for phrase in FORBID_PATTERNS)
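
# Example (with the patterns above; matching is a plain lowercase substring test):
#   search_forbidden("Answer from memory, please do not search the internet!")  -> True
#   search_forbidden("What is the tallest mountain on Earth?")                  -> False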

# 6) Check whether the answer sounds uncertain
UNCERTAIN_MARKERS = [
    "i don't know", "not in my data", "no information", "i am not sure"
]
def is_uncertain(answer: str) -> bool:
    al = answer.lower()
    return any(marker in al for marker in UNCERTAIN_MARKERS)
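
# Example (same substring test as above):
#   is_uncertain("I don't know anything about that topic.")  -> True
#   is_uncertain("Paris is the capital of France.")          -> False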

# 7) Core processing logic
def process(prompt: str, web_enabled: bool, serpapi_key: str) -> str:
    # Extract keywords for search
    keywords = extract_keywords(prompt)
    query = ' '.join(keywords)

    # If user forbids search
    if search_forbidden(prompt):
        ans = model_answer(prompt)
        if is_uncertain(ans):
            return (
                "Unfortunately my training data tells me nothing about this topic. "
                "Since you have forbidden web search, I will answer anyway, "
                "but the answer may be inaccurate.\n\n" + ans
            )
        return ans

    # If web search is disabled, just use the model
    if not web_enabled:
        return model_answer(prompt)

    # Web search enabled: try the model first
    ans = model_answer(prompt)
    if not is_uncertain(ans):
        return ans

    # Model uncertain: perform multi-engine search
    links = []
    links += google_search_links(query)
    links += ddg_search_links(query)
    if serpapi_key:
        links += serpapi_search_links(query, serpapi_key, engine='bing')
        links += serpapi_search_links(query, serpapi_key, engine='google')
    unique_links = list(dict.fromkeys(links))

    # Fetch the top 3 pages and summarize them. flan-t5 is instruction-tuned,
    # so prepend an explicit summarization prompt; the input is truncated
    # because flan-t5-small only has a short (512-token) context window.
    texts = [fetch_text(u) for u in unique_links[:3]]
    combined = '\n'.join(texts)
    return model_answer("Summarize the following text:\n" + combined[:2000])
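
# Decision flow of process(), in reading order:
#   1. prompt forbids search      -> model only (with a caveat if it sounds uncertain)
#   2. web search switched off    -> model only
#   3. model answer is confident  -> return it unchanged
#   4. otherwise                  -> multi-engine search, fetch pages, summarize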

# 8) Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Intelligente KI mit Multi-Engine-Websuche")
    with gr.Row():
        prompt_input = gr.Textbox(label="Your prompt", lines=3)
        web_switch = gr.Checkbox(label="Enable web search", value=False)
    serp_input = gr.Textbox(label="SerpAPI key (optional, for SerpAPI search)", placeholder="Paste API key")
    btn = gr.Button("Generate answer")
    output = gr.Textbox(label="Answer", lines=10)

    btn.click(
        fn=process,
        inputs=[prompt_input, web_switch, serp_input],
        outputs=output
    )

    gr.Markdown("---")
    gr.Markdown("*Hinweis: Suche nur bei aktivierter Websuche und nicht bei verbotenen Phrasen.*")

demo.launch()