File size: 6,431 Bytes
e3f9c03
 
 
 
 
 
 
 
 
 
 
11133cd
e3f9c03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11133cd
e3f9c03
 
 
 
2981248
75b97b8
 
 
 
 
2981248
 
e3f9c03
 
 
11133cd
e3f9c03
 
 
 
 
 
 
 
 
11133cd
 
e3f9c03
 
11133cd
e3f9c03
 
 
 
 
 
 
 
 
 
11133cd
e3f9c03
11133cd
e3f9c03
 
 
11133cd
 
 
 
 
 
 
e3f9c03
 
 
 
 
 
 
 
 
 
 
11133cd
e3f9c03
11133cd
e3f9c03
11133cd
 
e3f9c03
 
 
11133cd
e3f9c03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11133cd
 
 
 
 
e3f9c03
 
 
11133cd
 
 
 
 
 
e3f9c03
 
 
 
11133cd
e3f9c03
 
 
 
 
 
11133cd
e3f9c03
 
 
 
 
 
f1cfb87
e3f9c03
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import os
import pickle
from urllib.parse import urldefrag, urljoin, urlparse

import faiss
import gradio as gr
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi, InferenceClient
from sentence_transformers import SentenceTransformer

# Hugging Face Space persistence: indexes built locally are mirrored to this
# Space repository so they survive Space restarts.
HF_REPO_ID = "MoslemBot/kajiweb"
# Write token read from the environment; None when HF_TOKEN is unset, in which
# case uploads will fail at call time.
HF_API_TOKEN = os.getenv("HF_TOKEN")
api = HfApi()

def upload_to_hub(local_path, remote_path):
    """Push a local file into the Space repository on the Hugging Face Hub.

    local_path: path of the file on disk.
    remote_path: destination path inside the Space repo.
    """
    upload_args = dict(
        path_or_fileobj=local_path,
        path_in_repo=remote_path,
        repo_id=HF_REPO_ID,
        repo_type="space",
        token=HF_API_TOKEN,
    )
    api.upload_file(**upload_args)
    print(f"βœ… Uploaded to Hub: {remote_path}")

# Initialize embedder and LLM client
# Sentence-level embedding model used for both indexing and query encoding.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Hosted-inference client; reuses the same HF_TOKEN env var as the Hub uploads.
llm = InferenceClient(token=os.getenv("HF_TOKEN"))

# Local working directory; one subfolder per indexed title.
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

def extract_links_and_text(base_url, max_depth=1, visited=None):
    """Recursively fetch a page and its same-site links, returning (text, url) pairs.

    base_url: page to fetch. URL fragments (#...) are stripped first so the
        same document is not crawled once per anchor link.
    max_depth: how many link levels below base_url to follow (0 = this page only).
    visited: set of already-crawled URLs shared across the recursion.

    Returns a list of (page_text, url) tuples. Any fetch/parse failure is
    logged and yields an empty list rather than raising.
    """
    if visited is None:
        visited = set()
    # A fragment addresses a position inside the same document; drop it so
    # "page#a" and "page#b" are recognized as one page.
    base_url, _ = urldefrag(base_url)
    if base_url in visited or max_depth < 0:
        return []

    visited.add(base_url)
    print(f"πŸ”— Crawling: {base_url}")
    try:
        # Browser-like headers: some sites reject default requests UAs.
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": base_url,
            "Connection": "keep-alive",
        }
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Keep only headline/paragraph text; skips nav, scripts, etc.
        page_text = ' '.join([p.get_text() for p in soup.find_all(['p', 'h1', 'h2', 'h3'])])
        result = [(page_text, base_url)] if page_text.strip() else []

        links = set()
        base_netloc = urlparse(base_url).netloc
        for a in soup.find_all("a", href=True):
            # Resolve relative hrefs and drop fragments before deduplicating.
            full_url, _ = urldefrag(urljoin(base_url, a["href"]))
            parsed = urlparse(full_url)
            # Stay on the same site; scheme check also filters mailto:,
            # javascript:, tel:, etc.
            if parsed.scheme in ("http", "https") and parsed.netloc == base_netloc:
                links.add(full_url)

        for link in links:
            result.extend(extract_links_and_text(link, max_depth=max_depth - 1, visited=visited))
        return result
    except Exception as e:
        # Best-effort crawler: a single bad page must not abort the crawl.
        print(f"❌ Failed to fetch {base_url}: {e}")
        return []

def save_webpage(url, title):
    """Crawl *url* one link level deep, embed and index its text, and persist.

    url: starting page to crawl.
    title: human-chosen name; after stripping whitespace it becomes both the
        folder name under DATA_DIR and the path on the Hub.

    Returns a user-facing status message string. Raises ValueError only if
    the embedder produces a non-2D array (programming error, not user error).
    """
    # Normalize the title once and use it for BOTH the local folder and the
    # Hub path (the original used the stripped form locally but the raw form
    # remotely, which could diverge).
    title = title.strip()
    if not title:
        # An empty title would map the folder onto DATA_DIR itself.
        return "❌ Please provide a non-empty title."

    folder = os.path.join(DATA_DIR, title)
    if os.path.exists(folder):
        return f"'{title}' already exists. Use a different title."

    # Crawl BEFORE creating the folder: a failed crawl must not leave an
    # empty directory behind that blocks retries with "already exists".
    page_data = extract_links_and_text(url, max_depth=1)
    if not page_data:
        return "❌ No text extracted from the webpage."

    # Split each page's text into fixed 500-character chunks, remembering
    # which URL every chunk came from so answers can cite sources.
    chunks = []
    sources = []
    for text, source_url in page_data:
        for i in range(0, len(text), 500):
            chunks.append(text[i:i + 500])
            sources.append(source_url)

    # Embed and index
    embeddings = embedder.encode(chunks)

    print("Embeddings shape:", embeddings.shape)
    if len(embeddings.shape) != 2:
        raise ValueError(f"Expected 2D embeddings, got shape {embeddings.shape}")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Save index and metadata locally (folder is only created on success).
    os.makedirs(folder, exist_ok=True)
    index_path = os.path.join(folder, "index.faiss")
    meta_path = os.path.join(folder, "meta.pkl")
    faiss.write_index(index, index_path)
    with open(meta_path, "wb") as f:
        pickle.dump(list(zip(chunks, sources)), f)

    # Mirror the artifacts to the Hub so they survive Space restarts.
    upload_to_hub(index_path, f"data/{title}/index.faiss")
    upload_to_hub(meta_path, f"data/{title}/meta.pkl")

    return f"βœ… Saved and indexed '{title}', and uploaded to Hub. Please reload (refresh) the page."

def list_titles():
    """Return the names of all indexed pages (subdirectories of DATA_DIR)."""
    entries = os.listdir(DATA_DIR)
    print(f"Listing in: {DATA_DIR} β†’ {entries}")
    return [name for name in entries if os.path.isdir(os.path.join(DATA_DIR, name))]

def ask_question(message, history, selected_titles):
    """Answer *message* using the indexed pages selected in the UI.

    message: the user's question.
    history: chat history supplied by gr.ChatInterface (unused, but required
        by its callback signature).
    selected_titles: list of indexed page titles to search.

    Returns a markdown string combining one answer section per selected
    title; per-title failures are reported inline rather than raised.
    """
    if not selected_titles:
        return "❗ Please select at least one webpage."

    combined_answer = ""
    for title in selected_titles:
        folder = os.path.join(DATA_DIR, title)
        try:
            index = faiss.read_index(os.path.join(folder, "index.faiss"))
            with open(os.path.join(folder, "meta.pkl"), "rb") as f:
                chunk_data = pickle.load(f)  # List of (chunk, url)

            chunks = [cd[0] for cd in chunk_data]
            urls = [cd[1] for cd in chunk_data]

            q_embed = embedder.encode([message])
            # Never ask for more neighbours than the index holds: faiss pads
            # missing result slots with -1, and chunks[-1] would then silently
            # alias the last chunk as a bogus duplicate hit.
            k = min(3, index.ntotal)
            D, I = index.search(q_embed, k=k)

            response_context = ""
            sources_set = set()
            for idx in I[0]:
                if idx < 0:  # defensive: skip any -1 padding entries
                    continue
                response_context += f"[{urls[idx]}]\n{chunks[idx]}\n\n"
                sources_set.add(urls[idx])

            response = llm.chat_completion(
                messages=[
                    {"role": "system", "content": "You are a helpful assistant. Answer based only on the given context."},
                    {"role": "user", "content": f"Context:\n{response_context}\n\nQuestion: {message}"}
                ],
                model="deepseek-ai/DeepSeek-R1-0528",
                max_tokens=2048,
            )

            response = response.choices[0].message["content"]
            combined_answer += f"**{title}** (sources: {', '.join(sources_set)}):\n{response.strip()}\n\n"
        except Exception as e:
            # Keep answering for the other titles even if one index is broken.
            combined_answer += f"⚠️ Error with {title}: {str(e)}\n\n"

    return combined_answer.strip()

# Gradio UI: two tabs — one to crawl/index a page, one to chat over indexes.
with gr.Blocks() as demo:
    with gr.Tab("🌐 Index Web Page"):
        url = gr.Textbox(label="Web Page URL")
        title = gr.Textbox(label="Title for Web Page")
        index_btn = gr.Button("Fetch and Index (with crawl)")
        index_status = gr.Textbox(label="Status")
        # Crawl + embed + upload; the returned status string fills the textbox.
        index_btn.click(fn=save_webpage, inputs=[url, title], outputs=index_status)

    with gr.Tab("πŸ’¬ Chat with Web Pages"):
        # Choices are computed once at startup; the button below re-reads disk.
        page_selector = gr.CheckboxGroup(label="Select Indexed Pages", choices=list_titles())
        refresh_btn = gr.Button("πŸ”„ Refresh List")
        # NOTE(review): returning a plain list to a CheckboxGroup may set its
        # *value* rather than its *choices* depending on the Gradio version —
        # confirm; gr.update(choices=...) is the unambiguous form.
        refresh_btn.click(fn=list_titles, outputs=page_selector)
        # The checkbox selection is forwarded to ask_question as an extra input.
        chat = gr.ChatInterface(fn=ask_question, additional_inputs=[page_selector])

demo.launch()